In [1]:
import pandas as pd
df=pd.read_csv('data/train.csv')

In [2]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

FILLING NULL VALUES

In [3]:
import numpy as np
import pandas as pd
from scipy.stats import shapiro,ks_2samp,norm
from scipy.spatial.distance import cdist
import math

class Preprocessor:

    # Attributes: df (Original DataFrame),  descriptor (Dataframe containing information about Null Values and Feature data type)
    # Methods : Called from outside: __inti__(df), fillNull()
    
    def __init__(self,df):
        self.df=df.copy()
        self.descriptor=self.generateDescriptor()
    
    def generateDescriptor(self):    # Generates descriptor df
        descriptor_df = pd.DataFrame(self.df.isnull().sum())
        descriptor_df[1] = round(descriptor_df[0]/self.df.shape[0],2)*100
        isNumerical=[]
        for column in self.df.columns:
            if self.df[column].dtype == 'int64' or self.df[column].dtype == 'float64':
                isNumerical.append(1)
            else:
                isNumerical.append(0)
        descriptor_df[2]=isNumerical
        return descriptor_df

    def checkDistribution(self,series):    # Labels feature distribution as Normal or Skewed
        if abs(series.skew())<0.5:
            return "normal"
        else:
            if series.shape[0]<5000:
                stat,p = shapiro(series)
            else:
                mu,sigma=series.mean(),series.std()
                stat,p = ks_2samp(series,norm.rvs(loc=mu,scale=sigma,size=len(series)))
            if p>0.05:
                return "normal"
            else:
                return "skewed"    

    def is_id_column(self,feature_series):    # Checks if feature is ID type
        value_range = feature_series.max() - feature_series.min()
        unique_count = feature_series.nunique()
        if unique_count == len(feature_series):
            if abs(value_range - unique_count) < 2:
                return True
        return False

    def knnImpute(self,df,descriptor_df,feature):    # KNN imputation based on categorical or numerical feature
        neighbour_features = [
            it for it in descriptor_df.index
            if descriptor_df[0][it] == 0
            and descriptor_df[2][it] == 1
            and not self.is_id_column(df[it])
            ]
    
        neighbour_df=df[neighbour_features]
        non_null=neighbour_df[df[feature].notna()]
        null=neighbour_df[df[feature].isna()]
        norm_min=neighbour_df.min()
        norm_range=neighbour_df.max()-neighbour_df.min()
        norm_range.replace(0,1e-9,inplace=True)
        non_null=(non_null-norm_min)/norm_range
        null=(null-norm_min)/norm_range
        distances = pd.DataFrame(cdist(null,non_null,metric='euclidean'))
        k=math.ceil(math.sqrt(non_null.shape[0]))
        k_nearest_indices= pd.DataFrame(np.argsort(distances,axis=1)).iloc[:,:k]
        
        for enumerated_null_index,df_index in enumerate(null.index):
            enumerated_non_null_indices=k_nearest_indices.loc[enumerated_null_index]
            true_df_indices=non_null.iloc[enumerated_non_null_indices].index
            neighbour_values = df[feature].loc[true_df_indices]
            if descriptor_df[2][feature]==1 and self.df[feature].nunique()/len(self.df[feature])>0.05:
                df.loc[df_index, feature] = neighbour_values.mean()
            else:
                df.loc[df_index, feature] = neighbour_values.mode().iloc[0]
        
    def fillNull(self):
        for feature in self.descriptor.index:
            
            if self.descriptor[1][feature]>30:    # Feature has more than 30% null values
                self.df.drop(feature,axis=1,inplace=True)    # Remove feature
                self.descriptor = self.generateDescriptor()    # Remake descriptor df
                continue
                
            if self.descriptor[2][feature]==1:    # Numerical feature
                
                if self.descriptor[1][feature]>5:    # Null values between 5-30%
                    self.knnImpute(self.df,self.descriptor,feature)    # Use KNN
                    
                else:    # Null values between less than 5%
                    if self.df[feature].nunique()/len(self.df[feature])<0.05:    # Ordinal Feature (Discrete Finite numerical)
                        self.df.loc[:, feature] = self.df[feature].fillna(self.df[feature].mode().iloc[0]).copy()   # Fill with mode
                        
                    elif self.checkDistribution(self.df[feature]) == 'normal':    # Normal numerical fetaure
                        self.df.loc[:, feature] = self.df[feature].fillna(self.df[feature].mean()).copy()    # Fill with mean
                        
                    else:    # Skewed numerical feature
                        self.df.loc[:, feature] = self.df[feature].fillna(self.df[feature].median()).copy()    # Fill with median
                        
            else:    # Categorical feature
                if self.descriptor[1][feature]>5:    # Null values between 5-30%
                    self.knnImpute(self.df,self.descriptor,feature)    # Use KNN
                    
                else:    # Null values between less than 5%
                    self.df.loc[:, feature] = self.df[feature].fillna(self.df[feature].mode().iloc[0]).copy()   # Fill with mode

                    
        self.descriptor=self.generateDescriptor()    # Remake descriptor df
        return self

    def transform(self):
        return self.df

In [4]:
pre=Preprocessor(df)
df1=pre.fillNull().transform()
df1.isnull().sum()


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

FEATURE ENGINEERING

In [5]:
from dateutil import parser
import pandas as pd

class FeatureEngineering:
    def __init__(self, df):
        self.df = df.copy()

    @staticmethod
    def safe_parse_date(date_str):
        try:
            return parser.parse(date_str)
        except (ValueError, TypeError):
            return None

    def extract_datetime_features(self):
        datetime_cols = self.df.select_dtypes(include=['object']).columns.tolist()
        datetime_cols = [col for col in datetime_cols if self.df[col].str.contains(r'\d', na=False, regex=True).any()]
        
        new_features = {}

        for col in datetime_cols:
            self.df[col] = self.df[col].apply(lambda x: FeatureEngineering.safe_parse_date(x) if pd.notna(x) else None)
            self.df[col] = pd.to_datetime(self.df[col], errors='coerce')

            valid_rows = self.df[col].notna()
            if valid_rows.sum() < 0.3 * self.df.shape[0]:  # Keep only columns with enough valid dates
                continue

            new_features[f"{col}_year"] = self.df[col].dt.year
            new_features[f"{col}_month"] = self.df[col].dt.month
            new_features[f"{col}_day"] = self.df[col].dt.day
            new_features[f"{col}_weekday"] = self.df[col].dt.weekday
            new_features[f"{col}_hour"] = self.df[col].dt.hour
            self.df.drop(columns=[col], inplace=True)

        if new_features:
            self.df = pd.concat([self.df, pd.DataFrame(new_features, index=self.df.index)], axis=1)  # Efficient joining

        return self




    def extract_text_features(self):
        text_cols = self.df.select_dtypes(include=['object']).columns
        new_features = {}

        for col in text_cols:
            if self.df[col].isna().all():
                continue
            new_features[f"{col}_char_count"] = self.df[col].astype(str).apply(len)
            new_features[f"{col}_word_count"] = self.df[col].astype(str).apply(lambda x: len(x.split()))

        if new_features:
            self.df = self.df.assign(**new_features)
        
        return self
    
    def create_interaction_features(self):
        num_cols = self.df.select_dtypes(include=[np.number]).columns
        new_features = {}
    
        if len(num_cols) > 1:
            for i in range(len(num_cols)):
                for j in range(i + 1, len(num_cols)):
                    col1, col2 = num_cols[i], num_cols[j]
    
                    # Basic multiplicative interaction
                    new_features[f"{col1}_x_{col2}"] = self.df[col1].fillna(0) * self.df[col2].fillna(0)
                    
                    # Additive interaction
                    new_features[f"{col1}_plus_{col2}"] = self.df[col1].fillna(0) + self.df[col2].fillna(0)
                    
                    # Ratio interaction (avoid division by zero)
                    new_features[f"{col1}_div_{col2}"] = self.df[col1] / (self.df[col2] + 1e-9)
                    new_features[f"{col2}_div_{col1}"] = self.df[col2] / (self.df[col1] + 1e-9)
    
        if new_features:
            self.df = pd.concat([self.df, pd.DataFrame(new_features, index=self.df.index)], axis=1).copy()
        return self


    def create_statistical_features(self):
        num_cols = self.df.select_dtypes(include=[np.number]).columns
        if len(num_cols) == 0:
            return self

        self.df["num_mean"] = self.df[num_cols].mean(axis=1)
        self.df["num_std"] = self.df[num_cols].std(axis=1)
        self.df["num_median"] = self.df[num_cols].median(axis=1)
        
        return self

    def encode_categorical_features(self):
        cat_cols = self.df.select_dtypes(include=['object']).columns
        new_features = {}

        for col in cat_cols:
            counts = self.df[col].value_counts().to_dict()
            freqs = self.df[col].map(self.df[col].value_counts(normalize=True))
            
            new_features[f"{col}_count"] = counts
            new_features[f"{col}_freq"] = freqs

        if new_features:
            self.df = self.df.assign(**new_features)
        
        return self

    def transform(self):
        self.df.dropna(axis=1, how='any', inplace=True)  # Drops all columns with any NaN values
        return self.df


    def automated_feature_engineering(self):
        return (
            self.extract_datetime_features()
            .extract_text_features()
            .create_interaction_features()
            .create_statistical_features()
            .encode_categorical_features()
            .transform()
        )


In [6]:
fe = FeatureEngineering(df1)
df2 = fe.automated_feature_engineering()

In [10]:
df2.shape

(891, 334)

TRAIN TEST SPLIT

In [12]:
class Split:
    def __init__(self, df):
        self.df = df.copy()

    def X_y_split(self, target_feature):
        if target_feature in self.df.columns:
            y = self.df[target_feature]
            X = self.df.drop(columns=[target_feature])
            return X, y
        else:
            print(f"'{target_feature}' is not a feature of the given dataset.")
            return None, None

    def train_test_split(self, X, y, test_size=0.2, random_state=42):
        np.random.seed(random_state)
        indices = np.random.permutation(len(X))

        test_count = int(len(X) * test_size)
        test_idx, train_idx = indices[:test_count], indices[test_count:]

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        return X_train, X_test, y_train, y_test


In [15]:
sp=Split(df2)
X,y=sp.X_y_split('Survived')

In [18]:
X.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Name_char_count,...,Embarked_char_count_x_Embarked_word_count,Embarked_char_count_plus_Embarked_word_count,Embarked_char_count_div_Embarked_word_count,Embarked_word_count_div_Embarked_char_count,num_mean,num_std,num_median,Name_freq,Sex_freq,Embarked_freq
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,23,...,1,2,1.0,1.0,420000000.0,2585450000.0,3.0,0.001122,0.647587,0.725028
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,51,...,1,2,1.0,1.0,557794800.0,5305538000.0,2.0,0.001122,0.352413,0.188552


In [19]:
y.head(2)

0    0
1    1
Name: Survived, dtype: int64

In [20]:
X_train,X_test,y_train,y_test=sp.train_test_split(X,y)

In [22]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(713, 333)
(713,)
(178, 333)
(178,)


ENCODING

In [47]:
class SimpleEncoder:
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train.copy()
        self.X_test = X_test.copy()
        self.y_train = y_train.copy()
        self.y_test = y_test.copy()
        self.encoding_type = self.generateEncodingType()

    def generateEncodingType(self): 
        categorical_columns = self.X_train.select_dtypes(include=['object']).columns.tolist()
        encoding_type = {}
        for column in categorical_columns:
            if self.X_train[column].nunique() < 10:
                encoding_type[column] = 'OHE'
            else:
                encoding_type[column] = 'frequency'
        return encoding_type

    def oneHotEncoding(self, features):
        self.X_train = pd.get_dummies(self.X_train, columns=features, drop_first=True)
        self.X_test = pd.get_dummies(self.X_test, columns=features, drop_first=True)
        self.X_test = self.X_test.reindex(columns=self.X_train.columns, fill_value=0)  # Align test with train

    def frequencyEncoding(self, features):
        for feature in features:
            freqs = self.X_train[feature].value_counts(normalize=True)
            self.X_train[feature + '_freq'] = self.X_train[feature].map(freqs)
            self.X_test[feature + '_freq'] = self.X_test[feature].map(freqs).fillna(0)
        self.X_train.drop(columns=features, inplace=True)
        self.X_test.drop(columns=features, inplace=True)

    def encodeInput(self):
        OHE = []
        freq = []
        for feature in self.encoding_type.keys():  
            if self.encoding_type[feature] == 'OHE':
                OHE.append(feature)
            else:
                freq.append(feature)

        self.oneHotEncoding(OHE)
        self.frequencyEncoding(freq)
        return self.X_train, self.X_test

    def encodeOutput(self):
        if self.y_train.dtype == 'object':
            unique_classes = self.y_train.unique()
            class_mapping = {cls: idx for idx, cls in enumerate(unique_classes)}
            self.y_train = self.y_train.map(class_mapping)
            self.y_test = self.y_test.map(lambda x: class_mapping.get(x, -1))  # -1 --> unseen classes 
        return self.y_train, self.y_test


class TargetEncoder:
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train.copy()
        self.X_test = X_test.copy()
        self.y_train = y_train.copy()
        self.y_test = y_test.copy()
        self.target_features = self.X_train.select_dtypes(include=['object']).columns.tolist()
        self.encoding_map = {}

    def fit(self, features):
        df_train = self.X_train.copy()
        df_train["target"] = self.y_train  # Temporarily add y_train to X_train

        for feature in features:
            means = df_train.groupby(feature)["target"].mean()  # Now "target" exists in df_train
            self.encoding_map[feature] = means.to_dict()

    def transform(self, X, features):
        X_encoded = X.copy()
        for feature in features:
            X_encoded[feature + '_target'] = X_encoded[feature].map(self.encoding_map.get(feature, {})).fillna(self.y_train.mean())
        X_encoded.drop(columns=features, inplace=True)
        return X_encoded

    def encodeInput(self):
        self.fit(self.target_features)
        self.X_train = self.transform(self.X_train, self.target_features)
        self.X_test = self.transform(self.X_test, self.target_features)
        return self.X_train, self.X_test

    def encodeOutput(self):
        if self.y_train.dtype == 'object':
            unique_classes = self.y_train.unique()
            class_mapping = {cls: idx for idx, cls in enumerate(unique_classes)}
            self.y_train = self.y_train.map(class_mapping)
            self.y_test = self.y_test.map(lambda x: class_mapping.get(x, -1))  # Assign -1 to unseen classes 
        return self.y_train, self.y_test

In [48]:
se=SimpleEncoder(X_train,X_test,y_train,y_test)
Xtrain_se,Xtest_se=se.encodeInput()

In [49]:
te=TargetEncoder(X_train,X_test,y_train,y_test)
Xtrain_te,Xtest_te=te.encodeInput()

In [None]:
y_