In [1]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Delete Columns Class
class columnDropperTransformer():
    """Pipeline step that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns : label or list of labels
        Column label(s) handed to ``DataFrame.drop`` on every ``transform`` call.

    NOTE(review): unlike ``FillNa`` and ``GetDummies`` in this notebook, this class
    does not derive from ``BaseEstimator``/``TransformerMixin``, so it offers no
    ``get_params``/``set_params``/``fit_transform`` of its own -- confirm that is
    intentional.
    """

    def __init__(self,columns):
        self.columns=columns

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so calls can be chained."""
        return self

    def transform(self,X,y=None):
        """Return a new frame without the configured columns (``X`` is not modified).

        Raises ``KeyError`` (from pandas) when a configured column is absent from ``X``.
        """
        reduced = X.drop(columns=self.columns)
        return reduced

# KNN Imputer for specific columns Class
class KNNFeatureImputer:
    """Impute missing values in selected DataFrame columns with ``KNNImputer``.

    Only the selected columns are passed to the underlying ``KNNImputer``; every
    other column is returned unchanged.

    Parameters
    ----------
    n_neighbors, weights, metric
        Forwarded unchanged to ``KNNImputer``.
    features_to_impute : list of column labels or None
        Columns to impute. ``None`` selects every column of the frame given to ``fit``.

    Attributes
    ----------
    features_to_impute_ : list
        Columns actually imputed, set by ``fit``. Selected columns that were
        entirely missing at fit time are excluded and left untouched by
        ``transform``.
    imputer : KNNImputer or None
        The fitted imputer; ``None`` before ``fit`` or when nothing can be imputed.

    NOTE(review): unlike ``FillNa``/``GetDummies`` this class does not derive from
    ``BaseEstimator``/``TransformerMixin`` -- confirm that is intentional.
    """

    def __init__(self, n_neighbors=5, weights="uniform", metric="nan_euclidean", features_to_impute=None):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.features_to_impute = features_to_impute
        self.imputer = None

    def fit(self, X, y=None):
        """Fit the KNN imputer on the selected columns of ``X`` and return ``self``.

        Raises ``ValueError`` if ``X`` is not a DataFrame and ``KeyError`` (from
        pandas) if a selected column is absent.
        """
        X = self._check_dataframe(X)
        # Copy into a fresh list so later edits to the caller's list cannot
        # desynchronise fit and transform.
        requested = list(
            X.columns if self.features_to_impute is None else self.features_to_impute
        )
        # KNNImputer drops columns that are entirely missing during fit, which would
        # make the column-wise assignment in transform fail with a shape mismatch.
        # Such columns carry no information for imputation, so skip them up front.
        has_values = X[requested].notna().any(axis=0)
        self.features_to_impute_ = [
            col for col, usable in zip(requested, has_values) if usable
        ]
        if self.features_to_impute_:
            self.imputer = KNNImputer(
                n_neighbors=self.n_neighbors, weights=self.weights, metric=self.metric
            )
            # Fit only the selected columns
            self.imputer.fit(X[self.features_to_impute_])
        else:
            self.imputer = None
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns imputed.

        Raises ``ValueError`` if ``X`` is not a DataFrame and ``AttributeError`` if
        ``fit`` has not been called yet.
        """
        X = self._check_dataframe(X)
        if not hasattr(self, "features_to_impute_"):
            # Same exception type the unfitted object raised before, with a usable message.
            raise AttributeError(
                "This KNNFeatureImputer instance is not fitted yet; call fit() before transform()."
            )
        X_copy = X.copy()
        if self.features_to_impute_:
            X_copy[self.features_to_impute_] = self.imputer.transform(
                X[self.features_to_impute_]
            )
        return X_copy

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the imputed copy of ``X``."""
        return self.fit(X, y).transform(X)

    def _check_dataframe(self, X):
        """Return ``X`` unchanged, raising ``ValueError`` if it is not a DataFrame."""
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")
        return X

# FillNa Class
class FillNa(BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns with each column's most frequent value.

    Parameters
    ----------
    columns : list of column labels or None
        Columns to fill. ``None`` selects every column of the frame.

    Attributes
    ----------
    fill_values_ : dict
        Mapping ``column -> mode`` learned by ``fit``. Columns that were entirely
        missing at fit time have no entry and are left unfilled.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def _target_columns(self, X):
        """Resolve the configured column selection against ``X``."""
        return list(X.columns) if self.columns is None else list(self.columns)

    @staticmethod
    def _column_modes(X, columns):
        """Return ``{column: first mode}`` for every column that has a mode."""
        modes = {}
        for col in columns:
            candidates = X[col].mode()
            # mode() is empty for an all-missing column; indexing it would raise.
            if not candidates.empty:
                modes[col] = candidates.iloc[0]
        return modes

    def fit(self, X, y=None):
        """Learn the fill value of each selected column from ``X``; return ``self``.

        Raises ``KeyError`` (from pandas) if a selected column is absent from ``X``.
        """
        self.fill_values_ = self._column_modes(X, self._target_columns(X))
        return self

    def transform(self, X):
        """Return a new frame with missing values filled; ``X`` is not modified.

        Uses the values learned by ``fit`` so new data is filled consistently with
        the fit data. If ``fit`` was never called, the modes are computed from ``X``
        itself, which keeps direct ``transform`` calls on an unfitted instance working.
        """
        fill_values = getattr(self, "fill_values_", None)
        if fill_values is None:
            fill_values = self._column_modes(X, self._target_columns(X))
        if not fill_values:
            return X.copy()
        # A dict value fills each listed column with its own value and returns a new frame.
        return X.fillna(value=fill_values)

# pd.get_dummies Class
class GetDummies(BaseEstimator, TransformerMixin):
    """One-hot encode columns with ``pd.get_dummies``, keeping a stable output layout.

    Parameters
    ----------
    columns : list of column labels or None
        Passed to ``pd.get_dummies`` as ``columns``.
    drop_first : bool
        Passed to ``pd.get_dummies`` as ``drop_first`` when fitting.

    Attributes
    ----------
    dummy_columns_ : list
        Output column labels produced on the fit data. ``transform`` aligns its
        result to exactly these columns.
    """

    def __init__(self, columns=None, drop_first=False):
        self.columns = columns
        self.drop_first = drop_first

    def fit(self, X, y=None):
        """Record the encoded column layout of ``X``; return ``self``."""
        encoded = pd.get_dummies(X, columns=self.columns, drop_first=self.drop_first)
        self.dummy_columns_ = list(encoded.columns)
        return self

    def transform(self, X):
        """Return the one-hot encoded frame, aligned to the columns seen in ``fit``.

        Categories absent from ``X`` get an all-zero column and categories unseen
        during ``fit`` are dropped, so fit data and new data share one column set.
        If ``fit`` was never called, this is a plain ``pd.get_dummies`` call.
        """
        expected = getattr(self, "dummy_columns_", None)
        if expected is None:
            return pd.get_dummies(X, columns=self.columns, drop_first=self.drop_first)
        # Encode every category here: which level drop_first removes depends on the
        # categories present in X, so the dropped level is decided by the fit-time
        # layout instead (it is simply not part of ``expected``).
        full = pd.get_dummies(X, columns=self.columns, drop_first=False)
        if list(full.columns) == expected:
            return full
        return full.reindex(columns=expected, fill_value=0)

# Preprocessing pipeline; steps run in the order listed.
pipeline1 = Pipeline(
    steps=[
        # 1. Drop these columns from the frame.
        (
            "columnDropper",
            columnDropperTransformer(["id", "Name", "City", "Profession", "Degree"]),
        ),
        # 2. KNN-impute the listed columns.
        (
            "KNNFeatureImputer",
            KNNFeatureImputer(
                features_to_impute=["Academic Pressure", "CGPA", "Study Satisfaction"]
            ),
        ),
        # 3. Fill remaining gaps in these columns with the column mode.
        (
            "FillNa",
            FillNa(
                columns=[
                    "Work Pressure",
                    "Job Satisfaction",
                    "Dietary Habits",
                    "Financial Stress",
                ]
            ),
        ),
        # 4. NOTE(review): an empty column list is handed to pd.get_dummies here;
        #    the saved .info() output still lists object columns, so this step
        #    presumably encodes nothing -- confirm intent.
        ("GetDummies", GetDummies(columns=[])),
    ]
)

In [3]:
import pandas as pd
# NOTE(review): hard-coded absolute path; consider a configurable data directory.
df_test = pd.read_csv('/content/sample_data/test.csv')
# Fits pipeline1 on df_test and returns the transformed frame.
# NOTE(review): this *fits* the preprocessing on the test file. If a training set
# exists, the pipeline should presumably be fitted there and only transform() called
# here -- confirm intent.
df_final_test = pipeline1.fit_transform(df_test)

In [4]:
# Inspect dtypes and non-null counts after pipeline1.
# NOTE(review): the saved output below still reports missing values in
# 'Work Pressure', 'Job Satisfaction' and 'Dietary Habits', although pipeline1's
# FillNa step targets those columns -- the output looks stale; re-run to confirm.
df_final_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93800 entries, 0 to 93799
Data columns (total 14 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Gender                                 93800 non-null  object 
 1   Age                                    93800 non-null  float64
 2   Working Professional or Student        93800 non-null  object 
 3   Academic Pressure                      93800 non-null  float64
 4   Work Pressure                          75022 non-null  float64
 5   CGPA                                   93800 non-null  float64
 6   Study Satisfaction                     93800 non-null  float64
 7   Job Satisfaction                       75026 non-null  float64
 8   Sleep Duration                         93800 non-null  object 
 9   Dietary Habits                         93795 non-null  object 
 10  Have you ever had suicidal thoughts ?  93800 non-null  object 
 11  Wo

In [5]:
# NOTE(review): pipeline2 is not defined in any visible cell (execution count In [2]
# is absent) -- confirm it is created before this cell on a fresh Restart & Run All.
# This cell rebinds df_final_test to a transform of itself, so re-running it feeds
# already-transformed data back into pipeline2.
df_final_test = pipeline2.fit_transform(df_final_test)

In [6]:
# Re-check dtypes and non-null counts after pipeline2.
df_final_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93800 entries, 0 to 93799
Data columns (total 14 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Gender                                 93800 non-null  object 
 1   Age                                    93800 non-null  float64
 2   Working Professional or Student        93800 non-null  object 
 3   Academic Pressure                      93800 non-null  float64
 4   Work Pressure                          93800 non-null  float64
 5   CGPA                                   93800 non-null  float64
 6   Study Satisfaction                     93800 non-null  float64
 7   Job Satisfaction                       93800 non-null  float64
 8   Sleep Duration                         93800 non-null  object 
 9   Dietary Habits                         93800 non-null  object 
 10  Have you ever had suicidal thoughts ?  93800 non-null  object 
 11  Wo