In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

In [2]:
# Load the MBA decision dataset from CSV. `data_01` keeps the frame exactly as
# read; `data_v2` is an independent copy used by the cells below.
# (A plain string literal is used: the path has no placeholders, so an f-string
# prefix is unnecessary.)
data_01 = pd.read_csv('mba_decision_dataset.csv')
data_v2 = data_01.copy()

# Preview the first rows of the working copy.
data_v2.head()

Unnamed: 0,Person ID,Age,Gender,Undergraduate Major,Undergraduate GPA,Years of Work Experience,Current Job Title,Annual Salary (Before MBA),Has Management Experience,GRE/GMAT Score,Undergrad University Ranking,Entrepreneurial Interest,Networking Importance,MBA Funding Source,Desired Post-MBA Role,Expected Post-MBA Salary,Location Preference (Post-MBA),Reason for MBA,Online vs. On-Campus MBA,Decided to Pursue MBA?
0,1,27,Male,Arts,3.18,8,Entrepreneur,90624,No,688,185,7.9,7.6,Loan,Finance Manager,156165,International,Entrepreneurship,On-Campus,Yes
1,2,24,Male,Arts,3.03,4,Analyst,53576,Yes,791,405,3.8,4.1,Loan,Startup Founder,165612,International,Career Growth,Online,No
2,3,33,Female,Business,3.66,9,Engineer,79796,No,430,107,6.7,5.5,Scholarship,Consultant,122248,Domestic,Skill Enhancement,Online,No
3,4,31,Male,Engineering,2.46,1,Manager,105956,No,356,257,1.0,5.3,Loan,Consultant,123797,International,Entrepreneurship,On-Campus,No
4,5,28,Female,Business,2.75,9,Entrepreneur,96132,No,472,338,9.5,4.9,Loan,Consultant,197509,Domestic,Skill Enhancement,Online,Yes


In [3]:
# Display the column labels of the working copy.
data_v2.columns

Index(['Person ID', 'Age', 'Gender', 'Undergraduate Major',
       'Undergraduate GPA', 'Years of Work Experience', 'Current Job Title',
       'Annual Salary (Before MBA)', 'Has Management Experience',
       'GRE/GMAT Score', 'Undergrad University Ranking',
       'Entrepreneurial Interest', 'Networking Importance',
       'MBA Funding Source', 'Desired Post-MBA Role',
       'Expected Post-MBA Salary', 'Location Preference (Post-MBA)',
       'Reason for MBA', 'Online vs. On-Campus MBA', 'Decided to Pursue MBA?'],
      dtype='object')

In [12]:
# Transformer and pipeline for dropping unwanted columns

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the configured columns from a DataFrame.

    Parameters
    ----------
    column_to_drop : label or list of labels
        Forwarded unchanged to ``DataFrame.drop(columns=...)``.
    """

    def __init__(self, column_to_drop):
        self.column_to_drop = column_to_drop

    def fit(self, X, y = None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns.

        Labels that are not present in ``X`` are skipped silently
        (``errors='ignore'``), so a misspelled label does not raise.
        """
        unwanted = self.column_to_drop
        trimmed = X.drop(columns = unwanted, errors = 'ignore')
        return trimmed

# The identifier column of the dataset is labelled 'Person ID' (see the column
# listing above); there is no column called 'ID'. ColumnDropper ignores labels
# it cannot find, so the wrong label would make this step silently drop nothing.
Drop_pipeline = Pipeline([
    ('Drop_feature', ColumnDropper(column_to_drop = ['Person ID']))
])

# Display the pipeline.
Drop_pipeline

In [28]:
# Create a pipeline for feature scaling

# Labels of the numeric columns to standardise.
# This must be a list of column labels, not a DataFrame slice: ColumnTransformer
# expects a column selector (labels, positions, mask, slice or callable) and
# resolves it against whatever frame is passed to fit/transform.
feature_names = ['Age', 'Undergraduate GPA', 'Years of Work Experience', 'Annual Salary (Before MBA)', 'GRE/GMAT Score',
                 'Undergrad University Ranking', 'Entrepreneurial Interest', 'Networking Importance']

# Standardise the listed columns; every other column is passed through as-is.
# Note: the output places the scaled columns first, followed by the remainder.
scaler = ColumnTransformer(
    transformers = [('scaler', StandardScaler(), feature_names)],
    remainder = 'passthrough' # Keep other columns unchanged
)

Scaler_pipeline = Pipeline([
    ('scaler', scaler)
])

# Display the pipeline.
Scaler_pipeline

In [26]:
# Create a pipeline for feature encoding

# Columns routed to the label encoder
label_encode_features = [
    'Gender',
    'Has Management Experience',
    'Location Preference (Post-MBA)',
    'Online vs. On-Campus MBA',
    'Decided to Pursue MBA?',
]

# Columns routed to the one-hot encoder
onehot_encode_features = [
    'Undergraduate Major',
    'Current Job Title',
    'MBA Funding Source',
    'Desired Post-MBA Role',
    'Reason for MBA',
]

# Custom transformer: label-encode every column it receives
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    """Apply an independent ``LabelEncoder`` to each column of a DataFrame.

    The input must expose ``.columns`` and support ``X[col]`` (i.e. behave like
    a pandas DataFrame).
    """

    def fit(self, X, y = None):
        """Fit one ``LabelEncoder`` per column of ``X`` and store them in ``self.encoders``."""
        fitted = {}
        for name in X.columns:
            fitted[name] = LabelEncoder().fit(X[name])
        self.encoders = fitted
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which each column holds its encoded values.

        Every column of ``X`` must have been seen during ``fit``; the encoder is
        looked up by column label.
        """
        encoded = X.copy()
        for name in X.columns:
            column_encoder = self.encoders[name]
            encoded[name] = column_encoder.transform(X[name])
        return encoded



# One-hot encoder: dense output, first category of each feature dropped,
# categories not seen during fit are ignored instead of raising
onehot_encoder = OneHotEncoder(
    drop = 'first',
    sparse_output = False,
    handle_unknown = 'ignore',
)


# Column transformer: label-encode and one-hot-encode the selected columns,
# pass every remaining column through unchanged
Feature_encoding = ColumnTransformer(
    remainder = 'passthrough',
    transformers = [
        ('label_enc', LabelEncoderTransformer(), label_encode_features),
        ('onehot_enc', onehot_encoder, onehot_encode_features),
    ],
)



# Custom transformer that restores column names after the encoding step
class ColumnNameTransformer(BaseEstimator, TransformerMixin):
    """Turn the array produced by the encoding ColumnTransformer into a labelled DataFrame.

    The column order it reconstructs is: label-encoded columns, then the
    expanded one-hot columns, then every remaining original column.

    Parameters
    ----------
    label_features : list of str
        Columns handled by the 'label_enc' transformer.
    onehot_features : list of str
        Columns handled by the 'onehot_enc' transformer.
    original_columns : iterable of str
        Column labels of the frame given to the ColumnTransformer. Labels that
        are in neither list above are appended in this order.
    column_transformer : ColumnTransformer, optional
        The ColumnTransformer that runs before this step. Its fitted
        'onehot_enc' transformer supplies the one-hot output names. When
        omitted, the transformer falls back to a notebook-level variable named
        ``pipeline`` that has a 'Feature_encoding' step (legacy behaviour).

    Notes
    -----
    The link to ``column_transformer`` is by object identity. Utilities that
    clone the estimators before fitting will presumably sever it -- verify
    before using this step inside cross-validation or a grid search.
    """

    def __init__(self, label_features, onehot_features, original_columns, column_transformer = None):
        self.label_features = label_features
        self.onehot_features = onehot_features
        self.original_columns = original_columns
        self.column_transformer = column_transformer
        # May be assigned by hand to inject a fitted encoder; otherwise stays None.
        self.onehot_encoder_ = None

    def fit(self, X, y = None):
        """Learn nothing; return the transformer itself."""
        return self

    def _fitted_onehot_encoder(self):
        """Return the fitted one-hot encoder whose output names are needed."""
        # An encoder injected on the instance takes precedence.
        if self.onehot_encoder_ is not None:
            return self.onehot_encoder_

        if self.column_transformer is not None:
            encoding_step = self.column_transformer
        else:
            # Legacy fallback: depends on a global `pipeline` being defined.
            encoding_step = pipeline.named_steps['Feature_encoding']

        # Looked up on every call rather than cached, so a re-fit of the
        # ColumnTransformer is always reflected in the names.
        return encoding_step.named_transformers_['onehot_enc']

    def transform(self, X):
        """Return ``X`` as a DataFrame carrying the reconstructed column names.

        Raises whatever ``pd.DataFrame`` raises when the number of reconstructed
        names does not match the number of columns in ``X``.
        """
        # Feature names after the one-hot transformation
        encoder = self._fitted_onehot_encoder()
        onehot_feature_names = encoder.get_feature_names_out(self.onehot_features)

        # Columns that were not encoded keep their original relative order
        encoded_columns = set(self.label_features) | set(self.onehot_features)
        passthrough_columns = [col for col in self.original_columns if col not in encoded_columns]

        # Construct the final column names
        final_columns = list(self.label_features) + list(onehot_feature_names) + passthrough_columns

        # Convert the transformed array back to a DataFrame
        return pd.DataFrame(X, columns = final_columns)


# Final pipeline: encode, then restore column names.
# The same Feature_encoding object is handed to the renaming step so it can
# read the fitted one-hot encoder once the first step has been fitted.
final_encoding_pipeline = Pipeline([
    ('Feature_encoding', Feature_encoding),
    ('rename_columns', ColumnNameTransformer(label_encode_features, onehot_encode_features, data_v2.columns,
                                             column_transformer = Feature_encoding))
])


# Display the pipeline.
final_encoding_pipeline