In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

In [4]:
# Load the raw dataset once and keep `data_01` as the untouched original.
data_01 = pd.read_csv('mba_decision_dataset.csv')
# Work on an independent copy; read_csv already returns a DataFrame,
# so no further conversion is required.
data_v2 = data_01.copy()
data_v2.head()

Unnamed: 0,Person ID,Age,Gender,Undergraduate Major,Undergraduate GPA,Years of Work Experience,Current Job Title,Annual Salary (Before MBA),Has Management Experience,GRE/GMAT Score,Undergrad University Ranking,Entrepreneurial Interest,Networking Importance,MBA Funding Source,Desired Post-MBA Role,Expected Post-MBA Salary,Location Preference (Post-MBA),Reason for MBA,Online vs. On-Campus MBA,Decided to Pursue MBA?
0,1,27,Male,Arts,3.18,8,Entrepreneur,90624,No,688,185,7.9,7.6,Loan,Finance Manager,156165,International,Entrepreneurship,On-Campus,Yes
1,2,24,Male,Arts,3.03,4,Analyst,53576,Yes,791,405,3.8,4.1,Loan,Startup Founder,165612,International,Career Growth,Online,No
2,3,33,Female,Business,3.66,9,Engineer,79796,No,430,107,6.7,5.5,Scholarship,Consultant,122248,Domestic,Skill Enhancement,Online,No
3,4,31,Male,Engineering,2.46,1,Manager,105956,No,356,257,1.0,5.3,Loan,Consultant,123797,International,Entrepreneurship,On-Campus,No
4,5,28,Female,Business,2.75,9,Entrepreneur,96132,No,472,338,9.5,4.9,Loan,Consultant,197509,Domestic,Skill Enhancement,Online,Yes


In [6]:
# Display the column labels of the working DataFrame
data_v2.columns

Index(['Person ID', 'Age', 'Gender', 'Undergraduate Major',
       'Undergraduate GPA', 'Years of Work Experience', 'Current Job Title',
       'Annual Salary (Before MBA)', 'Has Management Experience',
       'GRE/GMAT Score', 'Undergrad University Ranking',
       'Entrepreneurial Interest', 'Networking Importance',
       'MBA Funding Source', 'Desired Post-MBA Role',
       'Expected Post-MBA Salary', 'Location Preference (Post-MBA)',
       'Reason for MBA', 'Online vs. On-Campus MBA', 'Decided to Pursue MBA?'],
      dtype='object')

In [10]:
# Create a pipeline for dropping columns

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Pipeline step that removes a configured set of columns from a DataFrame.

    Parameters
    ----------
    columns_to_drop
        Column label(s) handed to ``DataFrame.drop(columns=...)``. Labels that
        are not present in the input are skipped (``errors="ignore"``).
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the result of dropping the configured columns from ``X``."""
        remaining = X.drop(columns=self.columns_to_drop, errors="ignore")
        return remaining


# Single-step pipeline that removes the 'Person ID' column
feature_drop_pipeline = Pipeline(
    steps=[
        ("drop_feature", ColumnDropper(columns_to_drop=["Person ID"])),
    ]
)

feature_drop_pipeline

In [12]:
# Create a pipeline for label encoding

# Features whose values are replaced by integer codes.
# NOTE(review): 'Decided to Pursue MBA?' looks like the prediction target —
# confirm it is meant to be encoded together with the input features.
label_encode_features = ['Gender', 'Has Management Experience', 'Location Preference (Post-MBA)',
                         'Online vs. On-Campus MBA', 'Decided to Pursue MBA?']


def label_encode_columns(X):
    """Label encode every column in ``label_encode_features`` on a copy of ``X``.

    Stateless helper kept for backward compatibility. A fresh ``LabelEncoder``
    is fitted on every call, so the integer codes are derived only from the
    values present in that call's ``X``. Use ``LabelEncodeColumns`` when the
    same mapping has to be reused on other data.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing all columns listed in ``label_encode_features``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the listed columns replaced by integer codes.

    Raises
    ------
    KeyError
        If one of the listed columns is missing from ``X``.
    """
    X_copy = X.copy()
    for col in label_encode_features:
        X_copy[col] = LabelEncoder().fit_transform(X_copy[col])
    return X_copy


class LabelEncodeColumns(BaseEstimator, TransformerMixin):
    """Label encode selected DataFrame columns with mappings learned in ``fit``.

    One ``LabelEncoder`` per column is fitted in ``fit`` and reused in
    ``transform``, so data transformed later receives the same integer codes
    as the data the transformer was fitted on.

    Parameters
    ----------
    columns : list
        Labels of the columns to encode.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per configured column of ``X``."""
        self.encoders_ = {col: LabelEncoder().fit(X[col]) for col in self.columns}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns encoded.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called yet.
        ValueError
            Raised by ``LabelEncoder.transform`` when a column contains a
            value that was not seen during ``fit``.
        """
        if not hasattr(self, 'encoders_'):
            raise AttributeError(
                "This LabelEncodeColumns instance is not fitted yet; call 'fit' first."
            )
        X_encoded = X.copy()
        for col, encoder in self.encoders_.items():
            X_encoded[col] = encoder.transform(X_encoded[col])
        return X_encoded


# scikit-learn compatible transformer for label encoding (remembers its mappings)
label_encoder_transformer = LabelEncodeColumns(columns=label_encode_features)

feature_label_encode_pipeline = Pipeline([
    ('Label_encoder', label_encoder_transformer)
])

feature_label_encode_pipeline

In [16]:
# Create a pipeline for one-hot encoding

# Columns handed to the OneHotEncoder
onehot_encode_features = [
    'Undergraduate Major',
    'Current Job Title',
    'MBA Funding Source',
    'Desired Post-MBA Role',
    'Reason for MBA',
]

One_Hot_Encoding_Transformer = ColumnTransformer(
    remainder='passthrough',  # columns not listed above pass through unchanged
    transformers=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'), onehot_encode_features),
    ],
)

One_Hot_encode_pipeline = Pipeline(
    steps=[('One_hot_encoder', One_Hot_Encoding_Transformer)]
)

One_Hot_encode_pipeline

In [18]:
# Create a pipeline for feature scaling

# Numeric columns that are passed to the scaler
numerical_features = [
    'Age',
    'Undergraduate GPA',
    'Years of Work Experience',
    'Annual Salary (Before MBA)',
    'GRE/GMAT Score',
    'Undergrad University Ranking',
    'Entrepreneurial Interest',
    'Networking Importance',
    'Expected Post-MBA Salary',
]


class ScalerWithNames(BaseEstimator, TransformerMixin):
    """Scale selected numeric columns and return a labelled DataFrame.

    The configured columns are run through a ``StandardScaler``; all other
    columns are carried over unchanged. In the output the scaled columns come
    first, followed by the remaining columns.

    Parameters
    ----------
    numerical_features : list
        Labels of the columns to scale.
    """

    def __init__(self, numerical_features):
        self.numerical_features = numerical_features
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the internal scaler on the configured columns of ``X``."""
        numeric_part = X[self.numerical_features]
        self.scaler.fit(numeric_part)
        return self

    def transform(self, X):
        """Return ``X`` with the configured columns replaced by scaled values."""
        scaled_values = self.scaler.transform(X[self.numerical_features])
        # Rebuild a DataFrame so column names and the row index survive scaling
        scaled_frame = pd.DataFrame(
            scaled_values,
            index=X.index,
            columns=self.numerical_features,
        )

        # Everything that was not scaled is appended to the right of the scaled block
        untouched = X.drop(columns=self.numerical_features)
        return pd.concat([scaled_frame, untouched], axis=1)


# Scaler instance configured with the numeric column list
scaler_with_names = ScalerWithNames(numerical_features=numerical_features)

# Single-step pipeline wrapping the scaler
feature_scaler_pipeline = Pipeline(
    steps=[
        ('feature_scaler', scaler_with_names),
    ]
)

feature_scaler_pipeline

In [20]:
# Final combination pipeline

# Steps run in the listed order: drop -> label encode -> scale -> one-hot encode
feature_engineering_pipeline = Pipeline(
    steps=[
        ('Drop_feature', feature_drop_pipeline),
        ('feature_label_encoding', feature_label_encode_pipeline),
        ('Feature_scaling', feature_scaler_pipeline),
        ('Feature_onhot_encoding', One_Hot_encode_pipeline),
    ]
)

feature_engineering_pipeline