# About this File
The purpose of a source code pipeline is to streamline the end-to-end process of treating missing values, label encoding, feature transformation, model building, and preprocessing pipeline for machine learning applications. This involves automating the steps involved in data preprocessing, model development, and deployment, aiming to increase efficiency, consistency, and reproducibility in the development and deployment of machine learning models.

In [2]:
#Import Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from imblearn.combine import SMOTEENN
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings("ignore")

This function is designed to preprocess and clean the input data by handling missing values and creating new features to prepare the data for further analysis or modeling.

In [3]:
def treat_missing_values(data):
    # Columns that are required
    columns_to_keep = ['custAge', 'profession', 'marital', 'schooling', 'default', 'housing',
                       'loan', 'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous',
                       'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                       'euribor3m', 'nr.employed', 'pmonths', 'pastEmail', 'responded']
    
    data = data[columns_to_keep]

    # Feature engineering for schooling
    schooling_category = {
        'basic.4y' : 'basic',
        'basic.6y' : 'basic',
        'basic.9y' : 'basic',
        'high.school': 'high.school',
        'illiterate':'illiterate',
        'professional.course': 'professional.course',
        'university.degree':'university.degree',
        'unknown':'unknown',
    }

    data.loc[:,'schooling'] = data['schooling'].replace(schooling_category)

    # Imputing the missing values in education based on profession
    imputation_mapping = {
        'blue-collar' : 'basic',
        'self-employed': 'illiterate',
        'technician'   : 'professional.course',
        'admin.'        : 'university.degree',
        'services'      : 'high.school',
        'management'    : 'university.degree',
        'retired'       : 'unknown',
        'entrepreneur'  : 'university.degree'
        }

    data.loc[:,'schooling'] = data['schooling'].combine_first(data['profession'].map(imputation_mapping))
    
    # Profession & treating missing values of age
    data.loc[:, 'employment_status'] = data['profession'].apply(lambda x: 'retired' if x == 'retired' else ('student' if x == 'student' else 'working'))

    # Imputing age values
    mean_age_retired = data.loc[data['employment_status'] == 'retired', 'custAge'].mean()
    mean_age_student = data.loc[data['employment_status'] == 'student', 'custAge'].mean()
    median_age_working = data.loc[data['employment_status'] == 'working', 'custAge'].median()

    data.loc[:,'custAge'] = np.where((data['employment_status'] == 'retired') & data['custAge'].isna(), mean_age_retired, data['custAge'])
    data.loc[:,'custAge'] = np.where((data['employment_status'] == 'student') & data['custAge'].isna(), mean_age_student, data['custAge'])
    data.loc[:,'custAge'] = np.where((data['employment_status'] == 'working') & data['custAge'].isna(), median_age_working, data['custAge'])

    # Impute random day for missing 'day_of_week' values
    data.loc[:,'day_of_week'] = data['day_of_week'].apply(lambda day: np.random.choice(['mon', 'tue', 'wed', 'thu', 'fri']) if pd.isna(day) else day)

    # Drop remaining missing values
    data = data.dropna()

    return data

This function can be used to preprocess the data before training a machine learning model. It helps in converting categorical variables into a format suitable for model training and can also create new features from existing ones.

In [4]:
def label_encoding(data):
    # Label encoding for 'profession'
    data.loc[:,'profession'] = data['profession'].map({'student': 'Dependents', 'retired': 'Dependents', 'unemployed': 'Unemployed&Unknown', 'unknown': 'Unemployed&Unknown',
                                                 'admin.': 'Working', 'blue-collar': 'Working', 'entrepreneur': 'Working', 'housemaid': 'Working',
                                                 'management': 'Working', 'self-employed': 'Working', 'services': 'Working', 'technician': 'Working'})

    # Label encoding for 'marital'
    data.loc[:,'marital'] = data['marital'].map({'single': 'Single&Divorced', 'divorced': 'Single&Divorced', 'married': 'married', 'unknown': 'Unknown'})

    # Label encoding for 'schooling'
    data.loc[:,'schooling'] = data['schooling'].map({'basic': 'Uneducated&BasicEducation', 'high.school': 'Uneducated&BasicEducation',
                                               'illiterate': 'Uneducated&BasicEducation', 'unknown': 'Unknown',
                                               'professional.course': 'Educated', 'university.degree': 'Educated'})

    # Label encoding for 'default'
    data.loc[:,'default'] = data['default'].map({'no': 'No', 'unknown': 'Yes&Unknown', 'yes': 'Yes&Unknown'})

    # Create a copy of the DataFrame to avoid modifying the original data
    data_copy_c = data.copy()

    # Define a mapping for specific months
    quarter_mapping = {'dec': 'QuarterEnd', 'sep': 'QuarterEnd', 'jun': 'QuarterEnd', 'mar': 'QuarterEnd'}

    # Replace specified months with 'QuarterEnd' in the copied DataFrame
    data_copy_c['month_mapped'] = data_copy_c['month'].replace(quarter_mapping)

    # Replace other months with 'others' in the copied DataFrame
    data_copy_c['month_mapped'].replace(to_replace=data_copy_c['month_mapped'][~data_copy_c['month_mapped'].isin(['QuarterEnd'])].unique(), value='others', inplace=True)

    data.loc[:,'month'] = data_copy_c['month_mapped']

    # Label encoding for 'day_of_week'
    data.loc[:,'day_of_week'] = data['day_of_week'].map({'mon': 'WeekBeginning', 'tue': 'WeekBeginning', 'wed': 'WeekBeginning',
                                                   'thu': 'WeekEnding', 'fri': 'WeekEnding'})

    # Feature engineering of other variables
    # pdays
    conditions = [
        (data['pdays'] == 999),
        (data['pdays'] < 5),
        ((data['pdays'] >= 5) & (data['pdays'] <= 10)),
        (data['pdays'] > 10)
    ]

    choices = ['first visit', 'less than 5 days', '5 to 10 days', 'greater than 10 days']

    # Create the 'pduration' column based on conditions
    data.loc[:,'pduration'] = np.select(conditions, choices, default='unknown')

    # pmonths
    conditions = [
        (data['pmonths'] == 999),
        (data['pmonths'] <= 0.2),
        (data['pmonths'] > 0.2)
    ]

    choices = ['first visit', 'less than 2 months', 'greater than 2 months']

    # Create the 'pduration' column based on conditions
    data['pduration_m'] = np.select(conditions, choices, default='unknown')

    return data


The provided Python function performs feature transformation on a given dataset. It includes several key steps to prepare the data for machine learning tasks.

In [5]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

def feature_transformation(data):
    # Drop target and unnecessary columns
    X = data.drop(['responded', 'pdays', 'pmonths', 'employment_status'], axis=1)
    y = data['responded']

    # One-hot encode categorical columns
    X_encoded = pd.get_dummies(X, columns=['loan', 'marital', 'schooling', 'default', 'housing', 'day_of_week',
                                           'poutcome', 'pduration', 'pduration_m', 'profession', 'month', 'contact'], drop_first=True)

    # Identify continuous columns for normalization
    continuous_columns = ['custAge', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                           'euribor3m', 'nr.employed', 'pastEmail']

    # Extract the continuous columns from X_encoded
    X_continuous = X_encoded[continuous_columns]

    # Instantiate StandardScaler
    scaler = StandardScaler()

    # Fit and transform the scaler on the continuous data
    X_continuous_normalized = scaler.fit_transform(X_continuous)

    # Replace the original continuous columns in X_encoded with the normalized ones
    X_encoded[continuous_columns] = X_continuous_normalized

    return X_encoded, y

In this training model a random seed provides consistency and reproducibility in the generation of random numbers, which is critical for the development and evaluation of machine learning models.

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from imblearn.combine import SMOTEENN
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

def train_propensify_model(X_encoded, y):
    # Set a random seed for reproducibility
    np.random.seed(78)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=78)
    
    # Define RandomForestClassifier
    rf = RandomForestClassifier()
    
    # Define parameter grid for Random Forest
    param_grid = {
       'n_estimators': [10, 20, 30],
       'max_depth': [None, 10, 20],
       'min_samples_split': [2, 5, 10],
       'min_samples_leaf': [1, 2, 4]
    }

    # Perform GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Apply SMOTEENN to the training data
    smoteenn = SMOTEENN(random_state=42)
    X_train_resampled, y_train_resampled = smoteenn.fit_resample(X_train, y_train)

    # Create a Random Forest classifier
    rf = RandomForestClassifier(random_state=42)

    # Create a Support Vector Machine (SVM) classifier with tuned parameters
    svm_classifier = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)

    # Ensemble the classifiers using a VotingClassifier
    ensemble_classifier = VotingClassifier(estimators=[
        ('rf', rf),
        ('svm', svm_classifier)
    ], voting='hard')  # 'hard' for probability voting

    # Fit the ensemble model on the resampled training data
    ensemble_classifier.fit(X_train_resampled, y_train_resampled)



    return ensemble_classifier

Load tain and test datasets.

In [7]:
train_data = pd.read_excel(r"C:\Users\Zimm\Downloads\Propensify\train.xlsx")
test_data = pd.read_excel(r"C:\Users\Zimm\Downloads\Propensify\test.xlsx")
test_data['responded'] = 10

Analyse the pipeline.

In [8]:
preprocessing_pipeline = Pipeline([
    ('missing_values', FunctionTransformer(func=treat_missing_values)),
    ('label_encoding', FunctionTransformer(func=label_encoding)),
    ('feature_transformation', FunctionTransformer(func=feature_transformation)),
])

# Fit the pipeline on the training data
X_train_transformed, y_train = preprocessing_pipeline.fit_transform(train_data)

# Train model using the transformed data
trained_model = train_propensify_model(X_train_transformed, y_train)

# Save the preprocessing pipeline
joblib.dump(preprocessing_pipeline, 'preprocessing_pipeline.joblib')

# Save the trained model
joblib.dump(trained_model, 'propensify.joblib')


['propensify.joblib']

Load the train model and preprocessing pipeline.

In [9]:
load_model = joblib.load('propensify.joblib')

preprocessing_pipeline = joblib.load('preprocessing_pipeline.joblib')

Analyze the test data on pipeline transformation.

In [10]:
X_test_transformed, _ = preprocessing_pipeline.transform(test_data)

Predictions on data.

In [11]:
predictions = load_model.predict(X_test_transformed)

In [12]:
# Add a new column 'Predictions' to the preprocessed test data
X_test_transformed['Predicted_Response'] = predictions

# Display the DataFrame with all columns and predictions
display(X_test_transformed)

Unnamed: 0,custAge,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,pastEmail,loan_unknown,...,pduration_first visit,pduration_greater than 10 days,pduration_less than 5 days,pduration_m_greater than 2 months,pduration_m_less than 2 months,profession_Unemployed&Unknown,profession_Working,month_others,contact_telephone,Predicted_Response
0,-0.188221,-0.207980,1.713222,-0.763555,1.076265,0.650961,-1.593395,-2.849255,1.354201,0,...,1,0,0,0,0,0,1,0,0,yes
1,-0.507809,-0.207980,1.713222,-2.231472,-2.077163,2.321153,-1.650120,-2.097493,1.354201,0,...,0,0,1,0,1,0,1,0,0,yes
2,1.090129,-0.565991,1.713222,-1.210312,-1.186580,-1.236139,-1.338134,-0.959390,1.354201,0,...,1,0,0,0,0,0,1,1,0,no
3,-1.040455,-0.565991,-0.347067,0.832007,-0.231888,0.954632,0.767049,0.839818,-0.274884,0,...,1,0,0,0,0,0,1,1,0,no
4,-0.081692,-0.565991,-0.347067,-0.125330,-0.654655,-0.325125,0.297622,0.389319,-0.274884,0,...,1,0,0,0,0,0,1,1,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32945,-0.188221,-0.565991,-0.347067,0.640540,0.721071,0.889560,0.705114,0.322371,-0.274884,0,...,1,0,0,0,0,0,1,1,1,no
32946,-0.827396,0.508043,-0.347067,-1.210312,-1.186580,-1.236139,-1.373442,-0.959390,-0.274884,0,...,1,0,0,0,0,0,1,1,0,yes
32947,-0.827396,-0.207980,-0.347067,-1.210312,-1.186580,-1.236139,-1.354341,-0.959390,-0.274884,0,...,1,0,0,0,0,0,1,1,0,yes
32948,-0.827396,-0.565991,-0.347067,0.832007,1.537150,-0.281744,0.764154,0.839818,-0.274884,0,...,1,0,0,0,0,0,1,0,1,no


In [13]:
X_test_transformed['Predicted_Response'].value_counts()

no     23931
yes     8214
Name: Predicted_Response, dtype: int64

Saved the result to csv and excel. Because the data are in xlsx format while in the project Task/Activities list asking for "testingCandidate.csv" file 

In [14]:
csv_file_path = 'testing_Candidate.csv'
X_test_transformed.to_csv(csv_file_path, index=False)

In [15]:
excel_file_path = 'testing_Candidate.xlsx'
X_test_transformed.to_excel(excel_file_path, index=False)