# Libraries installations for MLP

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
#It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Packages for DataFrames and visualization (EDA)
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Dummy
from sklearn.dummy import DummyClassifier

# Pre-Processing packages

# To encode categorical variables
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

# For standardizing features and bringing them to a comparable scale
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# To automate the flow of applied function
from sklearn.pipeline import Pipeline,FeatureUnion

from sklearn.compose import ColumnTransformer, make_column_selector

# For handling missing data
from sklearn.impute import SimpleImputer

# Packages for Building Model
from sklearn.svm import SVC, LinearSVC

from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier


# Packages for feature engineering & Model development
# For checking the performance of the models
from sklearn.metrics import classification_report, confusion_matrix, f1_score, ConfusionMatrixDisplay

# Splitting Data and validation
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV

# for feature engineering
from sklearn.feature_selection import SelectKBest, mutual_info_classif, SelectFromModel, RFE

from sklearn.decomposition import PCA

# for Disabling Warning prints
import warnings

# NOTE(review): with no category given this hides every warning for the rest of the
# session, including deprecation and convergence warnings raised while fitting models.
warnings.filterwarnings('ignore')


# For imbalanced data
from imblearn.over_sampling import SMOTE

# Input data files are available in the read-only "../input/" directory

# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Data Loading Packages

import os



# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Dataset Description

The data is related to direct marketing campaigns of a banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (bank term deposit) would be subscribed ('yes') or not ('no').



Input variables:



1. last contact date: last contact date

2. age (numeric)

3. job : type of job

4. marital : marital status (categorical: "married","divorced","single"; note: "divorced" means    divorced or widowed)

5. education (categorical: "unknown","secondary","primary","tertiary")

6. default: has credit in default? (binary: "yes","no")

7. balance: average yearly balance, in euros (numeric)

8. housing: has housing loan? (binary: "yes","no")

9. loan: has personal loan? (binary: "yes","no")

10. contact: contact communication type (categorical: "unknown","telephone","cellular")

11. duration: last contact duration, in seconds (numeric)

12. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)

13. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)

14. previous: number of contacts performed before this campaign and for this client (numeric)

15. poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

    


# **Data Loading**

In [None]:
# Both competition files live in the same Kaggle dataset directory.
_DATA_DIR = "/kaggle/input/predict-the-success-of-bank-telemarketing"

input_train = f"{_DATA_DIR}/train.csv"  # labelled data used for training / validation
input_test = f"{_DATA_DIR}/test.csv"    # unlabelled data used for the submission

In [None]:
# Read the training CSV; `data` is the raw frame that the working copy is made from.
data = pd.read_csv(input_train) # Loading data as a dataframe

In [None]:
# Work on a copy so that `data` keeps the values exactly as they were loaded.
df = data.copy()  # Duplicating data to avoid making changes in the original data.

# Exploratory and Statistical Data Analysis

In [None]:
# Preview the first five rows of the working copy.
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the working copy.
df.shape

* There are 39211 rows a.k.a Observations

* 16 columns that is features

In [None]:
# Column dtypes and non-null counts.
df.info()

* poutcome has fewer non-null values than the other columns
* There are 6 numerical and 10 categorical columns

In [None]:
# Separate categorical and numerical columns so that type-appropriate functions can be applied.
# Each selector is a callable: calling it on a DataFrame returns the matching column names.

cat_col = make_column_selector(dtype_include=object) # categorical (object-dtype) columns

num_col = make_column_selector(dtype_include=np.number) # numerical columns

In [None]:
df.describe().T # Summary statistics of the numerical columns (transposed: one row per column)

Observations

1. Average age of a client 42 years. Maximum is 95 and Minimum is 18.

2. Balance: the minimum balance is -8019 and the maximum is 102127 (in euros). There are outliers.


In [None]:
df.describe(include='object') # Summary statistics of the categorical (object-dtype) columns

In [None]:
# Share of missing values per column, in percent (rounded to two decimals).
percent_missing = df.isna().sum().div(len(df)).mul(100).round(2)

# Tabulate the result: one row per column of df.
percent_missing_df = pd.DataFrame(
    {'Column': df.columns, 'percent': percent_missing}
)

percent_missing_df

In [None]:
# Remove the columns that are not used for modelling:
#   1. poutcome          - more than 50% missing values
#   2. last contact date - not needed for prediction
# The drop starts from the untouched `data` frame (of which `df` is a plain copy), so this
# cell is idempotent: re-running it no longer raises KeyError for already-removed columns.
df = data.drop(columns=['poutcome', 'last contact date'])

In [None]:
df[cat_col].isnull().sum() # Null count per categorical column (the selector is called on df to pick the columns)

There are missing values in the following categorical columns:

1. job

2. education

3. contact


























In [None]:
df[num_col].isnull().sum() # Null count per numeric column (no null values in numeric columns)

In [None]:
# Unique values of every categorical column, printed as a numbered list.
for count, col in enumerate(cat_col(df), start=1):
    print(f"{count}.{col}:{list(df[col].unique())}\n")

In [None]:
# Class balance of the target: absolute counts first, then the share of each class.
print(df['target'].value_counts())

total_rows = len(df)

yes = df.loc[df['target'] == 'yes']  # rows whose target is 'yes'
no = df.loc[df['target'] == 'no']    # rows whose target is 'no'

percentage_yes = len(yes) / total_rows * 100
print(f"Percentage of 'yes' values: {percentage_yes:.2f} %")

percentage_no = len(no) / total_rows * 100
print(f"Percentage of 'no' values: {percentage_no:.2f} %")

Imbalance in dataset. Out of 39211 rows, 14.86 percent have yes in target column.

# Visualization

 **Starting with Categorical Features**

Assumptions made :

Features like Housing, loan, education may be correlated with target as these factors influence the decisions of clients of taking subscriptions or not.

In [None]:
# Association between the target and the housing-loan feature:
# number of clients per (target, housing) combination, drawn as grouped bars.
house = df.groupby(['target', 'housing']).size().reset_index(name='count')

sns_house = sns.barplot(x='target', y='count', hue='housing', data=house)
sns_house.set_title('Target ascociation with house feature') # Title of the plot
sns_house.set_xlabel('Target') # x label
# The bar height is the number of clients; 'housing' is the hue (legend), not the y variable.
sns_house.set_ylabel('Count') # y label

Clients who subscribed mostly do not have a housing loan, whereas clients who did not subscribe mostly do have one.

In [None]:
# Group the target with education to check its significance.
# The palette and style are set BEFORE drawing: seaborn applies them only to plots
# created afterwards, so setting them after the barplot left this figure unaffected.
sns.set_palette("husl")
sns.set_style("darkgrid")

# Number of clients per (target, education) combination.
edu = df.groupby(['target', 'education']).size().reset_index(name='count')

# Keep the Axes under its own name instead of overwriting the counts frame.
edu_ax = sns.barplot(x='target', y='count', hue='education', data=edu)
edu_ax.set_title('Target ascociation with education feature')
edu_ax.set_xlabel('Target')
# The bar height is the number of clients; education is shown through the hue/legend.
edu_ax.set_ylabel('Count')

Both of the classes have same pattern

In [None]:
# Check whether having a personal loan influences the target.
# The theme is configured BEFORE drawing so that it actually applies to this figure
# (seaborn theme settings only affect plots created afterwards).
sns.set_theme(style="whitegrid",color_codes= True)
sns.set_style("darkgrid")

# Number of clients per (target, loan) combination.
loan = df.groupby(['target', 'loan']).size().reset_index(name='count')

# Keep the Axes under its own name instead of overwriting the counts frame.
loan_ax = sns.barplot(x='target', y='count', hue='loan', data=loan)
loan_ax.set_title('Loan feature check ')
loan_ax.set_xlabel('Target')
# The bar height is the number of clients; the loan status is shown through the hue/legend.
loan_ax.set_ylabel('Count')

No difference

In [None]:
# Count plot for every categorical column, laid out on a 10 x 2 grid of subplots.
fig = plt.figure(figsize=(15, 80), facecolor='white')

for plotnumber, col in enumerate(cat_col(df), start=1):
    ax = fig.add_subplot(10, 2, plotnumber)  # one subplot per categorical column
    sns.countplot(y=col, data=df, ax=ax)
    ax.set_title(col)

plt.show()

1. In job: the most frequent job categories are "blue-collar" and "management".

In [None]:
# Association between the target and the job category.
# The theme is set BEFORE drawing so that it applies to this figure
# (seaborn theme settings only affect plots created afterwards).
sns.set_theme(style="whitegrid",color_codes= True)

# Number of clients per (target, job) combination.
job = df.groupby(['target', 'job']).size().reset_index(name='count')

# Keep the Axes under its own name instead of overwriting the counts frame.
job_ax = sns.barplot(x='target', y='count', hue='job', data=job, palette='husl', dodge=True)
job_ax.set_title(' Asscociation between target and job category')
job_ax.set_xlabel('Target')

* Management is high in both classes; services and entrepreneur are low in the yes class.
* Blue-collar is the highest in the no class, followed by management, technician and services.

In [None]:
# Counts of every categorical feature's categories, split by target class.
# Each list entry is a small frame with the columns: target, <feature>, count, feature.
categorical_data_with_target = []

for categorical_feature in cat_col(df):
    if categorical_feature == 'target':
        continue  # grouping the target by itself carries no information

    grouped_data = (
        df.groupby(['target', categorical_feature])
        .size()
        .reset_index(name='count')
        # Tag the rows with the feature name as a real column. Assigning
        # grouped_data['feature'] on the MultiIndex count Series wrote into the
        # counts themselves rather than labelling them.
        .assign(feature=categorical_feature)
    )
    categorical_data_with_target.append(grouped_data)
categorical_data_with_target

In [None]:
# One figure per categorical column: target counts, faceted by that column's categories.
for categorical_feature in cat_col(df):
    sns.catplot(data=df, x='target', col=categorical_feature, kind='count')
plt.show()

**Numerical Features**

In [None]:
# Histogram (with KDE overlay) of client age, coloured by target class.
ax = sns.histplot(data=df, x='age', hue='target', kde=True)
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
ax.set_title('Age Distribution')


From the graph, most clients fall in the 30 - 40 age group (the overall mean age is about 42),



and the distribution is right-skewed, with a long tail towards older ages.

In [None]:
# Distribution of every numerical column, one histogram per subplot (12 x 3 grid).
fig = plt.figure(figsize=(20, 60), facecolor='white')
for plotnumber, col in enumerate(num_col(df), start=1):
    ax = fig.add_subplot(12, 3, plotnumber)
    sns.histplot(df[col], ax=ax)
    ax.set_xlabel(col)
plt.show()

* There are outliers in the data, while most observations are concentrated in a narrow range (e.g. previous, pdays).
* Balance differs for almost every client, hence there is no specific pattern.

In [None]:
# Box plot of every numerical feature, split by target class (12 x 3 grid of subplots).
fig = plt.figure(figsize=(20, 60), facecolor='white')
for plotnumber, feature in enumerate(num_col(df), start=1):
    ax = fig.add_subplot(12, 3, plotnumber)
    sns.boxplot(x="target", y=feature, data=df, ax=ax)
    # The x-axis shows the target classes, so it keeps its 'target' label; the feature
    # name goes in the title (and is already the y label) instead of relabelling the x-axis.
    ax.set_title(feature)
plt.show()

1. In Balance it is expected to have outliers, as it is logical that not each and every client will have the exact balance as everyone.

2. Duration: the longer the call, the higher the count of yes, which suggests that clients who talked for longer were more likely to subscribe.

3. Same with campaign and pdays and previous

In [None]:
# Pairwise correlation of the numerical columns, annotated with the coefficient values.
numeric_corr = df[num_col(df)].corr()
sns.heatmap(numeric_corr, annot=True)

plt.title('Correlation Heatmap')

plt.show()

* Duration is correlated with almost every feature.
* Age does not have much impact on any feature.
* Apart from that, all the other features are fairly correlated.

# **Data Splitting**

In [None]:
# Separate the label from the predictors.
y = df['target']                 # target column (the value to predict)
X = df.drop(columns='target')    # every remaining column is a feature

In [None]:
# Hold out 10% of the rows as a validation set to check model performance;
# the remaining 90% is used for training. random_state fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)

In [None]:
# Row/column counts after splitting:
#   validation = 39211 * 0.1 -> 3922 rows, training = 39211 - 3922 -> 35289 rows
for split_label, split_frame in (("X_val", X_val), ("X_train", X_train)):
    print(f"{split_label} shape: {split_frame.shape}")

# PRE- PROCESSING

In [None]:
# Categorical branch: fill missing entries with the most frequent category ('imp'),
# then one-hot encode without imposing any order on the categories ('ohe').
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_encoder = OneHotEncoder(handle_unknown='ignore', drop = 'first', sparse_output=False)
cat_pipe = Pipeline([
    ('imp', categorical_imputer),
    ('ohe', categorical_encoder),
])

# Column names of X resolved by dtype.
categorical_features = cat_col(X)
numeric_features = num_col(X)

# Route each group of columns through its own transformation.
# StandardScaler standardizes the numerical columns (zero mean, unit variance)
# so that they share a comparable scale.
transform_df = ColumnTransformer([
    ('cat', cat_pipe, categorical_features),
    ('num', StandardScaler(), numeric_features),
])

# Return DataFrames (with column names) instead of plain arrays.
transform_df = transform_df.set_output(transform='pandas')

In [None]:
# Fit the preprocessing on the training split only, then apply the already-fitted
# transformer to the validation split.
tr_train = transform_df.fit_transform(X_train)
tr_val = transform_df.transform(X_val)

In [None]:
# Shapes of both splits after imputing, encoding and scaling.
print("After Pre- processing:")
print(f"Train Data shape: {tr_train.shape}")
print(f"Validatioin data Shape: {tr_val.shape}")

# FEATURE ENGINEERING

**Select K BEST**

In [None]:
def seeded_mutual_info(features, target):
    """Score features by mutual information with a fixed seed.

    mutual_info_classif is randomized; without a seed the ten selected features can
    change between runs. random_state=0 matches the seed used when the scores are
    printed for inspection, so the selection and the reported ranking agree.
    """
    return mutual_info_classif(features, target, random_state=0)


k_select = SelectKBest(seeded_mutual_info, k=10) # Keep the 10 most relevant features
k_train = k_select.fit_transform(tr_train, y_train)

In [None]:
# Shape of the training matrix after feature selection.
print("Selected by K best:", k_train.shape)

In [None]:
list(k_select.get_feature_names_out()) # Names of the features kept by the selector

In [None]:
# Inspect how relevant the features are: rank all preprocessed columns by their
# mutual-information score and print the top ones.
no_of_features = 10  # the number of most relevant features
feature_scores = mutual_info_classif(tr_train, y_train, random_state=0)

# (score, name) pairs sorted from highest to lowest score, truncated to the top N.
ranked_features = sorted(zip(feature_scores, tr_train.columns), reverse=True)[:no_of_features]

features_with_score = []
for count, (score, name) in enumerate(ranked_features, start=1):
    print(f"{count}.{name}: {score}")
    features_with_score.append(name)

# Training columns restricted to the top-ranked features.
df_imp_featutes = tr_train[features_with_score]


In [None]:
# Keep the same selected columns in the validation set (the selector was fitted on the training set).
k_val = k_select.transform(tr_val)

**Recursive Feature Elimination**

In [None]:
# This select features by removing the least important features one by one
#selector_rfe = RFE(LogisticRegression(random_state=8), n_features_to_select=10)
#selector_fre_train = selector_rfe.fit(tr_train, y_train)

In [None]:
#list(selector_rfe.get_feature_names_out())

![rfe_select.png](attachment:402f76ac-4f6f-4166-b2b8-a16429830798.png)

In [None]:
# Transform the data set
#rfe_train = selector_rfe.transform(tr_train)
#rfe_val = selector_rfe.transform(tr_val)

PCA

In [None]:
#pca = PCA(n_components=10)



#pca.fit(tr_train)


In [None]:
#pca_train = pca.transform(tr_train)

#pca_val = pca.transform(tr_val)


In [None]:
#pca.get_feature_names_out()

# Models

 **Model 1: Logistics Regression**

In [None]:
# Baseline logistic regression without hyperparameter tuning.
# class_weight='balanced' is used from the start because the dataset is imbalanced.
logi = LogisticRegression(class_weight= 'balanced')
logi.fit(k_train, y_train)

# 5-fold macro-F1 cross-validation. Note that cross_val_score refits clones of the
# estimator on folds of the data it is given, here (k_val, y_val).
logi_cross = cross_val_score(logi, k_val, y_val, scoring='f1_macro', cv=5)
fold_mean = logi_cross.mean().round(2)
fold_std = logi_cross.std().round(2)
print("Mean:", fold_mean, "\n", "Standard Deviation: ", fold_std)
print("scores:", logi_cross)

In [None]:
# Per-class precision / recall / F1 of the baseline logistic regression on both splits.
report_inputs = (
    ("train", y_train, logi.predict(k_train)),
    ("Validation", y_val, logi.predict(k_val)),
)
for split_name, true_labels, predicted_labels in report_inputs:
    print(f"Classification report of {split_name} data:")
    print(classification_report(true_labels, predicted_labels))



Observations:
1. Since the data is imbalanced, precision is 94 % on the majority class (3364 samples) and only 38 % on the minority class (558 samples).
2. The f1 scores of the two classes are not in the same range either.
3. The train and validation sets perform almost the same.

In [None]:
"""

param_grid = { #
    'penalty': ['l1', 'l2'], # l1 sum of absolute values of the model coefficients
    'C': [0.001, 0.01, 0.1, 1],
    'solver': ['liblinear', 'newton-cholesky ' ]
}

grid_search = GridSearchCV(estimator=logi,
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           scoring='f1_macro',  # scoring metric
                           n_jobs=-1)


grid_search.fit(k_train, y_train)

# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
"""

![logi.png](attachment:52a8ad77-2ccb-4979-97f6-7a26506f5841.png)

In [None]:
# Logistic regression rebuilt with the parameters chosen by GridSearchCV.
logistic_grid = LogisticRegression(
    C=0.001,
    penalty='l2',
    solver='liblinear',
    class_weight='balanced',
    random_state=8,
)
logistic_grid.fit(k_train, y_train)

# Validation predictions, reused by the report and confusion-matrix cells.
logistic_grid_pred = logistic_grid.predict(k_val)

In [None]:
# 5-fold macro-F1 cross-validation of the tuned logistic regression.
# cross_val_score refits clones of the estimator on folds of (k_val, y_val).
logi_cross = cross_val_score(logistic_grid, k_val, y_val, scoring='f1_macro', cv=5)
fold_mean = logi_cross.mean().round(2)
fold_std = logi_cross.std().round(2)
print("Mean:", fold_mean, "\n", "Standard Deviation: ", fold_std)
print("scores:", logi_cross)

In [None]:
# Per-class precision / recall / F1 of the tuned logistic regression on both splits.
report_inputs = (
    ("train", y_train, logistic_grid.predict(k_train)),
    ("Validation", y_val, logistic_grid_pred),
)
for split_name, true_labels, predicted_labels in report_inputs:
    print(f"Classification report of {split_name} data:")
    print(classification_report(true_labels, predicted_labels))

There is not much of a difference after hyperparameter tuning, as the f1 score has barely changed.

In [None]:
# Confusion matrix of the tuned logistic regression on the validation set.
# 'yes' is listed first so that it occupies the first row / column of the plot.
class_order = ['yes', 'no']
cm = confusion_matrix(y_val, logistic_grid_pred, labels=class_order)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_order)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix of Logistics Model', fontsize=15, pad=20)
plt.xlabel('Prediction', fontsize=11)
plt.ylabel('Actual', fontsize=11)
plt.show()

2993 of label no are correctly predicted, and 320 of label yes,
which is 3313 out of the 3922 validation rows.

**Model 2: SVC (Support Vector Classification)**

In [None]:
# Baseline SVC with default hyperparameters; class_weight='balanced' is set up front
# because the data is imbalanced.
svc = SVC(class_weight='balanced')
svc = svc.fit(k_train, y_train)

In [None]:
# 5-fold macro-F1 cross-validation of the baseline SVC.
# cross_val_score refits clones of the estimator on folds of (k_val, y_val).
svc_cross = cross_val_score(svc, k_val, y_val, scoring='f1_macro', cv=5)
fold_mean = svc_cross.mean().round(2)
fold_std = svc_cross.std().round(2)
print("Mean:", fold_mean, "\n", "Standard Deviation: ", fold_std)
print("scores:", svc_cross)

In [None]:
# Per-class precision / recall / F1 of the baseline SVC on both splits.
report_inputs = (
    ("train", y_train, svc.predict(k_train)),
    ("Validation", y_val, svc.predict(k_val)),
)
for split_name, true_labels, predicted_labels in report_inputs:
    print(f"Classification report of {split_name} data:")
    print(classification_report(true_labels, predicted_labels))


SVC seems to have done a little better: the f1 score of the yes label is above 50 % and the cross-validation mean is 0.71, but precision is fairly low.

In [None]:
"""
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear']

}

# Create a GridSearchCV
grid_search = GridSearchCV(estimator=svc,
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1_macro',
                           n_jobs=-1)

# Fit the GridSearchCV to the training data
grid_search.fit(tr_train, y_train)

# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

"""

![SVC.png](attachment:102e7afd-6f03-417d-afe2-5ca0eef27a1d.png)

In [None]:
# SVC rebuilt with the parameters taken from the grid search.
svc_grid = SVC(kernel='linear', C=0.1, class_weight='balanced', random_state=8)
svc_grid.fit(k_train, y_train)

# Validation predictions, reused by the report and confusion-matrix cells.
svc_grid_pred = svc_grid.predict(k_val)

In [None]:
# 5-fold macro-F1 cross-validation of the tuned SVC.
# cross_val_score refits clones of the estimator on folds of (k_val, y_val).
svc_cross = cross_val_score(svc_grid, k_val, y_val, scoring='f1_macro', cv=5)
fold_mean = svc_cross.mean().round(2)
fold_std = svc_cross.std().round(2)
print("Mean:", fold_mean, "\n", "Standard Deviation: ", fold_std)
print("scores:", svc_cross)

In [None]:
# Per-class precision / recall / F1 of the tuned SVC on both splits.
report_inputs = (
    ("train", y_train, svc_grid.predict(k_train)),
    ("Validation", y_val, svc_grid_pred),
)
for split_name, true_labels, predicted_labels in report_inputs:
    print(f"Classification report of {split_name} data:")
    print(classification_report(true_labels, predicted_labels))

In [None]:
# Confusion matrix of the tuned SVC on the validation set ('yes' listed first).
class_order = ['yes', 'no']
cm = confusion_matrix(y_val, svc_grid_pred, labels=class_order)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_order)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix of SVC Model', fontsize=15, pad=20)
plt.xlabel('Prediction', fontsize=11)
plt.ylabel('Actual', fontsize=11)
plt.show()

SVC has done better than logistic regression: for the yes label the false negatives are low and the number of true positives has increased, but the false positives for the no label have increased.

**Model 3 : LinearSVC**

In [None]:
# Baseline LinearSVC (untuned), with balanced class weights for the imbalanced target.
lin_svc = LinearSVC(class_weight='balanced', random_state=8)
lin_svc = lin_svc.fit(k_train, y_train)

In [None]:
# 5-fold macro-F1 cross-validation of the baseline (untuned) LinearSVC.
# cross_val_score refits clones of the estimator on folds of (k_val, y_val).
lin_svc_cross = cross_val_score(lin_svc, k_val, y_val, scoring='f1_macro', cv=5)
fold_mean = lin_svc_cross.mean().round(2)
fold_std = lin_svc_cross.std().round(2)
print("Mean:", fold_mean, "\n", "Standard Deviation: ", fold_std)
print("scores:", lin_svc_cross)

In [None]:
# Per-class precision / recall / F1 of the baseline LinearSVC on both splits.
report_inputs = (
    ("train", y_train, lin_svc.predict(k_train)),
    ("Validation", y_val, lin_svc.predict(k_val)),
)
for split_name, true_labels, predicted_labels in report_inputs:
    print(f"Classification report of {split_name} data:")
    print(classification_report(true_labels, predicted_labels))


In [None]:
"""
param_grid = {
    'penalty': ['l2'],
    'C': [0.001, 0.01, 0.1, 1, 10],
    'loss': ['squared_hinge'],
    'dual': [False],  # Sample > features
    'max_iter': [1000, 2000]
}

# Create a GridSearchCV
grid_search = GridSearchCV(estimator=lin_svc,
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1_macro',
                           n_jobs=-1)

# Fit the GridSearchCV to the training data
grid_search.fit(k_train, y_train)

# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
"""

![lin_svc.png](attachment:b848b1ce-b101-4ace-8e7e-1a23aa12c86a.png)

In [None]:
# LinearSVC rebuilt with an explicit, fixed hyperparameter set.
lin_svc_grid = LinearSVC(
    C=0.01,
    penalty='l2',
    loss='squared_hinge',
    dual=False,
    max_iter=1000,
    class_weight="balanced",
)
lin_svc_grid.fit(k_train, y_train)

# Validation predictions, reused by the report and confusion-matrix cells.
lin_svc_grid_pred = lin_svc_grid.predict(k_val)

In [None]:
# 5-fold macro-F1 cross-validation of the tuned LinearSVC.
# cross_val_score refits clones of the estimator on folds of (k_val, y_val).
lin_svc_cross = cross_val_score(lin_svc_grid, k_val, y_val, scoring='f1_macro', cv=5)
fold_mean = lin_svc_cross.mean().round(2)
fold_std = lin_svc_cross.std().round(2)
print("Mean:", fold_mean, "\n", "Standard Deviation: ", fold_std)
print("scores:", lin_svc_cross)

In [None]:
# Per-class precision / recall / F1 of the tuned LinearSVC on both splits.
report_inputs = (
    ("train", y_train, lin_svc_grid.predict(k_train)),
    ("Validation", y_val, lin_svc_grid_pred),
)
for split_name, true_labels, predicted_labels in report_inputs:
    print(f"Classification report of {split_name} data:")
    print(classification_report(true_labels, predicted_labels))


In [None]:
# Confusion matrix of the tuned LinearSVC on the validation set ('yes' listed first).
# The matrix is stored in `cm`, the name the display below reads. It used to be stored
# in `m`, so the plot silently reused the matrix left over from the SVC cell.
cm = confusion_matrix(y_val, lin_svc_grid_pred, labels=['yes', 'no'])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['yes', 'no'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix of Linear_SVC Model', fontsize=15, pad=20)
plt.xlabel('Prediction', fontsize=11)
plt.ylabel('Actual', fontsize=11)
plt.show()

Linear_svc has a lower mean score overall, and its performance is similar to SVC.

**Model 5: DecisionTree Classifier**

In [None]:
# Baseline decision tree (untuned), with balanced class weights for the imbalanced target.
dt_classifier = DecisionTreeClassifier(class_weight='balanced', random_state=8)
dt_classifier = dt_classifier.fit(k_train, y_train)

In [None]:
# Decision tree with an explicit split criterion and a depth limit of 5.
dt_grid = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    class_weight='balanced',
    random_state=8,
)
dt_grid.fit(k_train, y_train)

# Validation predictions, reused by the report and confusion-matrix cells.
dt_grid_pred = dt_grid.predict(k_val)

In [None]:
# 5-fold macro-F1 cross-validation of the baseline decision tree.
# cross_val_score refits clones of the estimator on folds of (k_val, y_val).
tree_cross = cross_val_score(dt_classifier, k_val, y_val, scoring='f1_macro', cv=5)
fold_mean = tree_cross.mean().round(2)
fold_std = tree_cross.std().round(2)
print("Mean:", fold_mean, "\n", "Standard Deviation: ", fold_std)
print("scores:", tree_cross)

In [None]:
"""
# Define the parameter grid for binary classification
param_grid = {
    'criterion': ['gini', 'entropy'],  # Splitting criterion
    'max_depth': [None, 5, 10, 15],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=dt_classifier,
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           scoring='f1_macro',  # Or other relevant binary classification metrics
                           n_jobs=-1)  # Use all available CPU cores

# Fit the GridSearchCV to the training data
grid_search.fit(k_train, y_train)

# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
"""

![Screenshot 2025-08-24 204123.png](attachment:200f5bc3-9de1-4e29-a38e-cbb266d1d65a.png)

In [None]:
# 5-fold macro-F1 cross-validation of the depth-limited decision tree.
# cross_val_score refits clones of the estimator on folds of (k_val, y_val).
tree_cross = cross_val_score(dt_grid,k_val , y_val, cv=5, scoring='f1_macro')
# A comma was missing between "\n" and "Standard Deviation: ", which silently merged the
# two literals; with it restored the output matches the other models' summaries.
print("Mean:",tree_cross.mean().round(2),"\n","Standard Deviation: ", tree_cross.std().round(2))
print("scores:", tree_cross)

In [None]:
# Per-class precision / recall / F1 of the baseline decision tree on both splits.
report_inputs = (
    ("train", y_train, dt_classifier.predict(k_train)),
    ("Validation", y_val, dt_classifier.predict(k_val)),
)
for split_name, true_labels, predicted_labels in report_inputs:
    print(f"Classification report of {split_name} data:")
    print(classification_report(true_labels, predicted_labels))


In [None]:
# Per-class precision / recall / F1 of the depth-limited decision tree on both splits.
report_inputs = (
    ("train", y_train, dt_grid.predict(k_train)),
    ("Validation", y_val, dt_grid_pred),
)
for split_name, true_labels, predicted_labels in report_inputs:
    print(f"Classification report of {split_name} data:")
    print(classification_report(true_labels, predicted_labels))


In [None]:
# Confusion matrix of the depth-limited decision tree on the validation set ('yes' first).
# The matrix is stored in `cm`, the name the display below reads. It used to be stored
# in `m`, so the plot silently reused a matrix computed for an earlier model.
cm = confusion_matrix(y_val, dt_grid_pred, labels=['yes', 'no'])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['yes', 'no'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix of DecisionTreeClassifier Model', fontsize=15, pad=20)
plt.xlabel('Prediction', fontsize=11)
plt.ylabel('Actual', fontsize=11)
plt.show()

**Model 4: Random Forest Classifier**

In [None]:
# Baseline random forest (untuned), with balanced class weights for the imbalanced target.
rf_classifier = RandomForestClassifier(class_weight='balanced', random_state=8)
rf_classifier = rf_classifier.fit(k_train, y_train)

In [None]:
# 5-fold macro-F1 cross-validation of the baseline (untuned) random forest.
# cross_val_score refits clones of the estimator on folds of (k_val, y_val).
rf_cross = cross_val_score(rf_classifier, k_val, y_val, scoring='f1_macro', cv=5)
fold_mean = rf_cross.mean().round(2)
fold_std = rf_cross.std().round(2)
print("Mean:", fold_mean, "\n", "Standard Deviation: ", fold_std)
print("scores:", rf_cross)

In [None]:
# Per-class precision / recall / F1 of the baseline random forest on both splits.
report_inputs = (
    ("train", y_train, rf_classifier.predict(k_train)),
    ("Validation", y_val, rf_classifier.predict(k_val)),
)
for split_name, true_labels, predicted_labels in report_inputs:
    print(f"Classification report of {split_name} data:")
    print(classification_report(true_labels, predicted_labels))


* Random forest has perfectly well predicted the train data set with the score of 1.0
* With Validation Set, recall and f1 score are less than 50 %

In [None]:
"""
# Define the parameter grid
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],   # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node

}
grid_search_rf = GridSearchCV(estimator=rf_classifier,
                              param_grid=param_grid_rf,
                              cv=5,
                              scoring='f1_macro',
                              n_jobs=-1)

grid_search_rf.fit(k_train, y_train)

print("Best parameters:", grid_search_rf.best_params_)

# Get the best score
print("Best score:", grid_search_rf.best_score_)
"""

![rf.png](attachment:4f941eb3-711e-4814-815a-7eeaa3beddad.png)

In [None]:
# Random forest rebuilt with an explicit hyperparameter set.
# random_state is fixed (8, the seed used by the other seeded models in this notebook):
# a random forest is randomized, so without a seed its scores change on every run.
rf_grid = RandomForestClassifier(n_estimators= 200, max_depth = None,
                                min_samples_leaf=2, min_samples_split= 10,
                                class_weight='balanced',
                                random_state=8).fit(k_train, y_train)

# Validation predictions, reused by the report and confusion-matrix cells.
rf_grid_pred = rf_grid.predict(k_val)

In [None]:
# Per-class precision / recall / F1 of the tuned random forest on both splits.
report_inputs = (
    ("train", y_train, rf_grid.predict(k_train)),
    ("Validation", y_val, rf_grid_pred),
)
for split_name, true_labels, predicted_labels in report_inputs:
    print(f"Classification report of {split_name} data:")
    print(classification_report(true_labels, predicted_labels))

In [None]:
# 5-fold macro-F1 cross-validation of the tuned random forest.
# cross_val_score refits clones of the estimator on folds of (k_val, y_val).
rf_cross = cross_val_score(rf_grid, k_val, y_val, scoring='f1_macro', cv=5)
fold_mean = rf_cross.mean().round(2)
fold_std = rf_cross.std().round(2)
print("Mean:", fold_mean, "\n", "Standard Deviation: ", fold_std)
print("scores:", rf_cross)

In [None]:
# Confusion matrix of the tuned random forest on the validation set ('yes' listed first).
# The matrix is stored in `cm`, the name the display below reads. It used to be stored
# in `m`, so the plot silently reused a matrix computed for an earlier model.
cm = confusion_matrix(y_val, rf_grid_pred, labels=['yes', 'no'])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['yes', 'no'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix of RandomForestClassifier Model', fontsize=15, pad=20)
plt.xlabel('Prediction', fontsize=11)
plt.ylabel('Actual', fontsize=11)
plt.show()

1. Overall a good score among the models; every score is close to 50 %
2. Hyperparameter tuning significantly improved the model performance.
3. The mean changed from 0.64 to 0.73, an increase of almost 10 percentage points.
4. But the true positives of label yes are low compared to the SVC or LinearSVC model

* Just like Random Forest, 100% accuracy in the train data set,

* but because of data imbalance and less number of yes label, the scores are also low in the       validation set
* precision, recall, f1-score, macro-average  is same for their respective classes (maybe because of the class_weight parameter )

**Model 6: XGBClassifier**

In [None]:
# XGBClassifier needs numeric target classes, so the string labels are encoded as integers.
le = LabelEncoder()

y_train_xg = le.fit_transform(y_train) # learn the label -> integer mapping on the training labels

# Reuse the mapping learned above. Refitting the encoder on the validation labels
# could silently yield a different mapping if a class were missing from that split.
y_val_xg = le.transform(y_val) # encode the validation labels with the same mapping

# Show which original label each integer stands for
print("For 0 it is encoded as:" ,le.inverse_transform([0]))
print("For 1 it is encoded as:" ,le.inverse_transform([1]))

In [None]:
# Baseline XGBClassifier with default hyperparameters, trained on the integer-encoded target.
xgb_classifier = XGBClassifier()
xgb_classifier = xgb_classifier.fit(k_train, y_train_xg)

In [None]:
# 5-fold macro-F1 cross-validation of the baseline XGBClassifier.
# cross_val_score refits clones of the estimator on folds of (k_val, y_val_xg).
xgb_cross = cross_val_score(xgb_classifier, k_val, y_val_xg, scoring='f1_macro', cv=5)
fold_mean = xgb_cross.mean().round(2)
fold_std = xgb_cross.std().round(2)
print("Mean:", fold_mean, "\n", "Standard Deviation: ", fold_std)
print("scores:", xgb_cross)

In [None]:
# Per-class precision / recall / F1 of the baseline XGBClassifier on both splits.
# The legend line is printed with each report because the classes are integer-encoded.
report_inputs = (
    ("train", y_train_xg, xgb_classifier.predict(k_train)),
    ("Validation", y_val_xg, xgb_classifier.predict(k_val)),
)
for split_name, true_labels, predicted_labels in report_inputs:
    print(f"Classification report of {split_name} data:")
    print(" 0: 'no', 1:'yes'")
    print(classification_report(true_labels, predicted_labels))


In [None]:
"""

# Define the parameter grid for binary classification
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage used in update to prevents overfitting
    'max_depth': [3, 5, 7],  # Maximum depth of a tree
    'n_estimators': [100, 200, 300],  # Number of boosting rounds (trees)
    'subsample': [0.8, 0.9, 1.0],  # Subsample ratio of the training instance
    'colsample_bytree': [0.8, 0.9, 1.0]  # Subsample ratio of columns when constructing each tree
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=xgb_classifier,
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           scoring='f1_macro',  # Or other relevant binary classification metrics
                           n_jobs=-1)  # Use all available CPU cores

# Fit the GridSearchCV to the training data
grid_search.fit(k_train, y_train_xg)

# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
"""

![xgb.png](attachment:7f25319d-5917-4a85-95b4-6636d91a754e.png)

In [None]:
# XGBClassifier rebuilt with an explicit hyperparameter set.
xgb_grid = XGBClassifier(
    objective='binary:logistic',
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=1.0,
    colsample_bytree=1.0,
)
xgb_grid.fit(k_train, y_train_xg)

# Validation predictions (integer-encoded), reused by the report and confusion-matrix cells.
xgb_grid_pred = xgb_grid.predict(k_val)

In [None]:
# 5-fold macro-F1 cross-validation of the tuned XGBClassifier.
# cross_val_score refits clones of the estimator on folds of (k_val, y_val_xg).
xgb_cross = cross_val_score(xgb_grid, k_val, y_val_xg, scoring='f1_macro', cv=5)
fold_mean = xgb_cross.mean().round(2)
fold_std = xgb_cross.std().round(2)
print("Mean:", fold_mean, "\n", "Standard Deviation: ", fold_std)
print("scores:", xgb_cross)

In [None]:
# Per-class precision / recall / F1 of the tuned XGBClassifier on both splits.
# The legend line is printed with each report because the classes are integer-encoded.
report_inputs = (
    ("train", y_train_xg, xgb_grid.predict(k_train)),
    ("Validation", y_val_xg, xgb_grid_pred),
)
for split_name, true_labels, predicted_labels in report_inputs:
    print(f"Classification report of {split_name} data:")
    print(" 0: 'no', 1:'yes'")
    print(classification_report(true_labels, predicted_labels))

* Recall is low for yes label, which impacted f1 score
* macro avg seems to be

In [None]:
# Confusion matrix of the tuned XGBClassifier on the validation set.
# Two fixes compared with the earlier version of this cell:
#   * the targets are integer-encoded (0 = 'no', 1 = 'yes'), so the labels must be the
#     integers 1 and 0 - the strings '0' / '1' never occur in y_val_xg;
#   * the matrix is stored in `cm`, the name the display below reads (it used to be stored
#     in `m`, so the plot silently reused a matrix computed for an earlier model).
# Listing 1 before 0 keeps 'yes' first, matching display_labels and the other models' plots.
cm = confusion_matrix(y_val_xg, xgb_grid_pred, labels=[1, 0])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['yes', 'no'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix of XGBClassifier Model', fontsize=15, pad=20)
plt.xlabel('Prediction', fontsize=11)
plt.ylabel('Actual', fontsize=11)
plt.show()

1. The confusion matrix is the same as for every other model except SVC.
2. In SVC the true positives of the "yes" label were higher, but so were the false negatives for the 'no' label.
3. In the other models, as above, the false negatives for the no label decreased, and the correct predictions for the 'yes' label are low.

# Comparing Models

In [None]:
# Display names of the models, in the same order as the scores listed below.
name = ['Logistics Regression',"SVC",'Linear_SVC',
        'Random Forest Classifier',"DecisionTreeClassifier","XGBClassifier"]

# Mean cross-validation macro-F1 of each model. Every *_cross variable holds the scores
# of the most recent cross-validation run for that model family.
classifiers = [
    logi_cross.mean().round(2),
    svc_cross.mean().round(2),
    lin_svc_cross.mean().round(2),
    rf_cross.mean().round(2),
    tree_cross.mean().round(2),
    xgb_cross.mean().round(2)

]

# Map each model name to its score. Built with dict(zip(...)) rather than a
# `for name, score in zip(name, ...)` loop, which overwrote the `name` list with its last element.
scores = dict(zip(name, classifiers))

In [None]:
# Order the models from the highest to the lowest mean score (dicts keep insertion order).
ranked_items = sorted(scores.items(), key=lambda item: item[1], reverse=True)
sorted_scores = dict(ranked_items)
sorted_scores  # List the scores from highest to lowest

Observation:
 * Comparing the classification reports and the cross-validation scores of all the models,
* the Random Forest Classifier has done better than the others; hence it is chosen as the final model.

# Test Modification

In [None]:
# Read the test CSV and preview its first five rows.
test = pd.read_csv(input_test) # Loading test data as dataframe
test.head(n=5) # same as train data

In [None]:
# (number of rows, number of columns) of the test data.
test.shape

In the test data set, there are 10,000 rows and 15 columns.

In [None]:
# Apply the same column removal that was done on the training data,
# so the test frame has the columns the fitted preprocessing expects.
dropped_columns = ['poutcome', 'last contact date']
test = test.drop(columns=dropped_columns)
print("After dropping the columns, the shape is now", test.shape)

In [None]:
# Run the test data through the already-fitted preprocessing (transform only, no refit).
test_tr = transform_df.transform(test)
print(f"After pre-processing: {test_tr.shape}")

In [None]:
# Keep only the columns chosen by the fitted feature selector, reducing the test
# matrix to the same features the models were trained on.
k_test_tr = k_select.transform(test_tr)
print(f"Transformed Data shape: {k_test_tr.shape}")

# Predictions

In [None]:
# Use the random forest as the final model to predict the test set.
# The tuned forest (rf_grid) is the one whose cross-validation score entered the model
# comparison, so it is the model used here. The untuned rf_classifier scored 1.0 on the
# training data but much lower on validation data.
rf_pred_test = rf_grid.predict(k_test_tr) # predictions kept in a variable for the submission

# SUBMISSION

In [None]:
# Build the submission table: a running id (0, 1, 2, ...) and the predicted target.
# The id range is derived from the number of predictions instead of a hard-coded 10000,
# so the frame is still constructed correctly if the test set has a different size.
submission = pd.DataFrame({
    "id": np.arange(len(rf_pred_test)),
    'target': rf_pred_test,
})
submission.to_csv('submission.csv', index = False)