In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE  # imblearn library can be installed using pip install imblearn
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from imblearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the leads dataset and take a first look at its contents.
dataset = pd.read_csv("Leads X Education.csv")
pd.set_option('display.max_columns', None)  # show every column instead of eliding wide frames
print(dataset.head())
print(dataset.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emits a stray "None" after the report.
dataset.info()
print(dataset.describe())




                            Prospect ID     Lead Source  Converted  \
0  8cc8c611-a219-4f35-ad23-fdfd2656bd8a  Direct Traffic          1   
1  0cc2df48-7cf4-4e39-9de9-19797f9b38cc  Direct Traffic          0   
2  3256f628-e534-4826-9d63-4a8b88782852          Google          1   
3  9fae7df4-169d-489b-afe4-0f3d752542ed          Google          1   
4  2a369e35-ca95-4ca9-9e4f-9d27175aa320  Organic Search          1   

   TotalVisits  Total Time Spent on Website  Page Views Per Visit  \
0            2                         1532                   2.0   
1            1                          305                   1.0   
2            2                         1428                   1.0   
3            2                         1640                   2.0   
4            8                         1351                   8.0   

       Last Activity Country           Specialization  \
0       Email Opened   India  Business Administration   
1        Unreachable   India    Media and Advertis

In [None]:
# Strip the numeric ordering prefix ("01.", "02.", "03.") from the labels of both
# Asymmetrique index columns so that only "High" / "Medium" / "Low" remain.
index_label_cleanup = {"01.High": "High", "02.Medium": "Medium", "03.Low": "Low"}
for index_col in ("Asymmetrique Activity Index", "Asymmetrique Profile Index"):
    dataset[index_col] = dataset[index_col].replace(index_label_cleanup)

In [None]:
# Creating a list of Object data type columns.
# The `np.object` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24
# (accessing it raises AttributeError); the "object" dtype name selects the same
# columns and works on every NumPy version.
obj_cols = dataset.select_dtypes(include="object").columns.tolist()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


In [None]:
# Checking the categorical values in the Object columns
def check_value_counts(col_list, df=None):
    """Print, for each listed column, the share of rows taken by each distinct value.

    Args:
        col_list: Iterable of column labels to summarise.
        df: DataFrame to inspect. When omitted, the notebook-level ``dataset``
            is used, which is what existing one-argument calls rely on.

    Returns:
        None. For every column a separator line, the percentage table and a
        second separator line are printed.
    """
    frame = dataset if df is None else df
    separator = '-----------------------------'
    n_rows = frame.shape[0]
    for col in col_list:
        print(separator)
        # Percentages are taken over the total row count and rounded to two
        # decimals. value_counts() skips missing values by default, so a column
        # containing NaN will sum to less than 100.
        print(round((frame[col].value_counts() / n_rows) * 100, 2))
        print(separator)

# Print the percentage breakdown for every object-typed column collected in obj_cols.
check_value_counts(obj_cols)

-----------------------------
8cc8c611-a219-4f35-ad23-fdfd2656bd8a    0.03
06b92b61-bd27-4ecb-84f5-6a38c0082616    0.03
59306794-fa88-4bac-91e3-a87d4ecdfb83    0.03
92677aef-11cd-411c-b636-f601d673e46c    0.03
c7cc4b98-b373-48c1-bede-d2c81470f4cc    0.03
                                        ... 
f9898c5d-ea4d-4d40-a897-0e6a3873653a    0.03
216d2ac0-077f-4e65-8c39-598a3282afeb    0.03
72fb7233-ba78-4a3d-8c83-07ac1725cc6a    0.03
be8ed098-5821-4e04-b6c2-dbad5f224943    0.03
571b5c8e-a5b2-4d57-8574-f2ffb06fdeff    0.03
Name: Prospect ID, Length: 3474, dtype: float64
-----------------------------
-----------------------------
Google              39.95
Direct Traffic      39.44
Organic Search      17.47
Referral Sites       1.30
Olark Chat           1.07
Reference            0.58
Social Media         0.06
WeLearn              0.03
testone              0.03
Facebook             0.03
Welingak Website     0.03
Press_Release        0.03
Name: Lead Source, dtype: float64
---------------------

In [None]:
# Remove the columns that are not wanted as model inputs, then confirm the
# remaining schema.
unused_columns = [
    'Prospect ID',
    'How did you hear about X Education',
    'Country',
    'What matters most to you in choosing a course',
]
dataset = dataset.drop(columns=unused_columns)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3474 entries, 0 to 3473
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Lead Source                      3474 non-null   object 
 1   Converted                        3474 non-null   int64  
 2   TotalVisits                      3474 non-null   int64  
 3   Total Time Spent on Website      3474 non-null   int64  
 4   Page Views Per Visit             3474 non-null   float64
 5   Last Activity                    3474 non-null   object 
 6   Specialization                   3474 non-null   object 
 7   What is your current occupation  3474 non-null   object 
 8   Tags                             3474 non-null   object 
 9   Lead Quality                     3474 non-null   object 
 10  Lead Profile                     3474 non-null   object 
 11  Asymmetrique Activity Index      1943 non-null   object 
 12  Asymmetrique Profile

In [None]:
# Any 'Select' entry in the Specialization and Lead Profile columns is treated as
# an unfilled choice and relabelled as the 'Other' category.
for placeholder_col in ('Specialization', 'Lead Profile'):
    dataset[placeholder_col] = dataset[placeholder_col].replace('Select', 'Other')
dataset['Specialization'].value_counts()

Finance Management                   496
Other                                444
Marketing Management                 441
Human Resource Management            433
Operations Management                252
Business Administration              215
IT Projects Management               195
Supply Chain Management              184
Banking, Investment And Insurance    174
Travel and Tourism                   109
Media and Advertising                106
International Business                94
Healthcare Management                 81
Hospitality Management                60
Retail Management                     58
E-COMMERCE                            49
Rural and Agribusiness                36
E-Business                            30
Services Excellence                   17
Name: Specialization, dtype: int64

In [None]:
# Creating a list of object type columns (recomputed after the column drops above).
# `np.object` was deprecated in NumPy 1.20 and removed in NumPy 1.24, so the
# "object" dtype name is used instead; it selects the same columns.
obj_cols = dataset.select_dtypes(include="object").columns.tolist()
obj_cols

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


['Lead Source',
 'Last Activity',
 'Specialization',
 'What is your current occupation',
 'Tags',
 'Lead Quality',
 'Lead Profile',
 'Asymmetrique Activity Index',
 'Asymmetrique Profile Index']

In [None]:
# One-hot encode the categorical columns that are not given an explicit numeric
# code later on: each listed column is replaced by one indicator column per value.
nominal_columns = [
    'Lead Source',
    'Last Activity',
    'Specialization',
    'Tags',
    'What is your current occupation',
]
dataset = pd.get_dummies(dataset, columns=nominal_columns)

In [None]:
# Converting Categorical features into Numerical features.
# Each remaining text column is translated through an explicit label -> integer
# table. Series.map yields NaN for any label that is not a key of its table.
category_codes = {
    'Lead Quality': {'Might be': 0, 'Not Sure': 1, 'Worst': 2, 'Low in Relevance': 3, 'High in Relevance': 4},
    'Lead Profile': {'Other': 0, 'Potential Lead': 1, 'Other Leads': 2, 'Student of SomeSchool': 3,
                     'Dual Specialization Student': 4, 'Lateral Student': 5},
    'Asymmetrique Activity Index': {'Low': 0, 'Medium': 1, 'High': 2},
    'Asymmetrique Profile Index': {'Low': 0, 'Medium': 1, 'High': 2},
}
for column, codes in category_codes.items():
    dataset[column] = dataset[column].map(codes)

# info() prints its own report and returns None; calling it directly avoids the
# stray "None" line that print(dataset.info()) appends.
dataset.info()









<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3474 entries, 0 to 3473
Data columns (total 88 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   Converted                                               3474 non-null   int64  
 1   TotalVisits                                             3474 non-null   int64  
 2   Total Time Spent on Website                             3474 non-null   int64  
 3   Page Views Per Visit                                    3474 non-null   float64
 4   Lead Quality                                            3474 non-null   int64  
 5   Lead Profile                                            3474 non-null   int64  
 6   Asymmetrique Activity Index                             1943 non-null   float64
 7   Asymmetrique Profile Index                              1943 non-null   float64
 8   Asymmetrique Activity Score           

In [None]:
# Replace every remaining missing value with 0 and re-check the non-null counts.
# NOTE(review): 0 is also the code assigned to 'Low' in the two Asymmetrique index
# columns, so missing entries become indistinguishable from 'Low' - confirm that
# this is intended.
dataset = dataset.fillna(0)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3474 entries, 0 to 3473
Data columns (total 88 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   Converted                                               3474 non-null   int64  
 1   TotalVisits                                             3474 non-null   int64  
 2   Total Time Spent on Website                             3474 non-null   int64  
 3   Page Views Per Visit                                    3474 non-null   float64
 4   Lead Quality                                            3474 non-null   int64  
 5   Lead Profile                                            3474 non-null   int64  
 6   Asymmetrique Activity Index                             3474 non-null   float64
 7   Asymmetrique Profile Index                              3474 non-null   float64
 8   Asymmetrique Activity Score           

In [None]:
# Separate the target column from the predictors.
Y = dataset['Converted']               # labels
X = dataset.drop(columns='Converted')  # features: every other column

# Report the container types first, then the shapes, as a quick sanity check.
for part in (X, Y):
    print(type(part))
for part in (X, Y):
    print(part.shape)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(3474, 87)
(3474,)


In [None]:
# Normalizing numerical features so that each feature has mean 0 and variance 1.
# NOTE(review): the scaler is fitted on the full dataset before cross-validation,
# so the validation folds contribute to the scaling statistics. Moving the scaler
# into the pipeline would avoid that; it is kept here because later cells fit on
# X_scaled directly.
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

# Implementing Random Forest Classifier
# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
model = Pipeline([
        ('balancing', SMOTE(random_state = 101)),
        # max_features='sqrt' is what the former 'auto' alias meant for classifiers;
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
        ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1) )
    ])
grid_param = {'classification__n_estimators': [250,300,350,400,450]}

# In GridSearchCV(), the scoring parameter should be set as follows:
#   scoring = 'accuracy'  when you want to maximize prediction accuracy
#   scoring = 'recall'    when you want to minimize false negatives
#   scoring = 'precision' when you want to minimize false positives
#   scoring = 'f1'        when you want to balance false positives and false negatives
#                         (place equal emphasis on minimizing both)
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

# Feature importances of the refitted best model, labelled with the column names
# of X and sorted from most to least important.
featimp = pd.Series(gd_sr.best_estimator_.named_steps["classification"].feature_importances_, index=list(X)).sort_values(ascending=False)
print(featimp)

# Selecting features with higher significance and redefining feature set
X_ = dataset[['Tags_Will revert after reading the email','Total Time Spent on Website','Lead Quality','Tags_Ringing','Tags_Already a student']]

# A fresh scaler is fitted on the reduced feature set; note that this rebinds
# feature_scaler.
feature_scaler = StandardScaler()
X_scaled_ = feature_scaler.fit_transform(X_)

# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
model = Pipeline([
        ('balancing', SMOTE(random_state = 101)),
        # max_features='sqrt' is what the former 'auto' alias meant for classifiers;
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
        ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1) )
    ])
# Single-value grid: the search only cross-validates the 450-tree configuration.
grid_param = {'classification__n_estimators': [450]}

# In GridSearchCV(), the scoring parameter should be set as follows:
#   scoring = 'accuracy'  when you want to maximize prediction accuracy
#   scoring = 'recall'    when you want to minimize false negatives
#   scoring = 'precision' when you want to minimize false positives
#   scoring = 'f1'        when you want to balance false positives and false negatives
#                         (place equal emphasis on minimizing both)
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)

gd_sr.fit(X_scaled_, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

print(pd.DataFrame.from_dict(gd_sr.cv_results_)) # Use this line of code if you want to look at cross-validation results for each split

################################################################################

{'classification__n_estimators': 450}
0.9557615282678935
Tags_Will revert after reading the email      2.713208e-01
Total Time Spent on Website                   1.247514e-01
Lead Quality                                  8.340102e-02
Tags_Ringing                                  7.295988e-02
Tags_Already a student                        3.256148e-02
                                                  ...     
Last Activity_View in browser link Clicked    6.159576e-06
Last Activity_Email Marked Spam               6.085705e-06
Last Activity_Email Received                  3.244627e-06
Lead Source_WeLearn                           8.860774e-08
Lead Source_testone                           0.000000e+00
Length: 87, dtype: float64
{'classification__n_estimators': 450}
0.8934724021381456
   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       1.667948       0.02665         0.099557        0.003081   

  param_classification__n_estimators                                 params

In [None]:
# AdaBoost with SMOTE oversampling.
# The number of boosting rounds ('n_estimators') is tuned by 5-fold grid search.
grid_param = {'classification__n_estimators': [1,2,3,4,5,10,20,30,40,50,100]}
pipeline_steps = [
    ('balancing', SMOTE(random_state=101)),
    ('classification', AdaBoostClassifier(random_state=1)),
]
model = Pipeline(pipeline_steps)

# Choosing the GridSearchCV scoring parameter:
#   'accuracy'  -> maximize prediction accuracy
#   'recall'    -> minimize false negatives
#   'precision' -> minimize false positives
#   'f1'        -> balance false positives and false negatives (equal emphasis on both)
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)
gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
best_result = gd_sr.best_score_  # mean cross-validated score of the best estimator
print(best_parameters)
print(best_result)

# Feature importances of the refitted best model, labelled with the column names
# of X and listed from most to least important.
fitted_booster = gd_sr.best_estimator_.named_steps["classification"]
featimp = pd.Series(fitted_booster.feature_importances_, index=X.columns.tolist()).sort_values(ascending=False)
print(featimp)

{'classification__n_estimators': 100}
0.9541281398430638
Total Time Spent on Website              0.33
Page Views Per Visit                     0.09
Lead Quality                             0.06
Asymmetrique Activity Score              0.06
TotalVisits                              0.05
                                         ... 
Specialization_Services Excellence       0.00
Last Activity_Email Link Clicked         0.00
Specialization_Travel and Tourism        0.00
Last Activity_Converted to Lead          0.00
Specialization_Hospitality Management    0.00
Length: 87, dtype: float64


In [None]:

##################################################################################
# Support Vector Classifier with SMOTE oversampling.
# The kernel type and the regularisation strength C are tuned by 5-fold grid search.
grid_param = {
    'classification__kernel': ['linear','poly', 'rbf', 'sigmoid'],
    'classification__C': [.001,.01,.1,1,10,100],
}
pipeline_steps = [
    ('balancing', SMOTE(random_state=101)),
    ('classification', SVC(random_state=1)),
]
model = Pipeline(pipeline_steps)

# Choosing the GridSearchCV scoring parameter:
#   'accuracy'  -> maximize prediction accuracy
#   'recall'    -> minimize false negatives
#   'precision' -> minimize false positives
#   'f1'        -> balance false positives and false negatives (equal emphasis on both)
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)
gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
best_result = gd_sr.best_score_  # mean cross-validated score of the best estimator
print(best_parameters)
print(best_result)

##################################################################################
# Implementing Logistic Regression
# Tuning eta0, max_iter, alpha, and l1 ratio parameters and implementing cross-validation using Grid Search
model = Pipeline([
        ('balancing', SMOTE(random_state = 101)),
        # 'log_loss' is the current name of the logistic-regression loss; the old
        # spelling 'log' was deprecated in scikit-learn 1.1 and removed in 1.3.
        # Requires scikit-learn >= 1.1.
        ('classification', SGDClassifier(loss = 'log_loss', penalty = 'elasticnet', random_state = 1))
    ])
# NOTE(review): SGDClassifier's default learning_rate schedule ('optimal') does not
# use eta0, so the eta0 axis probably multiplies the number of fits without
# changing the fitted models - confirm, or set learning_rate explicitly.
grid_param = {'classification__eta0': [.001,.01,.1,1,10,100], 'classification__max_iter' : [100,500,1000], 'classification__alpha': [.001, .01,.1, 1,10,100], 'classification__l1_ratio': [0,0.3,0.5,0.7,1]}

# In GridSearchCV(), the scoring parameter should be set as follows:
#   scoring = 'accuracy'  when you want to maximize prediction accuracy
#   scoring = 'recall'    when you want to minimize false negatives
#   scoring = 'precision' when you want to minimize false positives
#   scoring = 'f1'        when you want to balance false positives and false negatives
#                         (place equal emphasis on minimizing both)
# NOTE(review): recall on its own can be maximised by a model that predicts every
# lead as converted; check precision or f1 before trusting a near-perfect score.
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

# Multi-layer Perceptron (MLP) with SMOTE oversampling.
# Activation function, initial learning rate and iteration budget are tuned by
# 5-fold grid search.
grid_param = {
    'classification__activation': ['logistic', 'tanh', 'relu'],
    'classification__learning_rate_init': [.001,.01,.1,1,10,100],
    'classification__max_iter' : [100,500,1000],
}
# hidden_layer_sizes fixes the number of hidden layers and the units in each of
# them (here one layer of 50 units); it could be tuned as a hyperparameter too.
pipeline_steps = [
    ('balancing', SMOTE(random_state=101)),
    ('classification', MLPClassifier(hidden_layer_sizes=(50,), random_state=1)),
]
model = Pipeline(pipeline_steps)

# Choosing the GridSearchCV scoring parameter:
#   'accuracy'  -> maximize prediction accuracy
#   'recall'    -> minimize false negatives
#   'precision' -> minimize false positives
#   'f1'        -> balance false positives and false negatives (equal emphasis on both)
gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)
gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
best_result = gd_sr.best_score_  # mean cross-validated score of the best estimator
print(best_parameters)
print(best_result)

{'classification__C': 0.1, 'classification__kernel': 'poly'}
0.9819776358303182




{'classification__alpha': 1, 'classification__eta0': 0.001, 'classification__l1_ratio': 0.5, 'classification__max_iter': 100}
1.0




{'classification__activation': 'logistic', 'classification__learning_rate_init': 1, 'classification__max_iter': 100}
0.9552225249772934
