In [None]:
import warnings 
warnings.filterwarnings('ignore')

In [None]:
pip install lightgbm==3.3.2

pip install projectpro

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
#suppress all warnings
warnings.filterwarnings("ignore")
from projectpro import checkpoint, feedback, show_video, save_point, preserve

In [None]:
#The maximum number of columns displayed when a frame is pretty-printed. By setting this limit we can see 200 columns at once without truncation.
pd.set_option('display.max_columns', 200)

# Introductory Video

In [None]:
# NOTE(review): the argument string itself contains a pair of double quotes
# (presumably around a video ID) — confirm show_video expects them.
show_video('"TXMFCf79-2g"')

In [None]:
data=pd.read_csv('Marketing_Data.csv')
data.head()

# Exploratory Data Analysis

In [None]:
data.shape

In [None]:
data.columns

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
# Data refining: parse every timestamp column (day-month-year hour:minute) into datetimes.
timestamp_format = "%d-%m-%Y %H:%M"
for column_name in ['Lead created', 'Next activity', 'Lead Last Update time', 'Demo Date']:
    data[column_name] = pd.to_datetime(data[column_name], format=timestamp_format)
checkpoint("fcMar1")

In [None]:
data.info()


In [None]:
data['Lead Owner'].unique()

In [None]:
data['Lead Owner'].value_counts()

In [None]:
data['Interest Level'].unique()

In [None]:
data['Interest Level'].value_counts()

In [None]:
data['What do you do currently ?'].unique()

In [None]:
data['What do you do currently ?'].value_counts()
#There are so many unique values

In [None]:
data['Creation Source'].unique()

In [None]:
data['Creation Source'].value_counts()

This feature looks well balanced in terms of unique values

In [None]:
data['What are you looking for in Product ?'].unique()

In [None]:
data['What are you looking for in Product ?'].value_counts()

This feature has a lot of unique values

In [None]:
data['Website Source'].unique()

In [None]:
data['Website Source'].value_counts()

This feature has moderate unique values

In [None]:
data['Marketing Source'].unique()

This feature has a large number of distinct values (high cardinality), which might confuse the model

In [None]:
data['Demo Status'].unique()

In [None]:
data['Demo Status'].value_counts()

It would be a great addition in developing the model

In [None]:
data['Lead Location(Manual)'].value_counts()

This feature has a large number of distinct values (high cardinality), which might confuse the model

# Data Processing & Feature engineering

In [None]:
data.info()

In [None]:
#Finding the missing data
print(f"Lead Owner:{data['Lead Owner'].isna().sum()}")
print(f"Interest Level:{data['Interest Level'].isna().sum()}")

In [None]:
# Keep only the rows where the target column 'Interest Level' is present.
data = data.dropna(subset=['Interest Level'])

Rows with a missing `Interest Level` (the target) have been dropped rather than imputed, so no NaN values should remain in this column.

In [None]:
data['Interest Level'].isna().sum()

In [None]:
data['Interest Level'].value_counts()

Now we will handle our target variable
Since the target variable has multiple values and I want to formulate the problem as a binary classification problem, I will make the following assignments.

Label assignment:

Slightly Interested = 1

Not Interested=0

No Answer=0

Fairly Interested=1

Very Interested=1

I will drop rows where the value is 'Not called', 'Closed' or 'Invalid Number'.

In [None]:
# Discard leads without a usable outcome. The explicit .copy() makes the result an
# independent frame, so the later assignment to 'Interest Level' modifies it
# directly instead of writing into a slice of the previous frame.
data=data[~data['Interest Level'].isin(['Closed','Invalid Number','Not called'])].copy()

In [None]:
data['Interest Level'].value_counts()

In [None]:
# Binary target: 1 for any of the "interested" levels, 0 for every other value.
interested_levels = ['Slightly Interested', 'Fairly Interested', 'Very Interested']
data['Interest Level'] = data['Interest Level'].isin(interested_levels).astype('int64')


In [None]:
data['Interest Level'].value_counts()

In [None]:
# Remove the columns that are not used in the rest of the notebook.
unused_columns = [
    "Lead Id",
    "Lead Location(Auto)",
    "Next activity",
    "What are you looking for in Product ?",
    "Lead Last Update time",
    "Lead Location(Manual)",
    "Demo Date",
    "Demo Status",
    "Closure date",
]
data = data.drop(columns=unused_columns)

In [None]:
data.info()

In [None]:
#Data refining
# 'Lead created' was already parsed with this same format in the earlier
# "Data refining" cell, so when the notebook runs top to bottom this conversion
# is expected to leave the column unchanged.
data['Lead created']=pd.to_datetime(data['Lead created'],format="%d-%m-%Y %H:%M")
# NOTE(review): checkpoint("fcMar1") is also called in that earlier cell —
# confirm the repeated call is intended.
checkpoint("fcMar1")

In [None]:
data.info()

In [None]:
data['Lead created']

In [None]:
# Derive hour-of-day and weekday (Monday=0) features from the lead creation timestamp.
lead_created = data['Lead created']
data = data.assign(
    hour_of_the_day=lead_created.dt.hour,
    day_of_week=lead_created.dt.weekday,
)

In [None]:
data.info()

In [None]:
data['day_of_week']

In [None]:
data=data.drop('Lead created',axis=1)

In [None]:
data.info()

In [None]:
data['Creation Source'].value_counts()

In [None]:
from pandas import factorize

In [None]:
labels,categories=factorize(data['Creation Source'])

In [None]:
data['labels']=labels
abs(data['Interest Level'].corr(data['labels']))

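The value above is the absolute correlation between the encoded feature and the target variable. Because `abs()` removes the sign and the factorized codes are arbitrary integers, it indicates only the strength of the association, not a positive or negative direction.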

In [None]:
data = data.drop(["labels"], axis=1)

In [None]:
data['What do you do currently ?'].value_counts()

This feature is dominated by students

BINARIZATION


student = 1
others = 0
As we saw earlier, this feature has a large number of values of which students are a dominating part.

We will binarize this column into students and non-students

Facts

Binarization is the process of dividing data into two groups and assigning one of two values to all the members of the same group. This is usually accomplished by defining a threshold t and assigning the value 0 to all the data points below the threshold and 1 to those above it.

In [None]:
data['What do you do currently ?'].isna().sum()

In [None]:
data['What do you do currently ?'].value_counts(normalize=1)

In [None]:
def _mentions_student(answer):
    """Return 1 when the answer text contains 'student' (case-insensitive), else 0."""
    normalized = str(answer).strip().lower()
    if 'student' in normalized:
        return 1
    return 0

# Binarize the occupation column: students -> 1, everyone else (including missing) -> 0.
data['What do you do currently ?'] = data['What do you do currently ?'].map(_mentions_student)

In [None]:
data['What do you do currently ?'].value_counts()

In [None]:
data['Website Source'].isna().sum()

In [None]:
data=data.drop('Website Source',axis=1)

In [None]:
data.info()

In [None]:
data['Marketing Source'].isna().sum()

Marketing Source has a large number of missing values, and imputing them would add noise.

Rather, let's create a new value Unknown which will be substituted for NA values

In [None]:
# Assign the filled column back rather than calling fillna(inplace=True) on the
# selected column: in-place mutation through a column selection is deprecated in
# pandas and leaves `data` unchanged when Copy-on-Write is enabled.
data['Marketing Source']=data['Marketing Source'].fillna('Unknown')

In [None]:
data['Marketing Source'].value_counts()

Imputing the missing values with Unknown led to better results than dropping these rows.

# Label Encoding

Transforming variables is an important step in the data preprocessing pipeline of machine learning, as it helps to convert the data into a format that is suitable for analysis and modeling. There are several ways to transform variables, depending on the type and nature of the data.

Categorical variables, for example, are variables that take on discrete values from a finite set of categories, such as colors, gender, or occupation. One common way to transform categorical variables is through one-hot encoding. One-hot encoding involves creating a new binary variable for each category in the original variable, where the value is 1 if the observation belongs to that category and 0 otherwise. This approach is useful when the categories have no natural order or ranking.

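Another way to transform categorical variables is through label encoding. Label encoding involves assigning a unique integer value to each category in the variable. This approach is most natural when the categories have an order or ranking, such as low, medium, and high. The features encoded below are nominal, so the integer codes carry no meaningful order; this is acceptable here because the tree-based models used later can split on arbitrary codes. Transforming categorical features into numerical labels: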

In [None]:
label_encoder1=preprocessing.LabelEncoder()

In [None]:
data['Marketing Source']=label_encoder1.fit_transform(data['Marketing Source'])
save_point("fcMar1")

In [None]:
label_encoder2=preprocessing.LabelEncoder()


In [None]:
data['Creation Source']=label_encoder2.fit_transform(data['Creation Source'])

In [None]:
# Encode each lead owner as an integer code; the fitted encoder is kept in label_encoder3.
label_encoder3 = preprocessing.LabelEncoder()
label_encoder3.fit(data['Lead Owner'])
data['Lead Owner'] = label_encoder3.transform(data['Lead Owner'])

In [None]:
data['Lead Owner'].value_counts()

In [None]:
data.head()

# Model Building and Testing

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
# NOTE: plot_roc_curve / plot_precision_recall_curve were deprecated in scikit-learn 1.0
# and removed in 1.2; drop them from the next line when upgrading scikit-learn.
from sklearn.metrics import accuracy_score, precision_recall_curve, roc_curve, plot_roc_curve, plot_precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
# Feature matrix (x) and binary target (y) used for training and evaluation.
feature_columns = [
    'Lead Owner',
    'Creation Source',
    'What do you do currently ?',
    'Marketing Source',
    'hour_of_the_day',
    'day_of_week',
]
x = data[feature_columns]
y = data['Interest Level']

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

Difference between Bagging and Boosting

The two ensemble techniques are also used in different situations. Bagging methods are typically applied to learners with high variance and low bias, such as decision trees, because they tend to overfit, whereas boosting methods are employed when the base learners have low variance and high bias. While bagging can help prevent overfitting, boosting methods are more vulnerable to it because they keep building on weak learners to minimise the remaining error. This can lead to overfitting on the training data, but limiting the number of models generated, hyperparameter tuning, or regularization can help if overfitting is encountered.

In [None]:
# Seed every estimator (same seed as the train/test split) so that a fresh
# "Restart & Run All" reproduces the same fitted models and metrics.
rf_model=RandomForestClassifier(n_estimators=300,random_state=42)
xgb_model=XGBClassifier(n_estimators=300,objective='binary:logistic',tree_method='hist',eta=0.1,max_depth=3,random_state=42)
lgb_model=LGBMClassifier(n_estimators=300,random_state=42)


In [None]:
rf_model.fit(x_train,y_train)
xgb_model.fit(x_train,y_train)
lgb_model.fit(x_train,y_train)

# Model Evaluation

In [None]:
def evaluate_model(model_name,model,prediction,actual):
    """Print the accuracy of a set of predictions against the true labels.

    Parameters
    ----------
    model_name : str
        Label used in the printed line.
    model : object
        The estimator that produced the predictions. Not used by this
        function; kept so existing calls keep working.
    prediction : array-like
        Predicted labels.
    actual : array-like
        Ground-truth labels.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    # accuracy_score is declared as (y_true, y_pred): pass the ground truth first.
    print('Accuracy of %s:'% model_name,accuracy_score(actual,prediction))

In [None]:
# Report test-set accuracy for each fitted model, in the same order as before.
fitted_models = (
    ('random forest', rf_model),
    ('Xgboost', xgb_model),
    ('LightGBM', lgb_model),
)
for display_name, estimator in fitted_models:
    evaluate_model(display_name, estimator, estimator.predict(x_test), y_test)

In [None]:
# Precision-recall and ROC curves for the random forest on the test split.
# The Display.from_estimator API replaces plot_precision_recall_curve / plot_roc_curve,
# which were removed in scikit-learn 1.2.
PrecisionRecallDisplay.from_estimator(rf_model, x_test, y_test)
RocCurveDisplay.from_estimator(rf_model, x_test, y_test)

In [None]:
# Precision-recall and ROC curves for XGBoost on the test split.
# The Display.from_estimator API replaces plot_precision_recall_curve / plot_roc_curve,
# which were removed in scikit-learn 1.2.
PrecisionRecallDisplay.from_estimator(xgb_model, x_test, y_test)
RocCurveDisplay.from_estimator(xgb_model, x_test, y_test)

In [None]:
# Precision-recall and ROC curves for LightGBM on the test split.
# The Display.from_estimator API replaces plot_precision_recall_curve / plot_roc_curve,
# which were removed in scikit-learn 1.2.
PrecisionRecallDisplay.from_estimator(lgb_model, x_test, y_test)
RocCurveDisplay.from_estimator(lgb_model, x_test, y_test)

After looking at the PR and ROC curves above, we can conclude that LightGBM gives the best results among the three models compared.