<a href="https://colab.research.google.com/github/KrishnaKakani-GitHub/Introduction-To-Github/blob/main/Project2_Lead_Conversion_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter("ignore", ConvergenceWarning)

# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Library to split data
from sklearn.model_selection import train_test_split

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)

# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)

# setting the precision of floating numbers to 5 decimal points
pd.set_option("display.float_format", lambda x: "%.5f" % x)

# To build model for prediction
import statsmodels.stats.api as sms
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [None]:
# To tune different models
from sklearn.model_selection import GridSearchCV

# To get different metric scores
import sklearn.metrics as metrics
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    precision_recall_curve,
    roc_curve,
    make_scorer,
)


In [None]:
# Importing my drive
# Mounts Google Drive at /content/drive (relies on the google.colab package).
# NOTE(review): the dataset is read later from the relative path 'ExtraaLearn.csv',
# not from a path under /content/drive — confirm where the file actually lives.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Importing the Dataset

# Read the raw dataset once and keep this frame untouched
learn = pd.read_csv('ExtraaLearn.csv')

# Work on a copy so that no later step can modify the original data
data = learn.copy()

In [None]:
# Viewing the first 5 rows of the dataset

data.head()

In [None]:
# Viewing the last 5 rows of the dataset

data.tail()

In [None]:
# Understand the shape of the data

data.shape

In [None]:
# Check the data types and non-null counts of the columns for the dataset

# info is a method: without the parentheses the cell only shows the bound method
data.info()

In [None]:
# Checking for duplicate values: number of rows that repeat an earlier row
data.duplicated().sum()

## Exploratory Data Analysis

In [None]:
# Let's check the statistical summary of the data:

data.describe(include='all').T


In [None]:
# Making a list of all categorical variables

cat_col = list(data.select_dtypes("object").columns)

# Printing count of each unique value in each column
for column in cat_col:
    print(data[column].value_counts())
    print("-" * 50)

In [None]:
# Checking the number of unique values in ID, then dropping the column
# The guard keeps the cell re-runnable once ID has already been removed
if "ID" in data.columns:
    print("Unique ID values:", data["ID"].nunique(), "out of", len(data), "rows")
    data = data.drop(columns=["ID"])

In [None]:
# Univariate Analysis - function to plot a boxplot and a histogram along the same scale

def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined, stacked on a shared x-axis

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None, i.e., seaborn's default binning)
    """
    # Two stacked axes: a short one for the boxplot, a taller one for the histogram
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid = 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )

    # Boxplot; showmeans=True adds a marker for the mean value of the column
    sns.boxplot(
        data=data,
        x=feature,
        ax=ax_box2,
        showmeans=True,
        color="violet",
    )

    # Histogram; `bins` is forwarded only when given so seaborn's default applies otherwise.
    # No palette is passed: without a hue variable it has no effect on the plot.
    hist_kwargs = {"bins": bins} if bins else {}
    sns.histplot(
        data=data,
        x=feature,
        kde=kde,
        ax=ax_hist2,
        **hist_kwargs,
    )

    # Add mean to the histogram (dashed green line)
    ax_hist2.axvline(
        data[feature].mean(),
        color="green",
        linestyle="--",
    )

    # Add median to the histogram (solid black line)
    ax_hist2.axvline(
        data[feature].median(),
        color="black",
        linestyle="-",
    )
# Example usage
histogram_boxplot(data, "age")


In [None]:
# Observations on wesbite_visits:

histogram_boxplot(data, "website_visits")

In [None]:
# To check how many leads have not visited web-site

data[data["website_visits"] == 0].shape

In [None]:
# Observations on number of time_spent_on_website

histogram_boxplot(data, "time_spent_on_website")

In [None]:
# Observations on number of page_views_per_visit

histogram_boxplot(data, "page_views_per_visit")

In [None]:
# Function to create labeled barplots

def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with the count (or percentage) annotated on top of each bar

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """

    total = len(data[feature])  # length of the column (denominator for percentages)
    count = data[feature].nunique()

    # Figure width grows with the number of bars that will be drawn
    n_bars = count if n is None else n
    plt.figure(figsize=(n_bars + 1, 5))

    plt.xticks(rotation=90, fontsize=15)

    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # keep the n most frequent levels, then show them in sorted label order
        order=data[feature].value_counts().index[:n].sort_values(),
    )

    for p in ax.patches:
        height = p.get_height()
        if np.isnan(height):
            continue  # nothing to annotate for an empty bar

        if perc:
            label = "{:.1f}%".format(100 * height / total)  # percentage of each class
        else:
            # bar heights come back as floats; counts are shown as whole numbers
            label = "{:d}".format(int(round(height)))

        x = p.get_x() + p.get_width() / 2  # horizontal centre of the bar
        y = height  # top of the bar

        # Place the label just above the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )

    plt.show()

In [None]:
# Share of leads in each current_occupation level

labeled_barplot(data=data, feature="current_occupation", perc=True)

In [None]:
# Observations on first_interaction

labeled_barplot(data, "first_interaction", perc=True)

In [None]:
# Observations on profile_completed

labeled_barplot(data, "profile_completed", perc=True)

In [None]:
# Observations on last_activity
labeled_barplot(data, "last_activity", perc=True)

In [None]:
# Observations on print_media_type1

labeled_barplot(data, "print_media_type1", perc=True)

In [None]:
# Observations on print_media_type2

labeled_barplot(data, "print_media_type2", perc=True)

In [None]:
# Observations on digital_media

labeled_barplot(data, "digital_media", perc=True)

In [None]:
# Observations on educational_channels

labeled_barplot(data, "educational_channels", perc=True)

In [None]:
# Observations on referral

labeled_barplot(data, "referral", perc=True)

In [None]:
# Observations on status

labeled_barplot(data, "status", perc=True)

In [None]:
# Bivariate Analysis

# Pairwise correlations between the numeric columns
cols_list = list(data.select_dtypes(include=np.number).columns)
corr_matrix = data[cols_list].corr()

# Annotated heatmap on a fixed [-1, 1] colour scale
plt.figure(figsize=(12, 7))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="Spectral", vmin=-1, vmax=1)
plt.show()


In [None]:
# Creating functions that will help us with further analysis

### function to plot distributions wrt target
def distribution_plot_wrt_target(data, predictor, target):
    """
    Plot a predictor's distribution for the first two target classes, plus boxplots

    data: dataframe
    predictor: independent variable (column whose distribution is plotted)
    target: target variable (column used to split the data)
    """
    fig, axs = plt.subplots(2, 2, figsize=(12, 10))
    target_uniq = data[target].unique()

    # Top row: one density histogram (with KDE) per target class
    for position, colour in enumerate(("teal", "orange")):
        level = target_uniq[position]
        axs[0, position].set_title("Distribution of target for target=" + str(level))
        sns.histplot(
            data=data[data[target] == level],
            x=predictor,
            kde=True,
            ax=axs[0, position],
            color=colour,
            stat="density",
        )

    # Bottom row: boxplots per target class, with and without outliers
    axs[1, 0].set_title("Boxplot w.r.t target")
    sns.boxplot(
        data=data,
        x=target,
        y=predictor,
        ax=axs[1, 0],
        palette="gist_rainbow",
    )

    axs[1, 1].set_title("Boxplot (without outliers) w.r.t target")
    sns.boxplot(
        data=data,
        x=target,
        y=predictor,
        ax=axs[1, 1],
        showfliers=False,
        palette="gist_rainbow",
    )

    plt.tight_layout()
    plt.show()

def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    # Rows are ordered by the least frequent target class (last entry of value_counts)
    sorter = data[target].value_counts().index[-1]

    # Absolute counts, including the "All" margins
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)

    # Row-normalised shares, which are what the stacked bars show
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 5))

    # One legend, anchored outside the axes so it does not cover the bars
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()




In [None]:
# Age can be a good factor to differentiate between such leads

# Spread of age within each occupation group
plt.figure(figsize=(10, 5))
sns.boxplot(data=data, x="current_occupation", y="age")
plt.show()


In [None]:
data.groupby(["current_occupation"])["age"].describe()


In [None]:
# The company's first interaction with leads should be compelling and persuasive. Let's see if the channels of the first interaction have an impact on the conversion of leads

stacked_barplot(data, "first_interaction", "status")

In [None]:
distribution_plot_wrt_target(data, "time_spent_on_website", "status")

In [None]:
# checking the median value
data.groupby(["status"])["time_spent_on_website"].median()

In [None]:
distribution_plot_wrt_target(data, "website_visits", "status") #

In [None]:
distribution_plot_wrt_target(data, "page_views_per_visit", "status")

In [None]:
stacked_barplot(data, "profile_completed", "status")

In [None]:
stacked_barplot(data, "last_activity", "status")

In [None]:
stacked_barplot(data, "print_media_type1", "status")


In [None]:
stacked_barplot(data, "print_media_type2", "status")

In [None]:
stacked_barplot(data, "digital_media", "status")

In [None]:
stacked_barplot(data, "educational_channels", "status")

In [None]:
stacked_barplot(data, "referral", "status")

In [None]:
# Outlier Check

# Outlier detection using boxplot
# 'status' is the target variable, not a predictor, so it is left out of the check
numeric_columns = [
    col for col in data.select_dtypes(include=np.number).columns if col != "status"
]

# Four boxplots per row; the number of rows follows the number of columns so the
# grid never runs out of cells, and the height keeps each subplot the same size
plots_per_row = 4
n_rows = max(1, int(np.ceil(len(numeric_columns) / plots_per_row)))

plt.figure(figsize=(15, 3 * n_rows))

for i, variable in enumerate(numeric_columns):
    plt.subplot(n_rows, plots_per_row, i + 1)
    plt.boxplot(data[variable], whis=1.5)
    plt.title(variable)

# Adjust the spacing once, after every subplot and title exists
plt.tight_layout()
plt.show()

In [None]:
# Data Preparation for modeling

# Separate the target from the predictors
Y = data["status"]
X = data.drop(columns=["status"])

# One-hot encode the categorical predictors, dropping the first level of each
X = pd.get_dummies(X, drop_first=True)

# Splitting the data in 70:30 ratio for train to test data
split_parts = train_test_split(X, Y, test_size=0.30, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Sizes of the two partitions
print("Shape of Training set : ", X_train.shape)
print("Shape of test set : ", X_test.shape)

# Class balance within each partition
print("Percentage of classes in training set:")
print(y_train.value_counts(normalize=True))

print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))


In [None]:
# Building Classification Models

# Function to print the classification report and get confusion matrix in a proper format
def metrics_score(actual, predicted):
    """
    Print the classification report and plot the confusion matrix as a heatmap

    actual: true class labels
    predicted: class labels predicted by a model
    """
    print(classification_report(actual, predicted))

    cm = confusion_matrix(actual, predicted)

    # NOTE(review): the tick labels assume a binary target ordered as
    # (not converted, converted) — confirm against the encoding of `status`
    class_labels = ['Not Converted', 'Converted']

    plt.figure(figsize=(8, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',  # cells hold integer counts, so show them without decimals
        xticklabels=class_labels,
        yticklabels=class_labels
    )

    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()


In [None]:
# Building Decision Tree Model

# Fitting the decision tree classifier on the training data
# random_state is fixed so that re-running the notebook rebuilds the same tree
d_tree = DecisionTreeClassifier(random_state=7)
d_tree.fit(X_train, y_train)

# Making predictions on the training data
prediction = d_tree.predict(X_train)


In [None]:
# Checking model performance on training set

# Predictions of the untuned tree on the data it was trained on
# (the variable must hold model output, not the true labels)
y_pred_train1 = d_tree.predict(X_train)
metrics_score(y_train, y_pred_train1)

In [None]:
# Evaluate on the held-out test data to see whether the model is overfitting.

# Predict on the test set, then report the metrics
prediction_test = d_tree.predict(X_test)
metrics_score(actual=y_test, predicted=prediction_test)

In [None]:
# Decision Tree - Hyperparameter Tuning

# Choose the type of classifier; class 1 is weighted more heavily than class 0
d_tree_tuned = DecisionTreeClassifier(
    random_state=7,
    class_weight={0: 0.3, 1: 0.7}
)

# Grid of parameters to choose from
parameters = {
    'max_depth': np.arange(2, 10),
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [5, 10, 20, 25]
}

# Type of scoring used to compare parameter combinations - recall score for class 1
scorer = metrics.make_scorer(recall_score, pos_label=1)

# Run the grid search with 5-fold cross-validation
grid_obj = GridSearchCV(d_tree_tuned, parameters, scoring=scorer, cv=5)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the classifier to the best combination of parameters.
# With the default refit=True, GridSearchCV has already refit this estimator on
# the full training data, so a further .fit call would only repeat that work.
d_tree_tuned = grid_obj.best_estimator_

# Display the selected model
d_tree_tuned

In [None]:
# Let's check the model performance on the testing data

# Predict with the tuned tree and report its metrics on the testing data
y_pred_test2 = d_tree_tuned.predict(X_test)
metrics_score(y_test, y_pred_test2)


In [None]:
# Visualizing the Decision Tree

# Column names in the order the tree was trained on, used to label the split nodes
features = list(X_train.columns)

plt.figure(figsize=(20, 20))
tree.plot_tree(
    d_tree_tuned,
    feature_names=features,
    filled=True,
    fontsize=9,
    node_ids=True,
    class_names=True,
)
plt.show()

In [None]:
# Looking at Feature Importance

importances = d_tree_tuned.feature_importances_

# Table of importances, most important feature first
importance_table = pd.DataFrame(importances, columns=["Imp"], index=X_train.columns)
print(importance_table.sort_values(by='Imp', ascending=False))

# Horizontal bar chart, bars ordered from least to most important
indices = np.argsort(importances)
bar_positions = range(len(indices))
bar_labels = [X_train.columns[i] for i in indices]

plt.figure(figsize=(10, 10))
plt.title('Feature Importances')
plt.barh(bar_positions, importances[indices], color='violet', align='center')
plt.yticks(bar_positions, bar_labels)
plt.xlabel('Relative Importance')
plt.show()

In [None]:
# Random Forest Classifier

# Fitting the random forest classifier on the training data
# random_state is fixed so that re-running the notebook rebuilds the same forest
rf_estimator = RandomForestClassifier(random_state=7)
rf_estimator.fit(X_train, y_train)


In [None]:
# Checking performance on the training data
y_pred_train3 = rf_estimator.predict(X_train)
metrics_score(y_train, y_pred_train3)

In [None]:
# Report the untuned random forest's metrics on the testing data
y_pred_test3 = rf_estimator.predict(X_test)
metrics_score(actual=y_test, predicted=y_pred_test3)


In [None]:
# Random Forest Classifier - Hyperparameter Tuning

# Choose the type of classifier
rf_estimator_tuned = RandomForestClassifier(
    criterion="entropy",
    random_state=7
)

# Grid of parameters to choose from
parameters = {
    "n_estimators": [110, 120],
    "max_depth": [6, 7],
    "min_samples_leaf": [20, 25],
    "max_features": [0.8, 0.9],
    # A float is a fraction of the training rows and None uses all of them.
    # An integer is an absolute row count, so 1 would give each tree one sample.
    "max_samples": [0.9, None],
    "class_weight": ["balanced", {0: 0.3, 1: 0.7}]
}

# Type of scoring used to compare parameter combinations - recall score for class 1
scorer = metrics.make_scorer(recall_score, pos_label=1)

# Run the grid search on the training data using scorer=scorer and cv=5
grid_obj = GridSearchCV(rf_estimator_tuned, parameters, scoring=scorer, cv=5)
grid_obj = grid_obj.fit(X_train, y_train)

# Save the best estimator to variable rf_estimator_tuned.
# With the default refit=True, GridSearchCV has already refit it on the full
# training data, so no additional .fit call is needed.
rf_estimator_tuned = grid_obj.best_estimator_

# Checking performance on the training data
y_pred_train4 = rf_estimator_tuned.predict(X_train)
metrics_score(y_train, y_pred_train4)

In [None]:
# Let's check the model performance on the test data

# Test-set predictions get their own variable so the training predictions are not overwritten
y_pred_test4 = rf_estimator_tuned.predict(X_test)
metrics_score(y_test, y_pred_test4)

In [None]:
# Feature importances of the tuned random forest, ordered from least to most important
importances = rf_estimator_tuned.feature_importances_
indices = np.argsort(importances)
feature_names = list(X.columns)

bar_positions = range(len(indices))
bar_labels = [feature_names[i] for i in indices]

plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(bar_positions, importances[indices], color='violet', align='center')
plt.yticks(bar_positions, bar_labels)
plt.xlabel('Relative Importance')
plt.show()
