## IMPORT

In [None]:
#for data analysis and visualization
import pandas as pd
import numpy as np
%matplotlib inline 

#for model creation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,  ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,  ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_selection import RFE
import warnings
from sklearn.exceptions import DataConversionWarning
from sklearn.linear_model import Lasso,Ridge,ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


In [None]:
# Load the published training and testing CSVs and stack them into one frame;
# the train/test split is redone further down with train_test_split.

data = "https://raw.githubusercontent.com/HaniehRJP/Final-Project/main/customer_conversion_traing_dataset%20.csv"
df = pd.read_csv(data)
df_test_link='https://raw.githubusercontent.com/HaniehRJP/Final-Project/main/customer_conversion_testing_dataset.csv'
df_test=pd.read_csv(df_test_link)
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both files keep their own row labels, so labels repeat and any label-based
# alignment (axis=1 concat, .loc) on the combined frame becomes ambiguous.
df=pd.concat([df, df_test], axis=0, ignore_index=True)

In [None]:
# Preview the first ten rows of the combined frame.
df.head(10)

## CLEANING

In [None]:
#NAs

def null_check(data_frame):
    """Print the count of missing values for every row, then for every column.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The counts are only printed.
    """
    missing = data_frame.isnull()
    per_row = missing.sum(axis=1)
    per_column = missing.sum()
    print(f'Total null values per row: \n{per_row}\n')
    print(f'Total null values per column: \n{per_column}\n')

null_check(df)

In [None]:
#Duplicates check

def dup_check(data_frame):
    """Print whether the frame contains fully duplicated rows, and how many.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The findings are only printed.
    """
    duplicate_mask = data_frame.duplicated()
    print(f'Duplicates found: {duplicate_mask.any()}\n')
    print(f'Number of duplicates: {duplicate_mask.sum()}\n')
    
dup_check(df)

In [None]:
# Column clean-up

# Shorter names for the time-spent column and the target column
df.rename(
    columns={
        "TimeSpent (minutes)": "TimeSpent",
        "Conversion (Target)": "Conversion",
    },
    inplace=True,
)

# Re-express the response time: the hours column multiplied by 60
df['ResponseTime'] = 60 * df['ResponseTime (hours)']

# Drop the superseded hours column together with the other columns this notebook discards
columns_to_drop = ['ResponseTime (hours)', 'Location', 'ReferralSource', 'LeadStatus', 'LeadID']
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Save the cleaned frame to df.csv in the working directory, without the index column.
df.to_csv('df.csv', index=False)

## ENCODING

In [None]:
# Columns with object or bool dtype are treated as categorical features.
categoricals = df.select_dtypes(include=[object, bool])
categoricals.head(3)

In [None]:
# All numeric columns; this selection is taken from df as a whole, so it includes
# the Conversion target if that column is numeric.
numericals = df.select_dtypes(include = 'number')
numericals.head(3)

In [None]:
# PaymentHistory is the only column handled as ordinal; shown here before encoding.
ordinals = categoricals[['PaymentHistory']]
ordinals.head(3)

In [None]:
# Encode PaymentHistory as 1 ('Good') / 0 ('No Payment').
# Series.map leaves NaN for any value that is not a key of the mapping.
payment_history_codes = {'Good': 1, 'No Payment': 0}
ordinals = categoricals['PaymentHistory'].map(payment_history_codes)

In [None]:
# Every categorical column other than PaymentHistory is treated as nominal.
nominals = categoricals.drop(columns=['PaymentHistory'])

In [None]:
# Preview the nominal columns before one-hot encoding.
nominals.head(3)

In [None]:
# making dummies from nominals: one-hot encode them as integer (0/1) indicator columns

nominals = pd.get_dummies(nominals, dtype=int)
nominals.head()

In [None]:
# Put the encoded ordinal column and the dummy columns side by side.
# This rebinds `categoricals` to the encoded version.
categoricals = pd.concat([ordinals, nominals], axis=1)
categoricals.head()

In [None]:
# Rebuild df from the encoded categoricals plus the numeric columns.
# This rebinds `df`: after this cell it no longer holds the un-encoded frame.
df = pd.concat([categoricals, numericals], axis=1)
df.head(3)

## TRAIN TEST SPLIT

In [None]:
#splitting into X and y
# X: every column except the target; y: the target kept as a one-column DataFrame
X=df.drop(columns=["Conversion"])
y=df[["Conversion"]]

In [None]:
#train and test split
# No test_size is passed, so train_test_split's default proportion applies;
# random_state=42 fixes the shuffle so the split is reproducible.
X_train, X_test,y_train, y_test=train_test_split(X,y, random_state=42)

## DOWNSAMPLING

In [None]:
# Re-join the training features and target so whole rows can be resampled together.
df_train=pd.concat([X_train, y_train], axis=1)
df_train

In [None]:
# Down-sampling, step 1: partition the training rows by target value
target = df_train['Conversion']
category_0 = df_train[target == 0]  # negative class (majority)
category_1 = df_train[target == 1]  # positive class (minority)

# Show the size of each partition
for subset in (category_0, category_1):
    print(subset['Conversion'].value_counts())

In [None]:
# Down-sample the negative class to a fixed number of rows.
# NOTE(review): the name c1_len suggests the size of class 1, but the value is a
# hard-coded 5000 — confirm this is the intended sample size.
c1_len=5000
# random_state pins the random draw so a Restart & Run All reproduces the same
# training set (42 matches the seed used for train_test_split).
category_0_down = category_0.sample(c1_len, random_state=42)
print(category_0_down.shape)
print(category_1.shape)

In [None]:
# reassemble the data: down-sampled negatives plus all positives
df_train = pd.concat([category_0_down, category_1], axis=0)
# shuffle the data: frac=1 returns every row in random order; random_state makes
# the order reproducible across runs
df_train = df_train.sample(frac=1, random_state=42)
df_train['Conversion'].value_counts()

In [None]:
# Display the rebalanced, shuffled training frame.
df_train

In [None]:
# Split the down-sampled training frame back into features and target
# (these replace the X_train / y_train produced by train_test_split).
X_train=df_train.drop(columns=["Conversion"])
y_train=df_train[["Conversion"]]

## UPSAMPLE USING SMOTE

In [None]:
# Upsampling using SMOTE
# random_state seeds the synthetic-sample generation so the resampled training
# set is identical on every run (42 matches the seed used for train_test_split).
smote = SMOTE(random_state=42)

In [None]:
# Resample the training data with SMOTE, then show the resulting class counts.
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
y_train_sm.value_counts()

## SCALING

In [None]:
# Scaling with a power transform.
# The transformer is fitted on the down-sampled training features (X_train, i.e.
# before SMOTE) and then applied to the SMOTE-resampled training features and to
# the held-out test features.
pt = PowerTransformer().fit(X_train)
X_train_pt = pt.transform(X_train_sm)
X_test_pt = pt.transform(X_test)

## LOGISTIC REGRESSION

In [None]:
# Fitting the baseline logistic regression

# Class 1 is weighted twice as heavily as class 0; adjust these weights to trade
# false negatives against false positives.
weights = {0: 1, 1: 2}
log_model = LogisticRegression(class_weight=weights)
log_model.fit(X_train_pt, y_train_sm)

# Predictions for the resampled training set and the held-out test set
y_pred_train = log_model.predict(X_train_pt)
y_pred_test = log_model.predict(X_test_pt)

# Accuracy / precision / recall, train vs. test
metric_functions = [accuracy_score, precision_score, recall_score]
performance_log1 = pd.DataFrame({
    'Error_metric': ['Accuracy', 'Precision', 'Recall'],
    'Train': [metric(y_train_sm, y_pred_train) for metric in metric_functions],
    'Test': [metric(y_test, y_pred_test) for metric in metric_functions],
})

display(performance_log1)

In [None]:
# Save the logistic-regression metrics under their own file name. Writing them to
# 'KNN.csv' would label them as KNN results, and the KNN section writes its own
# metrics to that same file, overwriting these.
performance_log1.to_csv('LogisticRegression.csv', index=False)

## TRAIN SET CONFUSION MATRIX

In [None]:
# Confusion matrix of the logistic regression on the resampled training data,
# with rows/columns ordered by the model's class labels.
cm_train = confusion_matrix(y_train_sm, y_pred_train, labels = log_model.classes_)
disp = ConfusionMatrixDisplay(cm_train)
disp.plot()

## TEST SET CONFUSION MATRIX

In [None]:
# Confusion matrix of the logistic regression on the test data.
# NOTE: the variable is still called cm_train here although it holds the
# test-set matrix (it replaces the training matrix of the same name).
cm_train = confusion_matrix(y_test, y_pred_test, labels = log_model.classes_)
disp = ConfusionMatrixDisplay(cm_train)
disp.plot()

## Feature selection

In [None]:
# Recursive feature elimination (RFE) sweep: for each candidate feature count,
# select that many features with a class-weighted logistic regression, refit a
# model on the selected features and score it on the test set.
# NOTE(review): the feature count is compared on test-set metrics, so the test
# set influences model selection — consider a validation split instead.

# Filtering out DataConversionWarning (presumably raised because the target is
# passed as a one-column DataFrame rather than a 1-D array).
warnings.filterwarnings("ignore", category=DataConversionWarning)

accuracy = {}   # number of selected features -> test accuracy
precision = {}  # number of selected features -> test precision
recall = {}     # number of selected features -> test recall

weights = {0: 1, 1: 2}

# Iterating over 10 up to and including 20 selected features
for num_features in range(10, 21):

    # Dedicated estimator objects are used inside the sweep so that the baseline
    # `log_model` and its `y_pred_test` are not replaced by the result of the
    # last iteration; other cells of the notebook read those names.
    selector = RFE(LogisticRegression(class_weight=weights),
                   n_features_to_select=num_features, step=1, verbose=1)
    selector.fit(X_train_pt, y_train_sm)

    # Keeping only the selected columns
    X_train_RFE = selector.transform(X_train_pt)
    X_test_RFE = selector.transform(X_test_pt)

    # Refitting on the reduced feature set and predicting the test set
    rfe_model = LogisticRegression(class_weight=weights)
    rfe_model.fit(X_train_RFE, y_train_sm)
    y_pred_rfe = rfe_model.predict(X_test_RFE)

    # Storing the test metrics for this number of selected features
    accuracy[num_features] = accuracy_score(y_test, y_pred_rfe)
    precision[num_features] = precision_score(y_test, y_pred_rfe)
    recall[num_features] = recall_score(y_test, y_pred_rfe)

# Printing the test metrics for each number of selected features
for num_features, acc in accuracy.items():
    prec = precision[num_features]
    rec = recall[num_features]
    print(f"Number of Features: {num_features}, Accuracy: {acc}, Precision: {prec}, Recall: {rec}")

### best decision according to RFE is keeping all 20 features
##### Accuracy: 0.750642102926721, Precision: 0.05739381542533991, Recall: 0.9408284023668639

## LASSO, RIDGE, ELASTICNET

- LASSO

In [None]:

# Logistic regression with an L1 (lasso) penalty, reusing the class weights
# held in `weights`
log_model_lasso = LogisticRegression(penalty='l1', solver='liblinear', class_weight=weights)
log_model_lasso.fit(X_train_pt, y_train_sm)

# Test-set predictions
y_pred_lasso = log_model_lasso.predict(X_test_pt)

# Test-set metrics; these names are rebound to plain scores here
accuracy, precision, recall = (
    score(y_test, y_pred_lasso)
    for score in (accuracy_score, precision_score, recall_score)
)

# Report
for label, value in (("Accuracy:", accuracy), ("Precision:", precision), ("Recall:", recall)):
    print(label, value)

- RIDGE

In [None]:

# Logistic regression with an L2 (ridge) penalty, reusing the class weights
# held in `weights`
log_model_ridge = LogisticRegression(penalty='l2', solver='liblinear', class_weight=weights)
log_model_ridge.fit(X_train_pt, y_train_sm)

# Test-set predictions
y_pred_ridge = log_model_ridge.predict(X_test_pt)

# Test-set metrics (overwrite the values from the previous model)
accuracy, precision, recall = (
    score(y_test, y_pred_ridge)
    for score in (accuracy_score, precision_score, recall_score)
)

# Report
for label, value in (("Accuracy:", accuracy), ("Precision:", precision), ("Recall:", recall)):
    print(label, value)

- ELASTICNET

In [None]:

# Logistic regression with an elastic-net penalty (l1_ratio=0.5, saga solver),
# reusing the class weights held in `weights`
log_model_elasticnet = LogisticRegression(penalty='elasticnet', solver='saga',l1_ratio=0.5, class_weight=weights)
log_model_elasticnet.fit(X_train_pt, y_train_sm)

# Test-set predictions
y_pred_elasticnet = log_model_elasticnet.predict(X_test_pt)

# Test-set metrics (overwrite the values from the previous model)
accuracy, precision, recall = (
    score(y_test, y_pred_elasticnet)
    for score in (accuracy_score, precision_score, recall_score)
)

# Report
for label, value in (("Accuracy:", accuracy), ("Precision:", precision), ("Recall:", recall)):
    print(label, value)

## Fit a KNN Classifier model 

In [None]:
# Instantiating KNN Classifier model

# Base estimator with default hyperparameters; it is the estimator handed to the grid search.
model = KNeighborsClassifier()

### Optimizing the Weighting Scheme for K-Nearest Neighbors (KNN)

In [None]:
# Defining a custom weight function

def custom_weight_function(distances):
    """Turn neighbour distances into vote weights for KNeighborsClassifier.

    A neighbour at distance exactly 0 gets weight 5; every other neighbour gets
    weight 0.5. The weighting depends only on the distance values; class labels
    are not available to this function.

    Parameters
    ----------
    distances : array-like
        Distances to the neighbours.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``distances`` holding the weights.
    """
    return np.where(np.equal(distances, 0), 5, 0.5)

### Implementing Grid Searching for Number of Neighbors Optimization in KNN

In [None]:
# Grid search over the number of neighbours and the weighting scheme, keeping
# the combination with the best cross-validated recall.

param_grid = {
    'n_neighbors': list(range(2, 21)),  # Range from 2 to 20
    'weights': [custom_weight_function, 'uniform', 'distance']
}

# Create the GridSearchCV object: 5-fold cross-validation scored on recall
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='recall')

# Run the search on the power-transformed, SMOTE-resampled training data
grid_search.fit(X_train_pt, y_train_sm)

# Print the best parameters
print("Best parameters:", grid_search.best_params_)

# Get the best model: the KNeighborsClassifier carrying the best hyperparameters
best_model = grid_search.best_estimator_

# Predict on the power-transformed test features. The model was trained on
# X_train_pt, so feeding it the raw X_test would put the test points on a
# different scale than the training points it measures distances against.
y_pred = best_model.predict(X_test_pt)


# Fit the base `model` instance as well (default hyperparameters)
model.fit(X_train_pt, y_train_sm)

In [None]:
# Instantiate KNN Classifier model with fixed hyperparameters
# NOTE(review): presumably n_neighbors=19 / weights='distance' are the best
# parameters reported by the grid search — confirm, since they are hard-coded here.
model = KNeighborsClassifier(n_neighbors=19,weights='distance')

# Fit the model on the power-transformed, SMOTE-resampled training data
model.fit(X_train_pt, y_train_sm)


### Predicting on the test and train set

In [None]:

# Evaluate the KNN model on the training and test data.
# The classifier was fitted on power-transformed features, so it has to predict
# on the transformed test set (X_test_pt) rather than the raw X_test.
y_pred = model.predict(X_test_pt) # predict test
y_pred_train=model.predict(X_train_pt) # predict train (for sanity checks)

# Evaluate performance. The Test column uses this model's own predictions
# (y_pred); y_pred_test holds logistic-regression output and must not be used here.
performance_log = pd.DataFrame({'Error_metric': ['Accuracy','Precision','Recall'],
                               'Train': [accuracy_score(y_train_sm, y_pred_train),
                                         precision_score(y_train_sm, y_pred_train),
                                         recall_score(y_train_sm, y_pred_train)],
                               'Test': [accuracy_score(y_test, y_pred),
                                        precision_score(y_test, y_pred),
                                        recall_score(y_test, y_pred)]})

display(performance_log)

In [None]:
# Save the KNN metrics table to KNN.csv, without the index column.
performance_log.to_csv('KNN.csv', index=False)

### TRAIN SET CONFUSION MATRIX

In [None]:
# Confusion matrix on the resampled training data, using the y_pred_train
# computed in the KNN prediction cell.
cm = confusion_matrix(y_train_sm, y_pred_train, labels= model.classes_)
disp = ConfusionMatrixDisplay(cm)
disp.plot()

### TEST SET CONFUSION MATRIX

In [None]:
# Test-set confusion matrix for the KNN model. y_pred holds the KNN test
# predictions; y_pred_test comes from the logistic-regression cells and would
# show the wrong model here.
cm = confusion_matrix(y_test, y_pred, labels = model.classes_)
disp = ConfusionMatrixDisplay(cm)
disp.plot()

## RandomForest Model

In [None]:
# for reproducible shuffling
RAND_STATE = 42

# test/train
# NOTE(review): TT_RATIO is not referenced anywhere in the visible notebook —
# confirm whether it is still needed.
TT_RATIO = 0.25 

In [None]:

# Baseline random forest: shallow trees, bootstrap sampling with out-of-bag scoring
rfc_ops = dict(
    max_depth=6,
    min_samples_leaf=20,
    n_estimators=100,
    bootstrap=True,
    oob_score=True,
    random_state=RAND_STATE,
)

# Class 1 weighted twice as heavily as class 0; adjust as needed
class_weights = {0: 1, 1: 2}
clf = RandomForestClassifier(class_weight=class_weights, **rfc_ops)

# The forest is trained and scored on unscaled features (SMOTE-resampled train, raw test)
clf.fit(X_train_sm, y_train_sm)
train_accuracy = clf.score(X_train_sm, y_train_sm)
test_accuracy = clf.score(X_test, y_test)
print("train prediction accuracy score: %.2f" % train_accuracy)
print("test prediction accuracy score: %.2f" % test_accuracy)

In [None]:
# Utilizing Out-of-Bag Score for Performance Evaluation in RandomForestClassifier
# Out-of-bag score of the fitted forest (available because oob_score=True was set).
clf.oob_score_

In [None]:
#calculating the accuracy score of the model on the test set
# (X_test is used unscaled here, consistent with how clf was fitted on X_train_sm)
score_ds = accuracy_score(y_test,clf.predict(X_test))
score_ds

### Investigating Hyperparameter Tuning with Grid Search and Evaluating Results

In [None]:
# Creating the GridSearchCV object: every combination of the values below is
# evaluated with 5-fold cross-validation, using all available cores (n_jobs=-1).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': list(range(2, 12)), 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# NOTE(review): no `scoring` is passed, so the estimator's default score is
# used, whereas the KNN grid search in this notebook optimises recall — confirm
# this difference is intended.
grid_search = GridSearchCV(clf, param_grid, cv=5,return_train_score=True,n_jobs=-1)

# Fitting the search on the SMOTE-resampled (unscaled) training data
grid_search.fit(X_train_sm,y_train_sm)

# Printing the best parameters
print("Best parameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# predicting on the (unscaled) test set
y_pred = best_model.predict(X_test)


# Refit the original clf, which still carries its pre-search hyperparameters
clf.fit(X_train_sm, y_train_sm)

### Fitting a RandomForest Classifier with Optimized Max Depth

In [None]:
# Random forest with fixed hyperparameters (deeper trees, no leaf-size restriction)
rfc_ops = dict(
    max_depth=11,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    oob_score=True,
    random_state=42,
)

# Here class 0 carries the larger weight (2.5 vs 1); adjust as needed
class_weights = {0: 2.5, 1: 1}
clf = RandomForestClassifier(class_weight=class_weights, **rfc_ops)

# Train on the SMOTE-resampled training data, then report train and test accuracy
clf.fit(X_train_sm, y_train_sm)
train_accuracy = clf.score(X_train_sm, y_train_sm)
test_accuracy = clf.score(X_test, y_test)
print("train prediction accuracy score: %.2f" % train_accuracy)
print("test prediction accuracy score: %.2f" % test_accuracy)


### Predicting on the test and train set

In [None]:
# Random forest predictions for the test set and, as a sanity check, the training set
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train_sm)

# Accuracy / precision / recall, train vs. test
metric_functions = [accuracy_score, precision_score, recall_score]
performance_log2 = pd.DataFrame({
    'Error_metric': ['Accuracy', 'Precision', 'Recall'],
    'Train': [metric(y_train_sm, y_pred_train) for metric in metric_functions],
    'Test': [metric(y_test, y_pred) for metric in metric_functions],
})

display(performance_log2)

In [None]:
# Save the random-forest metrics table to RandomForest.csv, without the index column.
performance_log2.to_csv('RandomForest.csv', index=False)

### Visualizing Accuracy and Recall Performance Based on Max Depth

In [None]:
# Test-set confusion matrix for the random forest, built from the forest's own
# test predictions (y_pred). y_pred_test comes from the logistic-regression
# cells and would show the wrong model here.
cm = confusion_matrix(y_test, y_pred, labels = clf.classes_)
disp = ConfusionMatrixDisplay(cm)
disp.plot()