In [19]:
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression

In [2]:
#Import sales data
adidas_sales_df = pd.read_csv('../adidas_sales.csv')
adidas_sales_df.head()

Unnamed: 0,Retailer_ID,Invoice_Date,Region_ID,State_ID,Product_ID,Price_per_unit,Units_sold,Operating_margin,Sales_method,Total_sales,Operating_profit
0,RTL1,2020-01-01,RG1,ST1,PD1,50.0,1200,0.5,In-store,60000.0,30000.0
1,RTL1,2020-01-02,RG1,ST1,PD2,50.0,1000,0.3,In-store,50000.0,15000.0
2,RTL1,2020-01-03,RG1,ST1,PD3,40.0,1000,0.35,In-store,40000.0,14000.0
3,RTL1,2020-01-04,RG1,ST1,PD4,45.0,850,0.35,In-store,38250.0,13387.5
4,RTL1,2020-01-05,RG1,ST1,PD5,60.0,900,0.3,In-store,54000.0,16200.0


In [3]:
adidas_sales_df.dtypes

Retailer_ID          object
Invoice_Date         object
Region_ID            object
State_ID             object
Product_ID           object
Price_per_unit      float64
Units_sold            int64
Operating_margin    float64
Sales_method         object
Total_sales         float64
Operating_profit    float64
dtype: object

In [6]:
#Convert date to datetime
adidas_sales_df['Invoice_Date'] = pd.to_datetime(adidas_sales_df['Invoice_Date'])

In [7]:
adidas_sales_df['Operating_margin'].value_counts()

0.35    1309
0.40    1003
0.30     722
0.50     610
0.45     364
        ... 
0.48       1
0.27       1
0.17       1
0.65       1
0.17       1
Name: Operating_margin, Length: 110, dtype: int64

In [8]:
#Add binary column for operating margin above 0.35
adidas_sales_df['Operating Margin Equal to or Above 0.35'] = adidas_sales_df['Operating_margin'] >= 0.35

#Convert boolean to int
adidas_sales_df['Operating Margin Equal to or Above 0.35'] = adidas_sales_df['Operating Margin Equal to or Above 0.35'].astype(int)

adidas_sales_df.head()

Unnamed: 0,Retailer_ID,Invoice_Date,Region_ID,State_ID,Product_ID,Price_per_unit,Units_sold,Operating_margin,Sales_method,Total_sales,Operating_profit,Operating Margin Equal to or Above 0.35
0,RTL1,2020-01-01,RG1,ST1,PD1,50.0,1200,0.5,In-store,60000.0,30000.0,1
1,RTL1,2020-01-02,RG1,ST1,PD2,50.0,1000,0.3,In-store,50000.0,15000.0,0
2,RTL1,2020-01-03,RG1,ST1,PD3,40.0,1000,0.35,In-store,40000.0,14000.0,1
3,RTL1,2020-01-04,RG1,ST1,PD4,45.0,850,0.35,In-store,38250.0,13387.5,1
4,RTL1,2020-01-05,RG1,ST1,PD5,60.0,900,0.3,In-store,54000.0,16200.0,0


In [9]:
#Define feature set
X = adidas_sales_df.drop(['Operating_margin', 'Operating Margin Equal to or Above 0.35'], axis=1)

#Define target set
y = adidas_sales_df['Operating Margin Equal to or Above 0.35']

In [10]:
#Get dummy variables for categorical data
X = pd.get_dummies(X)

In [11]:
X.columns

Index(['Invoice_Date', 'Price_per_unit', 'Units_sold', 'Total_sales',
       'Operating_profit', 'Retailer_ID_RTL1', 'Retailer_ID_RTL2',
       'Retailer_ID_RTL3', 'Retailer_ID_RTL4', 'Retailer_ID_RTL5',
       'Retailer_ID_RTL6', 'Region_ID_RG1', 'Region_ID_RG2', 'Region_ID_RG3',
       'Region_ID_RG4', 'Region_ID_RG5', 'State_ID_ST1', 'State_ID_ST10',
       'State_ID_ST11', 'State_ID_ST12', 'State_ID_ST13', 'State_ID_ST14',
       'State_ID_ST15', 'State_ID_ST16', 'State_ID_ST17', 'State_ID_ST18',
       'State_ID_ST19', 'State_ID_ST2', 'State_ID_ST20', 'State_ID_ST21',
       'State_ID_ST22', 'State_ID_ST23', 'State_ID_ST24', 'State_ID_ST25',
       'State_ID_ST26', 'State_ID_ST27', 'State_ID_ST28', 'State_ID_ST29',
       'State_ID_ST3', 'State_ID_ST30', 'State_ID_ST31', 'State_ID_ST32',
       'State_ID_ST33', 'State_ID_ST34', 'State_ID_ST35', 'State_ID_ST36',
       'State_ID_ST37', 'State_ID_ST38', 'State_ID_ST39', 'State_ID_ST4',
       'State_ID_ST40', 'State_ID_ST41', 'State

In [12]:
#Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [14]:
#Create a StandardScaler instance
scaler = StandardScaler()

#Fit the StandardScaler with the training data
X_scaler = scaler.fit(X_train.drop(['Invoice_Date'], axis=1))

#Scale the data
X_train_scaled = X_scaler.transform(X_train.drop(['Invoice_Date'], axis=1))
X_test_scaled = X_scaler.transform(X_test.drop(['Invoice_Date'], axis=1))

# Decision Tree Regression

In [15]:
#Create decision tree classifier instance
dtr = tree.DecisionTreeClassifier()

#Fit the model
dtr = dtr.fit(X_train_scaled, y_train)

#Make predictions
dtr_predictions = dtr.predict(X_test_scaled)

In [16]:
#Calculate the confusion matrix
dtr_cm = confusion_matrix(y_test, dtr_predictions)

#Create a DataFrame from the confusion matrix
dtr_cm_df = pd.DataFrame(dtr_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

dtr_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,340,55
Actual 1,52,1965


In [17]:
#Calculate the accuracy score
dtr_acc_score = accuracy_score(y_test, dtr_predictions)
dtr_acc_score

0.9556384742951907

In [18]:
#Calculate the classification report
print(classification_report(y_test, dtr_predictions))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86       395
           1       0.97      0.97      0.97      2017

    accuracy                           0.96      2412
   macro avg       0.92      0.92      0.92      2412
weighted avg       0.96      0.96      0.96      2412



## Optimizing the Decision Tree Regression model using Grid Search

In [20]:
#Find optimal parameters using GridSearchCV
dtr_grid = {'criterion': ['gini', 'entropy'],
            'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
            'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]}

#Create the GridSearchCV object
CV_dtr = GridSearchCV(estimator=dtr, param_grid=dtr_grid, cv=5, n_jobs=4)
CV_dtr.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=4,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16, 18,
                                               20]})

In [21]:
#Show the optimal parameters
CV_dtr.best_params_

{'criterion': 'entropy',
 'max_depth': 20,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [75]:
#Use the past accuracy score to find the best parameters
opt_dtr_acc_score_loop = 0.9556

#For loop to find the best parameters
for c in dtr_grid['criterion']:
    for d in dtr_grid['max_depth']:
        for msl in dtr_grid['min_samples_leaf']:
            for mss in dtr_grid['min_samples_split']:
                dtr = tree.DecisionTreeClassifier(criterion=c, max_depth=d, min_samples_leaf=msl, min_samples_split=mss)
                dtr = dtr.fit(X_train_scaled, y_train)
                dtr_predictions = dtr.predict(X_test_scaled)
                dtr_acc_score_in_loop = accuracy_score(y_test, dtr_predictions)
                
                #If the accuracy score is better than the previous one, save the parameters
                if dtr_acc_score_in_loop > opt_dtr_acc_score_loop:
                    print(f'Criterion: {c}, Max Depth: {d}, Min Samples Leaf: {msl}, Min Samples Split: {mss}, Accuracy Score: {dtr_acc_score_in_loop}')

Criterion: entropy, Max Depth: 20, Min Samples Leaf: 1, Min Samples Split: 2, Accuracy Score: 0.9589552238805971
Criterion: entropy, Max Depth: 20, Min Samples Leaf: 1, Min Samples Split: 4, Accuracy Score: 0.9564676616915423
Criterion: entropy, Max Depth: 20, Min Samples Leaf: 1, Min Samples Split: 6, Accuracy Score: 0.9564676616915423
Criterion: entropy, Max Depth: 20, Min Samples Leaf: 1, Min Samples Split: 8, Accuracy Score: 0.9564676616915423
Criterion: entropy, Max Depth: 20, Min Samples Leaf: 2, Min Samples Split: 2, Accuracy Score: 0.9601990049751243
Criterion: entropy, Max Depth: 20, Min Samples Leaf: 2, Min Samples Split: 4, Accuracy Score: 0.9581260364842454
Criterion: entropy, Max Depth: 20, Min Samples Leaf: 2, Min Samples Split: 6, Accuracy Score: 0.9577114427860697
Criterion: entropy, Max Depth: 20, Min Samples Leaf: 2, Min Samples Split: 8, Accuracy Score: 0.956882255389718
Criterion: entropy, Max Depth: 20, Min Samples Leaf: 2, Min Samples Split: 10, Accuracy Score: 0.

In [22]:
#For loop accuracy score was not better than the GridSearchCV accuracy score

#Create decision tree classifier instance
opt_dtr = tree.DecisionTreeClassifier(criterion='entropy', max_depth=20, min_samples_leaf=1, min_samples_split=2)

#Fit the model
opt_dtr = opt_dtr.fit(X_train_scaled, y_train)

#Make predictions
opt_dtr_predictions = opt_dtr.predict(X_test_scaled)

In [30]:
#Calculate the confusion matrix
opt_dtr_cm = confusion_matrix(y_test, opt_dtr_predictions)

#Create a DataFrame from the confusion matrix
opt_dtr_cm_df = pd.DataFrame(opt_dtr_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

print(f'Optimized DTR Confusion Matrix:\n', opt_dtr_cm_df)
print(f'Original DTR Confusion Matrix:\n', dtr_cm_df)

Optimized DTR Confusion Matrix:
           Predicted 0  Predicted 1
Actual 0          334           61
Actual 1           39         1978
Original DTR Confusion Matrix:
           Predicted 0  Predicted 1
Actual 0          340           55
Actual 1           52         1965


In [28]:
#Calculate the accuracy score
opt_dtr_acc_score = accuracy_score(y_test, opt_dtr_predictions)

print(f'Optimized DTR Accuracy Score: {opt_dtr_acc_score}')
print(f'Original DTR Accuracy Score: {dtr_acc_score}')

Optimized DTR Accuracy Score: 0.9585406301824212
Original DTR Accuracy Score: 0.9556384742951907


In [27]:
#Calculate the classification report
print(f'Optimized DTR Report:\n', classification_report(y_test, opt_dtr_predictions))
print(f'Original DTR Report:\n', classification_report(y_test, dtr_predictions))

Optimized DTR Report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.87       395
           1       0.97      0.98      0.98      2017

    accuracy                           0.96      2412
   macro avg       0.93      0.91      0.92      2412
weighted avg       0.96      0.96      0.96      2412

Original DTR Report:
               precision    recall  f1-score   support

           0       0.87      0.86      0.86       395
           1       0.97      0.97      0.97      2017

    accuracy                           0.96      2412
   macro avg       0.92      0.92      0.92      2412
weighted avg       0.96      0.96      0.96      2412



# Random Forest Regression

In [31]:
#Create a random forest classifier
rfr = RandomForestClassifier(n_estimators=128, random_state=78)

In [32]:
#Fit the model
rfr = rfr.fit(X_train_scaled, y_train)

In [33]:
#Make predictions
rfr_predictions = rfr.predict(X_test_scaled)

In [34]:
#Calculate the confusion matrix
rfr_cm = confusion_matrix(y_test, rfr_predictions)

#Create a DataFrame from the confusion matrix
rfr_cm_df = pd.DataFrame(rfr_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

rfr_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,328,67
Actual 1,24,1993


In [35]:
#Calculate the accuracy score
rfr_acc_score = accuracy_score(y_test, rfr_predictions)
rfr_acc_score

0.9622719734660033

In [36]:
#Calculate the classification report
print(classification_report(y_test, rfr_predictions))

              precision    recall  f1-score   support

           0       0.93      0.83      0.88       395
           1       0.97      0.99      0.98      2017

    accuracy                           0.96      2412
   macro avg       0.95      0.91      0.93      2412
weighted avg       0.96      0.96      0.96      2412



In [37]:
#Calculate feature importance in the Random Forest model
importances = rfr.feature_importances_
sorted(zip(rfr.feature_importances_, X.columns), reverse=True)

[(0.15182349810194812, 'Total_sales'),
 (0.1166484709025738, 'Units_sold'),
 (0.09271771142109246, 'Price_per_unit'),
 (0.08824188673452729, 'Invoice_Date'),
 (0.052018284757655485, 'Sales_method_In-store'),
 (0.035489596532471315, 'Product_ID_PD5'),
 (0.025105200956083863, 'Product_ID_PD4'),
 (0.024887412679273607, 'Product_ID_PD6'),
 (0.02407410711942072, 'Product_ID_PD1'),
 (0.020070531804078546, 'Sales_method_Online'),
 (0.01990324474205491, 'Region_ID_RG2'),
 (0.019737091458378354, 'State_ID_ST47'),
 (0.018166818963153072, 'State_ID_ST9'),
 (0.017907019936051805, 'Product_ID_PD3'),
 (0.01789342356537714, 'Product_ID_PD2'),
 (0.016444274771791206, 'Region_ID_RG1'),
 (0.01494218140624257, 'Retailer_ID_RTL2'),
 (0.014774038888097845, 'State_ID_ST7'),
 (0.014142105923244405, 'State_ID_ST16'),
 (0.013473564653812678, 'Operating_profit'),
 (0.011430618380322914, 'Retailer_ID_RTL6'),
 (0.010412816425545948, 'Retailer_ID_RTL3'),
 (0.010373597200210997, 'State_ID_ST8'),
 (0.010178674670904

## Optimizing the Random Forest Regression model using Grid Search

In [41]:
#Find optimal parameters using GridSearchCV
rfr_grid = {'criterion': ['gini', 'entropy'],
            'n_estimators': [100, 300, 500, 700],
            'max_depth': [2, 6, 10, 14, 18],
            'min_samples_leaf': [1, 3, 5, 7, 9],
            'min_samples_split': [2, 6, 10, 14],
            'max_features': ['auto', 'sqrt', 'log2']}

#Create the GridSearchCV object
CV_rfr = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rfr_grid, cv=5, n_jobs=4)
CV_rfr.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 6, 10, 14, 18],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 3, 5, 7, 9],
                         'min_samples_split': [2, 6, 10, 14],
                         'n_estimators': [100, 300, 500, 700]})

In [42]:
#Show the optimal parameters
CV_rfr.best_params_

{'criterion': 'gini',
 'max_depth': 18,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 700}

In [78]:
#Use the past accuracy score to find the best parameters
rfr_acc_score_loop = 0.9622

#For loop to find the best parameters
for c in rfr_grid['criterion']:
    for n in rfr_grid['n_estimators']:
        for d in rfr_grid['max_depth']:
            for msl in rfr_grid['min_samples_leaf']:
                for mss in rfr_grid['min_samples_split']:
                    for mf in rfr_grid['max_features']:
                        rfr = RandomForestClassifier(criterion=c, n_estimators=n, max_depth=d, min_samples_leaf=msl, min_samples_split=mss, max_features=mf)
                        rfr = rfr.fit(X_train_scaled, y_train)
                        rfr_predictions = rfr.predict(X_test_scaled)
                        rfr_acc_score_in_loop = accuracy_score(y_test, rfr_predictions)
                
                        #If the accuracy score is better than the previous one, save the parameters
                        if rfr_acc_score_in_loop > rfr_acc_score_loop:
                            rfr_acc_score_loop = rfr_acc_score_in_loop
                            print(f'Criterion: {c}, N Estimators: {n}, Max Depth: {d}, Min Samples Leaf: {msl}, Min Samples Split: {mss}, Max Features: {mf}, Accuracy Score: {rfr_acc_score_in_loop}')

In [44]:
#Somehow neither the for loop nor the GridSeachCV were able to produce a better accuracy score than the original model

#Create random forest classifier instance
opt_rfr = RandomForestClassifier(criterion='gini', max_depth=18, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=700)

#Fit the model
opt_rfr = opt_rfr.fit(X_train_scaled, y_train)

#Make predictions
opt_rfr_predictions = opt_rfr.predict(X_test_scaled)

In [45]:
#Calculate the confusion matrix
opt_rfr_cm = confusion_matrix(y_test, opt_rfr_predictions)

#Create a DataFrame from the confusion matrix
opt_rfr_cm_df = pd.DataFrame(opt_rfr_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

print(f'Optimized RFR Confusion Matrix:\n', opt_rfr_cm_df)
print(f'Original RFR Confusion Matrix:\n', rfr_cm_df)

Optimized RFR Confusion Matrix:
           Predicted 0  Predicted 1
Actual 0          311           84
Actual 1           17         2000
Original RFR Confusion Matrix:
           Predicted 0  Predicted 1
Actual 0          328           67
Actual 1           24         1993


In [46]:
#Calculate the accuracy score
opt_rfr_acc_score = accuracy_score(y_test, opt_rfr_predictions)

print(f'Optimized RFR Accuracy Score: {opt_rfr_acc_score}')
print(f'Original RFR Accuracy Score: {rfr_acc_score}')

Optimized RFR Accuracy Score: 0.9581260364842454
Original RFR Accuracy Score: 0.9622719734660033


In [47]:
#Calculate the classification report
print(f'Optimized RFR Report:\n', classification_report(y_test, opt_rfr_predictions))
print(f'Original RFR Report:\n', classification_report(y_test, rfr_predictions))

Optimized RFR Report:
               precision    recall  f1-score   support

           0       0.95      0.79      0.86       395
           1       0.96      0.99      0.98      2017

    accuracy                           0.96      2412
   macro avg       0.95      0.89      0.92      2412
weighted avg       0.96      0.96      0.96      2412

Original RFR Report:
               precision    recall  f1-score   support

           0       0.93      0.83      0.88       395
           1       0.97      0.99      0.98      2017

    accuracy                           0.96      2412
   macro avg       0.95      0.91      0.93      2412
weighted avg       0.96      0.96      0.96      2412



# Gradient Boosting Regression

In [48]:
#Use best learning rate to create model
gbr = GradientBoostingClassifier()

#Fit the model
gbr.fit(X_train_scaled, y_train)

#Make predictions
gbr_predictions = gbr.predict(X_test_scaled)

In [49]:
#Calculate the confusion matrix
gbr_cm = confusion_matrix(y_test, gbr_predictions)

#Create a DataFrame from the confusion matrix
gbr_cm_df = pd.DataFrame(gbr_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

gbr_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,309,86
Actual 1,12,2005


In [50]:
#Calculate the accuracy score
gb_acc_score = accuracy_score(y_test, gbr_predictions)
gb_acc_score

0.9593698175787728

In [51]:
#Calculate the classification report
print(classification_report(y_test, gbr_predictions))

              precision    recall  f1-score   support

           0       0.96      0.78      0.86       395
           1       0.96      0.99      0.98      2017

    accuracy                           0.96      2412
   macro avg       0.96      0.89      0.92      2412
weighted avg       0.96      0.96      0.96      2412



In [52]:
#Calculate feature importance in the Gradient Boosting model
gb_importances = gbr.feature_importances_
sorted(zip(gbr.feature_importances_, X.columns), reverse=True)

[(0.3200396670255088, 'Total_sales'),
 (0.2659822800880565, 'Units_sold'),
 (0.1454631082923365, 'Sales_method_In-store'),
 (0.03755674475518907, 'Region_ID_RG1'),
 (0.03393870838408824, 'Product_ID_PD5'),
 (0.03281949929892853, 'Invoice_Date'),
 (0.031820044650584474, 'State_ID_ST47'),
 (0.015920662506392222, 'State_ID_ST7'),
 (0.015360755037334322, 'Region_ID_RG2'),
 (0.015237791515311088, 'State_ID_ST9'),
 (0.00830339475938698, 'State_ID_ST16'),
 (0.00771445357565799, 'State_ID_ST36'),
 (0.0071205628629160585, 'Price_per_unit'),
 (0.006065454163380283, 'Retailer_ID_RTL1'),
 (0.006058305505380121, 'State_ID_ST8'),
 (0.005438930642948589, 'State_ID_ST29'),
 (0.005144186363869738, 'Product_ID_PD4'),
 (0.0035955057973683082, 'Product_ID_PD6'),
 (0.003528838419320471, 'State_ID_ST21'),
 (0.00349760689487835, 'State_ID_ST48'),
 (0.003397811685745328, 'Retailer_ID_RTL2'),
 (0.002959050691623667, 'State_ID_ST27'),
 (0.00283096923834162, 'State_ID_ST22'),
 (0.002773269107873658, 'Retailer_ID

## Optimizing the Gradient Boosting Regression model using Grid Search

In [None]:
#I have tried to use GridSearchCV three times with different parameters and it has taken far too long to run. I have commented out the code below.

In [55]:
"""
#Find optimal parameters using GridSearchCV
gbr_grid = {'n_estimators': [100, 300, 500, 700],
            'max_depth': [2, 6, 10, 14],
            'min_samples_leaf': [1, 3, 5, 7],
            'min_samples_split': [2, 6, 10, 14],
            'max_features': ['auto', 'sqrt', 'log2'],
            'learning_rate': [0.01, 0.1, 0.5, 0.75]}

#Create the GridSearchCV object
CV_gbr = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=gbr_grid, cv=5, n_jobs=4)
CV_gbr.fit(X_train_scaled, y_train)
"""

KeyboardInterrupt: 

In [56]:
#Use the past accuracy score to find the best parameters
gbr_acc_score_loop = 0.9593

#Use for loop to find optimal parameters
for n in [100, 300, 500, 700]:
    for d in [2, 6, 10, 14]:
        for l in [0.01, 0.1, 0.5, 0.75]:
            for m in [1, 3, 5, 7]:
                for s in [2, 6, 10, 14]:
                    for f in ['auto', 'sqrt', 'log2']:
                        gbr = GradientBoostingClassifier(n_estimators=n, max_depth=d, learning_rate=l, min_samples_leaf=m, min_samples_split=s, max_features=f)
                        gbr.fit(X_train_scaled, y_train)
                        gbr_predictions = gbr.predict(X_test_scaled)
                        gb_acc_score_in_loop = accuracy_score(y_test, gbr_predictions)
                        
                        #If the accuracy score is better than the previous one, print the parameters and accuracy score
                        if gb_acc_score_in_loop > gbr_acc_score_loop:
                            gbr_acc_score_loop = gb_acc_score_in_loop
                            print(f'n_estimators: {n}, max_depth: {d}, learning_rate: {l}, min_samples_leaf: {m}, min_samples_split: {s}, max_features: {f}')
                            print(f'Accuracy Score: {gb_acc_score_in_loop}')

n_estimators: 100, max_depth: 2, learning_rate: 0.5, min_samples_leaf: 1, min_samples_split: 2, max_features: auto
Accuracy Score: 0.9825870646766169
n_estimators: 100, max_depth: 2, learning_rate: 0.5, min_samples_leaf: 3, min_samples_split: 2, max_features: auto
Accuracy Score: 0.9871475953565506
n_estimators: 100, max_depth: 2, learning_rate: 0.75, min_samples_leaf: 1, min_samples_split: 2, max_features: auto
Accuracy Score: 0.9888059701492538
n_estimators: 100, max_depth: 6, learning_rate: 0.5, min_samples_leaf: 1, min_samples_split: 2, max_features: auto
Accuracy Score: 0.9917081260364843
n_estimators: 100, max_depth: 10, learning_rate: 0.5, min_samples_leaf: 1, min_samples_split: 6, max_features: auto
Accuracy Score: 0.99212271973466
n_estimators: 100, max_depth: 10, learning_rate: 0.5, min_samples_leaf: 1, min_samples_split: 10, max_features: auto
Accuracy Score: 0.9925373134328358


KeyboardInterrupt: 

In [57]:
#I ran the above for loop for 10 minutes, while it did not yet even get to the second parameter in the first for loop,
#it has already achieved 0.992 accuracy score, which is more than good enough for the purpose of this project

In [58]:
#Create Gradient Boosting Classifier instance
opt_gbr = GradientBoostingClassifier(learning_rate=0.5, max_depth=10, max_features='auto', min_samples_leaf=1, min_samples_split=10, n_estimators=100)

#Fit the model
opt_gbr = opt_gbr.fit(X_train_scaled, y_train)

#Make predictions
opt_gbr_predictions = opt_gbr.predict(X_test_scaled)

In [59]:
#Calculate the confusion matrix
opt_gbr_cm = confusion_matrix(y_test, opt_gbr_predictions)

#Create a DataFrame from the confusion matrix
opt_gbr_cm_df = pd.DataFrame(opt_gbr_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

print(f'Optimized GBR Confusion Matrix:\n', opt_gbr_cm_df)
print(f'Original GBR Confusion Matrix:\n', gbr_cm_df)

Optimized GBR Confusion Matrix:
           Predicted 0  Predicted 1
Actual 0          375           20
Actual 1            5         2012
Original GBR Confusion Matrix:
           Predicted 0  Predicted 1
Actual 0          309           86
Actual 1           12         2005


In [60]:
#Calculate the accuracy score
opt_gbr_acc_score = accuracy_score(y_test, opt_gbr_predictions)

print(f'Optimized GBR Accuracy Score: {opt_gbr_acc_score}')
print(f'Original GBR Accuracy Score: {gb_acc_score}')

Optimized GBR Accuracy Score: 0.9896351575456053
Original GBR Accuracy Score: 0.9593698175787728


In [61]:
#Calculate the classification report
print(f'Optimized GBR Report:\n', classification_report(y_test, opt_gbr_predictions))
print(f'Original GBR Report:\n', classification_report(y_test, gbr_predictions))

Optimized GBR Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97       395
           1       0.99      1.00      0.99      2017

    accuracy                           0.99      2412
   macro avg       0.99      0.97      0.98      2412
weighted avg       0.99      0.99      0.99      2412

Original GBR Report:
               precision    recall  f1-score   support

           0       0.96      0.71      0.82       395
           1       0.95      0.99      0.97      2017

    accuracy                           0.95      2412
   macro avg       0.95      0.85      0.89      2412
weighted avg       0.95      0.95      0.95      2412



# SMOTEENN - Synthetic Minority Oversampling Technique and Edited Nearest Neighbors

In [63]:
#Create a SMOTEENN instance
smote_enn = SMOTEENN()
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)

#Use logistic regression to create model
smoteenn_model = LogisticRegression()
smoteenn_model.fit(X_resampled, y_resampled)

#Make predictions
smoteenn_predictions = smoteenn_model.predict(X_test_scaled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [64]:
#Calculate the confusion matrix
smoteenn_cm = confusion_matrix(y_test, smoteenn_predictions)

#Create a DataFrame from the confusion matrix
smoteenn_cm_df = pd.DataFrame(smoteenn_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

smoteenn_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,384,11
Actual 1,109,1908


In [65]:
#Calculate the accuracy score
smoteenn_acc_score = accuracy_score(y_test, smoteenn_predictions)
smoteenn_acc_score

0.9502487562189055

In [66]:
#Calculate the classification report
print(classification_report(y_test, smoteenn_predictions))

              precision    recall  f1-score   support

           0       0.78      0.97      0.86       395
           1       0.99      0.95      0.97      2017

    accuracy                           0.95      2412
   macro avg       0.89      0.96      0.92      2412
weighted avg       0.96      0.95      0.95      2412



In [None]:
#Create grid search parameters for smoteenn
smote_grid = {'n_estimators': [100, 300, 500, 700],
                'max_depth': [2, 6, 10, 14],
                'min_samples_leaf': [1, 3, 5, 7],
                'min_samples_split': [2, 6, 10, 14],
                'max_features': ['auto', 'sqrt', 'log2'],
                'learning_rate': [0.01, 0.1, 0.5, 0.75]}

## Optimizing the SMOTEENN model using for loop

In [67]:
#It seems that the for loop may be working better than the GridSearchCV, to save some time I will only use the for loop

In [74]:
#Use the past accuracy score to find the best parameters
smoteenn_acc_score_loop = 0.9593

for n in [100, 300, 500, 700]:
    for d in [2, 6, 10, 14]:
        for l in [0.01, 0.1, 0.5, 0.75]:
            for m in [1, 3, 5, 7]:
                for s in [2, 6, 10, 14]:
                    for f in ['auto', 'sqrt', 'log2']:
                        #Use logistic regression to create model
                        smoteenn_model = LogisticRegression(solver='lbfgs', max_iter=100)
                        smoteenn_model.fit(X_resampled, y_resampled)

                        #Make predictions
                        smoteenn_predictions = smoteenn_model.predict(X_test_scaled)
                        
                        #Calculate the accuracy score
                        smoteenn_acc_score_in_loop = accuracy_score(y_test, smoteenn_predictions)
                        
                        if smoteenn_acc_score_in_loop > smoteenn_acc_score_loop:
                            smoteenn_acc_score_loop = smoteenn_acc_score_in_loop
                            print(f'n_estimators: {n}, max_depth: {d}, learning_rate: {l}, min_samples_leaf: {m}, min_samples_split: {s}, max_features: {f}')
                            print(f'Accuracy Score: {smoteenn_acc_score_in_loop}')

TypeError: __init__() got an unexpected keyword argument 'max_depth'