In [89]:
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression

In [3]:
#Import sales data
adidas_sales_df = pd.read_csv('../adidas_sales.csv')
adidas_sales_df.head()

Unnamed: 0,Retailer ID,Invoice Date,Region_ID,State_ID,Product_ID,Price per Unit,Units Sold,Operating Margin,Sales Method,Total Sales,Operating Profit
0,1185732,2020-01-01,RG1,ST1,PD1,50.0,1200,0.5,In-store,60000.0,30000.0
1,1185732,2020-01-02,RG1,ST1,PD2,50.0,1000,0.3,In-store,50000.0,15000.0
2,1185732,2020-01-03,RG1,ST1,PD3,40.0,1000,0.35,In-store,40000.0,14000.0
3,1185732,2020-01-04,RG1,ST1,PD4,45.0,850,0.35,In-store,38250.0,13387.5
4,1185732,2020-01-05,RG1,ST1,PD5,60.0,900,0.3,In-store,54000.0,16200.0


In [15]:
adidas_sales_df.dtypes

Retailer ID                                  int64
Invoice Date                                object
Region_ID                                   object
State_ID                                    object
Product_ID                                  object
Price per Unit                             float64
Units Sold                                   int64
Operating Margin                           float64
Sales Method                                object
Total Sales                                float64
Operating Profit                           float64
Operating Margin Equal to or Above 0.35      int32
dtype: object

In [19]:
#Convert date to datetime
adidas_sales_df['Invoice Date'] = pd.to_datetime(adidas_sales_df['Invoice Date'])

In [20]:
adidas_sales_df['Operating Margin'].value_counts()

0.35    1309
0.40    1003
0.30     722
0.50     610
0.45     364
        ... 
0.48       1
0.27       1
0.17       1
0.65       1
0.17       1
Name: Operating Margin, Length: 110, dtype: int64

In [21]:
#Add binary column for operating margin above 0.35
adidas_sales_df['Operating Margin Equal to or Above 0.35'] = adidas_sales_df['Operating Margin'] >= 0.35

#Convert boolean to int
adidas_sales_df['Operating Margin Equal to or Above 0.35'] = adidas_sales_df['Operating Margin Equal to or Above 0.35'].astype(int)

adidas_sales_df.head()

Unnamed: 0,Retailer ID,Invoice Date,Region_ID,State_ID,Product_ID,Price per Unit,Units Sold,Operating Margin,Sales Method,Total Sales,Operating Profit,Operating Margin Equal to or Above 0.35
0,1185732,2020-01-01,RG1,ST1,PD1,50.0,1200,0.5,In-store,60000.0,30000.0,1
1,1185732,2020-01-02,RG1,ST1,PD2,50.0,1000,0.3,In-store,50000.0,15000.0,0
2,1185732,2020-01-03,RG1,ST1,PD3,40.0,1000,0.35,In-store,40000.0,14000.0,1
3,1185732,2020-01-04,RG1,ST1,PD4,45.0,850,0.35,In-store,38250.0,13387.5,1
4,1185732,2020-01-05,RG1,ST1,PD5,60.0,900,0.3,In-store,54000.0,16200.0,0


In [22]:
#Define feature set
X = adidas_sales_df.drop(['Operating Margin', 'Operating Margin Equal to or Above 0.35'], axis=1)

#Define target set
y = adidas_sales_df['Operating Margin Equal to or Above 0.35']

In [23]:
#Get dummy variables for categorical data
X = pd.get_dummies(X)

In [26]:
X.columns

Index(['Retailer ID', 'Invoice Date', 'Price per Unit', 'Units Sold',
       'Total Sales', 'Operating Profit', 'Region_ID_RG1', 'Region_ID_RG2',
       'Region_ID_RG3', 'Region_ID_RG4', 'Region_ID_RG5', 'State_ID_ST1',
       'State_ID_ST10', 'State_ID_ST11', 'State_ID_ST12', 'State_ID_ST13',
       'State_ID_ST14', 'State_ID_ST15', 'State_ID_ST16', 'State_ID_ST17',
       'State_ID_ST18', 'State_ID_ST19', 'State_ID_ST2', 'State_ID_ST20',
       'State_ID_ST21', 'State_ID_ST22', 'State_ID_ST23', 'State_ID_ST24',
       'State_ID_ST25', 'State_ID_ST26', 'State_ID_ST27', 'State_ID_ST28',
       'State_ID_ST29', 'State_ID_ST3', 'State_ID_ST30', 'State_ID_ST31',
       'State_ID_ST32', 'State_ID_ST33', 'State_ID_ST34', 'State_ID_ST35',
       'State_ID_ST36', 'State_ID_ST37', 'State_ID_ST38', 'State_ID_ST39',
       'State_ID_ST4', 'State_ID_ST40', 'State_ID_ST41', 'State_ID_ST42',
       'State_ID_ST43', 'State_ID_ST44', 'State_ID_ST45', 'State_ID_ST46',
       'State_ID_ST47', 'State_ID

In [28]:
#Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [32]:
#Create a StandardScaler instance
scaler = StandardScaler()

#Fit the StandardScaler with the training data
X_scaler = scaler.fit(X_train.drop(['Invoice Date'], axis=1))

#Scale the data
X_train_scaled = X_scaler.transform(X_train.drop(['Invoice Date'], axis=1))
X_test_scaled = X_scaler.transform(X_test.drop(['Invoice Date'], axis=1))

# Decision Tree Regression

In [33]:
#Create decision tree classifier instance
model = tree.DecisionTreeClassifier()

#Fit the model
model = model.fit(X_train_scaled, y_train)

In [34]:
#Make predictions
predictions = model.predict(X_test_scaled)

In [35]:
#Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions)

#Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,339,56
Actual 1,54,1963


In [37]:
#Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.9543946932006634

In [39]:
#Calculate the classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86       395
           1       0.97      0.97      0.97      2017

    accuracy                           0.95      2412
   macro avg       0.92      0.92      0.92      2412
weighted avg       0.95      0.95      0.95      2412



# Random Forest Regression

In [41]:
#Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [42]:
#Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [43]:
#Make predictions
rf_predictions = rf_model.predict(X_test_scaled)

In [44]:
#Calculate the confusion matrix
rf_cm = confusion_matrix(y_test, rf_predictions)

#Create a DataFrame from the confusion matrix
rf_cm_df = pd.DataFrame(rf_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

rf_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,329,66
Actual 1,29,1988


In [45]:
#Calculate the accuracy score
rf_acc_score = accuracy_score(y_test, rf_predictions)
rf_acc_score

0.9606135986733002

In [46]:
#Calculate the classification report
print(classification_report(y_test, rf_predictions))

              precision    recall  f1-score   support

           0       0.92      0.83      0.87       395
           1       0.97      0.99      0.98      2017

    accuracy                           0.96      2412
   macro avg       0.94      0.91      0.93      2412
weighted avg       0.96      0.96      0.96      2412



In [48]:
#Calculate feature importance in the Random Forest model
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.15599970935714697, 'Total Sales'),
 (0.12125717736262873, 'Units Sold'),
 (0.09595311101453556, 'Price per Unit'),
 (0.08873269907925498, 'Invoice Date'),
 (0.05563790591328575, 'Sales Method_In-store'),
 (0.03648906038298604, 'Product_ID_PD5'),
 (0.0305420538524161, 'Retailer ID'),
 (0.025976134880442692, 'Product_ID_PD4'),
 (0.024625105403040252, 'State_ID_ST47'),
 (0.024278396820122145, 'Product_ID_PD1'),
 (0.022459407018049535, 'Product_ID_PD6'),
 (0.020260544420410562, 'Sales Method_Online'),
 (0.018868836736298907, 'State_ID_ST9'),
 (0.018766920182543577, 'Product_ID_PD3'),
 (0.01851081400175879, 'Product_ID_PD2'),
 (0.016291515837811296, 'Region_ID_RG2'),
 (0.01614881150162768, 'Region_ID_RG1'),
 (0.015072106882428113, 'State_ID_ST16'),
 (0.013222242425645918, 'State_ID_ST7'),
 (0.012659242336347503, 'State_ID_ST8'),
 (0.010038027978326307, 'State_ID_ST29'),
 (0.009952176998546305, 'Operating Profit'),
 (0.007445013512547144, 'Region_ID_RG3'),
 (0.00657217059166161, 'State_I

# Gradient Boosting Regression

In [62]:
#Identify best learning rate
learning_rates = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in learning_rates:
    gb_model = GradientBoostingClassifier(n_estimators=60, learning_rate=learning_rate, max_features=5, max_depth=3, random_state=0)
    gb_model.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_model.score(X_train_scaled,y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_model.score(X_test_scaled,y_test)))

Learning rate:  0.01
Accuracy score (training): 0.841
Accuracy score (validation): 0.836
Learning rate:  0.05
Accuracy score (training): 0.853
Accuracy score (validation): 0.850
Learning rate:  0.1
Accuracy score (training): 0.879
Accuracy score (validation): 0.868
Learning rate:  0.25
Accuracy score (training): 0.903
Accuracy score (validation): 0.890
Learning rate:  0.5
Accuracy score (training): 0.938
Accuracy score (validation): 0.922
Learning rate:  0.75
Accuracy score (training): 0.945
Accuracy score (validation): 0.934
Learning rate:  1
Accuracy score (training): 0.956
Accuracy score (validation): 0.936


In [80]:
#Use best learning rate to create model
gb_model = GradientBoostingClassifier(n_estimators=60, learning_rate=1, max_features=10, max_depth=6, random_state=0)

#Fit the model
gb_model.fit(X_train_scaled, y_train)

#Make predictions
gb_predictions = gb_model.predict(X_test_scaled)

In [81]:
#Calculate the confusion matrix
gb_cm = confusion_matrix(y_test, gb_predictions)

#Create a DataFrame from the confusion matrix
gb_cm_df = pd.DataFrame(gb_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

gb_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,335,60
Actual 1,53,1964


In [82]:
#Calculate the accuracy score
gb_acc_score = accuracy_score(y_test, gb_predictions)
gb_acc_score

0.953150912106136

In [83]:
#Calculate the classification report
print(classification_report(y_test, gb_predictions))

              precision    recall  f1-score   support

           0       0.86      0.85      0.86       395
           1       0.97      0.97      0.97      2017

    accuracy                           0.95      2412
   macro avg       0.92      0.91      0.91      2412
weighted avg       0.95      0.95      0.95      2412



In [87]:
#Calculate feature importance in the Gradient Boosting model
gb_importances = gb_model.feature_importances_
sorted(zip(gb_model.feature_importances_, X.columns), reverse=True)

[(0.1273775420725895, 'Total Sales'),
 (0.10720015545543943, 'Units Sold'),
 (0.10560384098865462, 'Sales Method_In-store'),
 (0.09026777093634329, 'Price per Unit'),
 (0.08858855347884653, 'Invoice Date'),
 (0.04539245926073661, 'Product_ID_PD5'),
 (0.04441264939382341, 'Product_ID_PD4'),
 (0.03403834771472466, 'State_ID_ST8'),
 (0.03178918894743252, 'Region_ID_RG2'),
 (0.02647756129319641, 'Sales Method_Online'),
 (0.025763628034133006, 'Operating Profit'),
 (0.02140124157571539, 'State_ID_ST9'),
 (0.020727292832291154, 'Product_ID_PD1'),
 (0.018289634498606565, 'State_ID_ST47'),
 (0.018134273716296292, 'Product_ID_PD3'),
 (0.016885135608371522, 'Retailer ID'),
 (0.014266720668490971, 'Product_ID_PD6'),
 (0.011902287389132633, 'Product_ID_PD2'),
 (0.010717174975281947, 'State_ID_ST22'),
 (0.010187063903601837, 'State_ID_ST29'),
 (0.009632825591330416, 'State_ID_ST36'),
 (0.007452005544392202, 'State_ID_ST1'),
 (0.007112014715399898, 'Region_ID_RG4'),
 (0.006629045464673718, 'State_ID

# SMOTEENN - Synthetic Minority Oversampling Technique and Edited Nearest Neighbors

In [88]:
#Create a SMOTEENN instance
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)

In [91]:
#Use logistic regression to create model
smoteenn_model = LogisticRegression(solver='lbfgs', random_state=1, max_iter=200)
smoteenn_model.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=200, random_state=1)

In [92]:
#Make predictions
smoteenn_predictions = smoteenn_model.predict(X_test_scaled)

In [93]:
#Calculate the confusion matrix
smoteenn_cm = confusion_matrix(y_test, smoteenn_predictions)

#Create a DataFrame from the confusion matrix
smoteenn_cm_df = pd.DataFrame(smoteenn_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

smoteenn_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,381,14
Actual 1,101,1916


In [94]:
#Calculate the accuracy score
smoteenn_acc_score = accuracy_score(y_test, smoteenn_predictions)
smoteenn_acc_score

0.9523217247097844

In [95]:
#Calculate the classification report
print(classification_report(y_test, smoteenn_predictions))

              precision    recall  f1-score   support

           0       0.79      0.96      0.87       395
           1       0.99      0.95      0.97      2017

    accuracy                           0.95      2412
   macro avg       0.89      0.96      0.92      2412
weighted avg       0.96      0.95      0.95      2412

