In [1]:
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
#Import sales data
adidas_sales_df = pd.read_csv('../adidas_sales.csv')
adidas_sales_df.head()

Unnamed: 0,Retailer ID,Invoice Date,Region_ID,State_ID,Product_ID,Price per Unit,Units Sold,Operating Margin,Sales Method,Total Sales,Operating Profit
0,1185732,2020-01-01,RG1,ST1,PD1,50.0,1200,0.5,In-store,60000.0,30000.0
1,1185732,2020-01-02,RG1,ST1,PD2,50.0,1000,0.3,In-store,50000.0,15000.0
2,1185732,2020-01-03,RG1,ST1,PD3,40.0,1000,0.35,In-store,40000.0,14000.0
3,1185732,2020-01-04,RG1,ST1,PD4,45.0,850,0.35,In-store,38250.0,13387.5
4,1185732,2020-01-05,RG1,ST1,PD5,60.0,900,0.3,In-store,54000.0,16200.0


In [15]:
adidas_sales_df.dtypes

Retailer ID                                  int64
Invoice Date                                object
Region_ID                                   object
State_ID                                    object
Product_ID                                  object
Price per Unit                             float64
Units Sold                                   int64
Operating Margin                           float64
Sales Method                                object
Total Sales                                float64
Operating Profit                           float64
Operating Margin Equal to or Above 0.35      int32
dtype: object

In [19]:
#Convert date to datetime
adidas_sales_df['Invoice Date'] = pd.to_datetime(adidas_sales_df['Invoice Date'])

In [20]:
adidas_sales_df['Operating Margin'].value_counts()

0.35    1309
0.40    1003
0.30     722
0.50     610
0.45     364
        ... 
0.48       1
0.27       1
0.17       1
0.65       1
0.17       1
Name: Operating Margin, Length: 110, dtype: int64

In [21]:
#Add binary column for operating margin above 0.35
adidas_sales_df['Operating Margin Equal to or Above 0.35'] = adidas_sales_df['Operating Margin'] >= 0.35

#Convert boolean to int
adidas_sales_df['Operating Margin Equal to or Above 0.35'] = adidas_sales_df['Operating Margin Equal to or Above 0.35'].astype(int)

adidas_sales_df.head()

Unnamed: 0,Retailer ID,Invoice Date,Region_ID,State_ID,Product_ID,Price per Unit,Units Sold,Operating Margin,Sales Method,Total Sales,Operating Profit,Operating Margin Equal to or Above 0.35
0,1185732,2020-01-01,RG1,ST1,PD1,50.0,1200,0.5,In-store,60000.0,30000.0,1
1,1185732,2020-01-02,RG1,ST1,PD2,50.0,1000,0.3,In-store,50000.0,15000.0,0
2,1185732,2020-01-03,RG1,ST1,PD3,40.0,1000,0.35,In-store,40000.0,14000.0,1
3,1185732,2020-01-04,RG1,ST1,PD4,45.0,850,0.35,In-store,38250.0,13387.5,1
4,1185732,2020-01-05,RG1,ST1,PD5,60.0,900,0.3,In-store,54000.0,16200.0,0


In [22]:
#Define feature set
X = adidas_sales_df.drop(['Operating Margin', 'Operating Margin Equal to or Above 0.35'], axis=1)

#Define target set
y = adidas_sales_df['Operating Margin Equal to or Above 0.35']

In [23]:
#Get dummy variables for categorical data
X = pd.get_dummies(X)

In [26]:
X.columns

Index(['Retailer ID', 'Invoice Date', 'Price per Unit', 'Units Sold',
       'Total Sales', 'Operating Profit', 'Region_ID_RG1', 'Region_ID_RG2',
       'Region_ID_RG3', 'Region_ID_RG4', 'Region_ID_RG5', 'State_ID_ST1',
       'State_ID_ST10', 'State_ID_ST11', 'State_ID_ST12', 'State_ID_ST13',
       'State_ID_ST14', 'State_ID_ST15', 'State_ID_ST16', 'State_ID_ST17',
       'State_ID_ST18', 'State_ID_ST19', 'State_ID_ST2', 'State_ID_ST20',
       'State_ID_ST21', 'State_ID_ST22', 'State_ID_ST23', 'State_ID_ST24',
       'State_ID_ST25', 'State_ID_ST26', 'State_ID_ST27', 'State_ID_ST28',
       'State_ID_ST29', 'State_ID_ST3', 'State_ID_ST30', 'State_ID_ST31',
       'State_ID_ST32', 'State_ID_ST33', 'State_ID_ST34', 'State_ID_ST35',
       'State_ID_ST36', 'State_ID_ST37', 'State_ID_ST38', 'State_ID_ST39',
       'State_ID_ST4', 'State_ID_ST40', 'State_ID_ST41', 'State_ID_ST42',
       'State_ID_ST43', 'State_ID_ST44', 'State_ID_ST45', 'State_ID_ST46',
       'State_ID_ST47', 'State_ID

In [28]:
#Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [32]:
#Create a StandardScaler instance
scaler = StandardScaler()

#Fit the StandardScaler with the training data
X_scaler = scaler.fit(X_train.drop(['Invoice Date'], axis=1))

#Scale the data
X_train_scaled = X_scaler.transform(X_train.drop(['Invoice Date'], axis=1))
X_test_scaled = X_scaler.transform(X_test.drop(['Invoice Date'], axis=1))

# Decision Tree Regression

In [33]:
#Create decision tree classifier instance
model = tree.DecisionTreeClassifier()

#Fit the model
model = model.fit(X_train_scaled, y_train)

In [34]:
#Make predictions
predictions = model.predict(X_test_scaled)

In [35]:
#Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions)

#Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,339,56
Actual 1,54,1963


In [37]:
#Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.9543946932006634

In [39]:
#Calculate the classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86       395
           1       0.97      0.97      0.97      2017

    accuracy                           0.95      2412
   macro avg       0.92      0.92      0.92      2412
weighted avg       0.95      0.95      0.95      2412

