Reminders

Remember to cite sckit-learn for using their software (says so on website)

In [1]:
import numpy as np
import pandas as pd

RANDOM_STATE = 0

Data Cleaning

In [2]:
data = pd.read_csv('leading_retailers_2021.csv', encoding='latin-1')

# Converts columns from strings to integers
int_cols = ['FY2021_retail_revenue', 'FY2021_parent_company/ group revenue', 'Geographies_of_operation']
data[int_cols] = data[int_cols].replace(',', '', regex=True).apply(pd.to_numeric, errors='coerce')

# Converts percentage strings to floats
float_cols = ['FY2016- 2021_retail_revenueCAGR3', 'FY2020-2021_retail_revenue_growth', 'FY2021_Net_profit_margin']
data[float_cols] = data[float_cols].replace('%', '', regex=True).apply(pd.to_numeric, errors='coerce') / 100.0

# Strip whitespace from strings
str_cols = ['Location', 'Dominant_operational_format']
data[str_cols] = data[str_cols].apply(lambda x: x.str.strip())

In [19]:
data

Unnamed: 0,Rank,Name of Company,Location,FY2021_retail_revenue,FY2021_parent_company/ group revenue,Dominant_operational_format,Geographies_of_operation,FY2016- 2021_retail_revenueCAGR3,FY2020-2021_retail_revenue_growth,FY2021_Net_profit_margin
0,1,Walmart Inc,United States,572754,572754,Hypermarket/ supercenter,24,0.033,0.024,0.024
1,2,"Amazon.com, Inc.",United States,239150,469822,Non-store,21,0.204,0.120,0.071
2,3,Costco Wholesale Corporation,United States,195929,195929,Cash & carry/ warehouse club,12,0.105,0.175,0.026
3,4,Schwarz Group,Germany,153754,156209,Discount store,33,0.078,0.055,
4,5,"The Home Depot, Inc",United States,151157,151157,Home improvement,3,0.098,0.144,0.109
...,...,...,...,...,...,...,...,...,...,...
245,246,EG Group Limited,United Kingdom,4606,26420,Convenience/ forecourt store,10,0.751,0.043,
246,247,"Yaoko Co., Ltd.",Japan,4575,4771,Supermarket,1,0.094,0.055,0.029
247,248,"Daiso Industries Co., Ltd",Japan,4546,4889,Discount department store,26,0.055,0.044,
248,249,Shufersal Ltd.,Israel,4544,4561,Discount store,1,0.046,-0.031,0.027


In [3]:
# All columns except for Rank, Name of company, and Dominant operational format
# I've also tried using [:, 6:-1] and [:, 7:-1] as I realized column 2
# are strongly correlated with the Rank, and the last column has a lot of NaN values
x = data.iloc[:, [2, 3, 4, 6, 7, 8, 9]].values

# Dominant operational format column
y = data.iloc[:, 5].values

In [4]:
# Replace missing values with an average
### MAKE SURE TO EXCLUDE THE LOCATION COLUMN IF IT IS IN X ###
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x[:, 1:] = imputer.fit_transform(x[:, 1:])

In [46]:
x

array([[ 2.40000000e+01,  3.30000000e-02,  2.40000000e-02],
       [ 2.10000000e+01,  2.04000000e-01,  1.20000000e-01],
       [ 1.20000000e+01,  1.05000000e-01,  1.75000000e-01],
       [ 3.30000000e+01,  7.80000000e-02,  5.50000000e-02],
       [ 3.00000000e+00,  9.80000000e-02,  1.44000000e-01],
       [ 1.00000000e+00,  3.50000000e-02,  4.10000000e-02],
       [ 1.00000000e+00,  2.80000000e-01,  2.51000000e-01],
       [ 6.00000000e+00,  4.70000000e-02,  3.70000000e-02],
       [ 1.90000000e+01,  4.80000000e-02, -4.00000000e-03],
       [ 1.00000000e+00,  8.50000000e-02,  1.32000000e-01],
       [ 1.00000000e+00,  4.30000000e-02,  9.80000000e-02],
       [ 2.00000000e+00,  8.20000000e-02,  7.40000000e-02],
       [ 1.00000000e+01,  3.90000000e-02,  1.20000000e-02],
       [ 5.00000000e+00,  1.90000000e-02,  5.70000000e-02],
       [ 1.80000000e+01,  9.00000000e-02,  5.47000000e-01],
       [ 1.00000000e+00,  4.80000000e-02,  2.30000000e-02],
       [ 1.10000000e+01,  1.00000000e-02

In [5]:
# Encode Categorical Data in X
### ONLY RUN THIS WHEN X CONTAINS THE LOCATION COLUMN ###
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder




































ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = ct.fit_transform(x).toarray()  # Encodes Location Column into 40 separate columns

In [6]:
# Encode Categorical Data in Y
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [49]:
y

array([10, 11,  1,  6,  9, 13, 11,  7,  6,  4,  7,  9, 13, 10,  2, 13, 10,
       13, 13, 12, 13,  8,  0, 13, 12, 13, 10, 13, 10, 10, 10, 10,  6, 13,
        0, 10, 13,  9,  6, 13, 10, 13, 13,  6,  8, 13,  5,  1,  2,  3, 13,
        0, 13,  7,  9, 10,  0,  8,  0, 13,  0,  3,  9, 13,  2,  7, 11, 12,
        0, 11,  0,  1,  2, 11, 12,  2, 13,  4, 13,  3, 12, 13, 13, 13, 13,
        3, 10, 11,  8, 11,  6, 12,  3, 10,  7,  9, 12, 12,  7, 12, 11,  9,
       13, 13,  6,  8,  0,  9, 13, 12,  6, 13, 12,  2,  6, 10,  3,  9, 10,
        0,  0, 13, 12,  3,  3, 13, 10, 13,  0, 10, 11, 13,  9, 10, 13,  2,
       12, 12, 11, 13,  3,  9, 13,  4,  6,  7, 10, 13,  6,  6, 12, 12,  6,
       13, 12, 13,  8, 10,  0,  8, 12,  2, 12, 12, 12, 12,  8, 12, 10,  9,
       13,  2,  0, 12,  3,  8,  8,  8, 13, 13, 10,  7, 12,  0,  7,  3,  9,
        6,  8,  8,  3,  2, 12, 12, 12,  0,  3, 10,  6, 13, 13, 12, 13,  0,
       13,  2, 12,  8, 12, 12,  7, 10,  8, 13, 10,  7, 12,  3,  2,  6,  8,
       13, 13, 12,  3,  0

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_STATE)

In [8]:
print(x_train[:, 40:])

[[ 1.66670000e+04  1.66670000e+04  1.00000000e+00  6.20000000e-02
   8.00000000e-02  2.60000000e-02]
 [ 7.44000000e+03  7.44000000e+03  1.00000000e+02  3.84000000e-01
   8.80000000e-02  2.60000000e-02]
 [ 6.56000000e+03  6.56000000e+03  1.00000000e+00  9.60000000e-02
   8.60000000e-02  2.60000000e-02]
 ...
 [ 9.89600000e+03  9.93100000e+03  2.20000000e+01 -1.20000000e-02
   1.00000000e-02  4.24894737e-02]
 [ 2.46200000e+04  2.95950000e+04  2.40000000e+01 -8.80000000e-02
  -5.00000000e-02 -2.00000000e-03]
 [ 6.78500000e+03  6.78500000e+03  7.40000000e+01 -2.70000000e-02
   2.53000000e-01  9.50000000e-02]]


In [8]:
# Feature Scaling
# from sklearn.preprocessing import MaxAbsScaler
# scaler = MaxAbsScaler()  # Seems to act like the Normalization we learned in class

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# I suspect that our data doesn't have a gaussian distribution because
# our data picks the top 250 companies, instead of a random sample
# I think this would make using StandardScaler inappropriate

x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [9]:
x_train[:, 40:]

array([[ 0.02909975,  0.02909975,  0.00465116,  0.04049641,  0.0990099 ,
         0.07514451],
       [ 0.01298987,  0.01298987,  0.46511628,  0.25081646,  0.10891089,
         0.07514451],
       [ 0.01145343,  0.01145343,  0.00465116,  0.06270411,  0.10643564,
         0.07514451],
       ...,
       [ 0.01727792,  0.01733903,  0.10232558, -0.00783801,  0.01237624,
         0.12280195],
       [ 0.0429853 ,  0.0516714 ,  0.11162791, -0.05747877, -0.06188119,
        -0.00578035],
       [ 0.01184627,  0.01184627,  0.34418605, -0.01763553,  0.31311881,
         0.27456647]])

Decision Tree Classification

In [9]:
# Creating and Training Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE)
dtc.fit(x_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [11]:
# Predicting test results with decision tree classifier
dtc_y_pred = dtc.predict(x_test)
print(np.concatenate((dtc_y_pred.reshape(len(dtc_y_pred), 1), y_test.reshape(len(dtc_y_pred), 1)), 1))

[[ 8  0]
 [10 12]
 [ 0  3]
 [10 10]
 [ 3 12]
 [ 2  2]
 [13  6]
 [13 13]
 [13 13]
 [13 13]
 [13  2]
 [ 8 11]
 [13 13]
 [ 2  9]
 [13  3]
 [11 11]
 [ 0  0]
 [ 8 13]
 [13  6]
 [ 9 12]
 [13  8]
 [10 13]
 [13 13]
 [13 13]
 [11 12]
 [ 8  6]
 [ 6  6]
 [13 10]
 [ 0 12]
 [11  9]
 [10 13]
 [13 13]
 [ 2  6]
 [12 12]
 [10  8]
 [12 12]
 [ 3 11]
 [ 7 13]
 [ 3  0]
 [ 2 10]
 [ 8 10]
 [13 13]
 [ 0  9]
 [10  8]
 [ 6 11]
 [12 12]
 [10 12]
 [ 9  8]
 [13  2]
 [12 13]]


In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix

print(confusion_matrix(y_test, dtc_y_pred))
print(accuracy_score(y_test, dtc_y_pred))

[[1 0 1 0 0 1 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 2]
 [1 0 0 0 0 0 0 0 0 0 1]
 [0 1 0 1 0 1 0 0 0 0 2]
 [0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 2 0 0 1]
 [1 1 0 0 0 0 0 0 1 0 0]
 [0 1 0 0 0 1 0 1 0 0 1]
 [0 0 1 1 0 1 0 0 1 0 0]
 [1 0 1 0 0 0 1 2 1 3 0]
 [0 0 0 0 1 1 0 2 0 1 8]]
0.32


K-Nearest Neighbors Classification

In [13]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier()
knc.fit(x_train, y_train)

KNeighborsClassifier()

In [14]:
# Predicting Test Results
knc_y_pred = knc.predict(x_test)
print(np.concatenate((knc_y_pred.reshape(len(knc_y_pred), 1), y_test.reshape(len(knc_y_pred), 1)), 1))

[[ 7  0]
 [13 12]
 [13  3]
 [ 2 10]
 [ 3 12]
 [12  2]
 [13  6]
 [13 13]
 [ 6 13]
 [13 13]
 [ 7  2]
 [12 11]
 [13 13]
 [ 7  9]
 [ 1  3]
 [12 11]
 [12  0]
 [13 13]
 [12  6]
 [12 12]
 [13  8]
 [10 13]
 [ 0 13]
 [13 13]
 [12 12]
 [ 7  6]
 [13  6]
 [13 10]
 [12 12]
 [13  9]
 [12 13]
 [ 1 13]
 [ 0  6]
 [12 12]
 [10  8]
 [13 12]
 [ 3 11]
 [ 1 13]
 [ 3  0]
 [12 10]
 [10 10]
 [13 13]
 [13  9]
 [13  8]
 [12 11]
 [12 12]
 [13 12]
 [ 7  8]
 [13  2]
 [13 13]]


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix

print(confusion_matrix(y_test, knc_y_pred))
print(accuracy_score(y_test, knc_y_pred))

[[0 0 0 1 0 1 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 1 1]
 [0 1 0 0 0 0 0 0 0 0 0 1]
 [1 0 0 0 0 1 0 0 0 0 1 2]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 1 0 0 2]
 [0 0 0 0 0 1 0 0 0 0 0 2]
 [0 0 1 0 0 0 0 0 1 0 1 1]
 [0 0 0 1 0 0 0 0 0 0 3 0]
 [0 0 0 1 0 0 0 0 0 0 5 3]
 [1 2 0 0 1 0 0 0 1 0 1 7]]
0.26


Gaussian Naive Bayes Classifier

In [16]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(x_train, y_train)

GaussianNB()

In [17]:
# Predicting Test Results
gnb_y_pred = gnb.predict(x_test)
print(np.concatenate((gnb_y_pred.reshape(len(gnb_y_pred), 1), y_test.reshape(len(gnb_y_pred), 1)), 1))

[[ 4  0]
 [ 0 12]
 [ 0  3]
 [ 3 10]
 [11 12]
 [12  2]
 [ 1  6]
 [ 0 13]
 [ 6 13]
 [ 0 13]
 [ 4  2]
 [ 9 11]
 [ 1 13]
 [ 1  9]
 [ 3  3]
 [12 11]
 [ 9  0]
 [ 7 13]
 [ 0  6]
 [ 4 12]
 [ 4  8]
 [ 9 13]
 [ 9 13]
 [ 1 13]
 [11 12]
 [12  6]
 [ 1  6]
 [ 1 10]
 [ 9 12]
 [ 0  9]
 [ 6 13]
 [ 3 13]
 [ 3  6]
 [11 12]
 [11  8]
 [ 1 12]
 [ 1 11]
 [ 1 13]
 [ 1  0]
 [ 3 10]
 [11 10]
 [ 4 13]
 [11  9]
 [ 9  8]
 [ 1 11]
 [ 7 12]
 [ 9 12]
 [ 1  8]
 [ 3  2]
 [ 3 13]]


In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix

print(confusion_matrix(y_test, gnb_y_pred))
print(accuracy_score(y_test, gnb_y_pred))

[[0 1 0 0 1 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 1 0 0 0 0 0 0 1 0]
 [1 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 2 0 1 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 1 0 1 0 0]
 [1 1 0 0 0 0 0 0 0 0 1 0 0]
 [0 1 0 2 0 0 0 0 0 0 1 0 0]
 [0 2 0 0 0 0 0 0 1 0 0 1 0]
 [1 1 0 0 1 0 1 0 2 0 3 0 0]
 [2 3 0 2 1 2 1 0 2 0 0 0 0]]
0.02
