In [1]:
import numpy as np
import pandas as pd

# Single seed passed as random_state to the train/test split and to both regressors below,
# so repeated runs produce the same split and the same fitted models.
RANDOM_STATE = 0

Data Cleaning

In [2]:
# Load the raw CSV export (decoded as latin-1).
data = pd.read_csv('output_file.csv', encoding='latin-1')

# Numeric columns stored as text: drop every comma, then parse.
# errors='coerce' turns anything unparseable into NaN, so the result may be float
# rather than a true integer dtype.
int_cols = ['FY2021_retail_revenue', 'FY2021_parent_company/ group revenue', 'Geographies_of_operation']
for col in int_cols:
    without_commas = data[col].replace(',', '', regex=True)
    data[col] = pd.to_numeric(without_commas, errors='coerce')

# Percentage columns: drop the '%' sign, parse, and rescale to a fraction (x / 100).
float_cols = ['FY2016- 2021_retail_revenueCAGR3', 'FY2020-2021_retail_revenue_growth', 'FY2021_Net_profit_margin']
for col in float_cols:
    without_percent = data[col].replace('%', '', regex=True)
    data[col] = pd.to_numeric(without_percent, errors='coerce') / 100.0

# Text columns: trim leading/trailing whitespace so equal labels compare equal.
str_cols = ['Location', 'Dominant_operational_format']
for col in str_cols:
    data[col] = data[col].str.strip()

In [None]:
# Display the cleaned DataFrame to eyeball the type conversions performed above.
data

In [3]:
# Columns are picked by position in the cleaned frame, not by name.
feature_positions = [2, 4, 5, 6, 7, 8, 9]
target_position = 3

# Feature matrix: a mix of text and numeric columns, so this is an object-dtype array.
x = data.iloc[:, feature_positions].values

# Target vector.
# NOTE(review): column 3 is used as a numeric regression target (the y_test values printed
# further down are revenue-sized numbers), so it is presumably FY2021_retail_revenue rather
# than Dominant_operational_format -- confirm against the CSV header.
y = data.iloc[:, target_position].values

In [26]:
# Number of distinct operational formats; dropna=False keeps a missing value counted
# as its own category, matching len(.unique()).
data['Dominant_operational_format'].nunique(dropna=False)

14

In [4]:
from sklearn.impute import SimpleImputer
# Positions (within x) of the columns that receive mean imputation.
imputed_positions = [1, 3, 4, 5, 6]

# Replace NaN in those columns with the column mean.
# NOTE(review): the imputer is fitted on every row, before the train/test split below,
# so test rows contribute to the imputed means.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
x[:, imputed_positions] = imputer.fit_transform(x[:, imputed_positions])

In [22]:
# Inspect the feature matrix after imputation (the categorical columns are not encoded yet).
x

array([['United States', 572754.0, 'Hypermarket/ supercenter', ...,
        0.033, 0.024, 0.024],
       ['United States', 469822.0, 'Non-store', ..., 0.204, 0.12, 0.071],
       ['United States', 195929.0, 'Cash & carry/ warehouse club', ...,
        0.105, 0.175, 0.026000000000000002],
       ...,
       ['Japan', 4889.0, 'Discount department store', ..., 0.055,
        0.044000000000000004, 0.04248947368421053],
       ['Israel', 4561.0, 'Discount store', ..., 0.046, -0.031,
        0.027000000000000003],
       ['United Kingdom', 4478.0, 'Other specialty', ..., 0.158, 0.196,
        0.095]], dtype=object)

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two categorical feature columns (positions 0 and 2 of x) and pass
# every other column through unchanged.
# Author's note: Location expands into 40 columns and Dominant Operational Format into 14.
categorical_positions = [0, 2]
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_positions)],
    remainder='passthrough',
)

# .toarray() assumes fit_transform returned a sparse matrix.
# x is overwritten with the encoded matrix, so re-running this cell without re-running
# the cells above would try to encode the already-encoded data.
encoded = ct.fit_transform(x)
x = encoded.toarray()

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the split is seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [7]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both splits so the test rows never influence the scaler.
scaler = StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [8]:
from sklearn.tree import DecisionTreeRegressor
# Regression tree; min_samples_split=6 means a node needs at least 6 samples before
# it may be split. Seeded for reproducibility.
dtr = DecisionTreeRegressor(
    random_state=RANDOM_STATE,
    min_samples_split=6,
)
dtr.fit(x_train, y_train)

DecisionTreeRegressor(min_samples_split=6, random_state=0)

In [9]:
# Predict on the held-out rows.
dtr_y_pred = dtr.predict(x_test)

# Side-by-side view: column 0 = prediction, column 1 = actual target value.
print(np.column_stack((dtr_y_pred, y_test)))

[[  5188.2    5233.  ]
 [  9902.8    9509.  ]
 [ 14522.8   13242.  ]
 [  8112.     7650.  ]
 [  8112.     7757.  ]
 [  8858.4    7372.  ]
 [  5491.     6151.  ]
 [ 14522.8   14500.  ]
 [ 19390.5   18089.  ]
 [ 11637.6    7718.  ]
 [  6299.5    5436.  ]
 [  5359.5    5150.  ]
 [ 11637.6   11200.  ]
 [  6385.5    6426.  ]
 [  9902.8   10039.  ]
 [ 17831.5   16488.  ]
 [  6385.5    5983.  ]
 [  7542.75   8204.  ]
 [  8112.     7859.  ]
 [ 12614.6   12731.  ]
 [  8112.     7664.  ]
 [  4606.    24793.  ]
 [  4948.75   4916.  ]
 [  9902.8   10700.  ]
 [  8034.5    7882.  ]
 [ 12231.5   13593.  ]
 [ 40645.   120947.  ]
 [ 19390.5   21000.  ]
 [  6385.5    6298.  ]
 [ 30128.75  30035.  ]
 [  6037.     5942.  ]
 [ 14522.8   14979.  ]
 [ 11637.6   10986.  ]
 [  6798.25   6773.  ]
 [  7164.5    7203.  ]
 [  7542.75   8500.  ]
 [  7560.     8891.  ]
 [ 98698.25 136971.  ]
 [ 48573.    48550.  ]
 [  8983.25   9242.  ]
 [  8749.     9690.  ]
 [ 73436.    89381.  ]
 [ 19390.5   11217.  ]
 [  6676.5 

In [21]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with the library-default hyperparameters; only the seed is pinned.
rfr = RandomForestRegressor(random_state=RANDOM_STATE)
rfr.fit(X=x_train, y=y_train)

RandomForestRegressor(random_state=0)

In [22]:
# Predict on the held-out rows.
rfr_y_pred = rfr.predict(x_test)

# Side-by-side view: column 0 = prediction, column 1 = actual target value.
print(np.column_stack((rfr_y_pred, y_test)))

[[  5243.73   5233.  ]
 [  9273.88   9509.  ]
 [ 13688.82  13242.  ]
 [  7336.17   7650.  ]
 [  7844.79   7757.  ]
 [  8098.6    7372.  ]
 [  5950.1    6151.  ]
 [ 14406.48  14500.  ]
 [ 19272.76  18089.  ]
 [ 10728.76   7718.  ]
 [  6314.36   5436.  ]
 [  5300.42   5150.  ]
 [ 11017.99  11200.  ]
 [  6342.86   6426.  ]
 [ 10145.81  10039.  ]
 [ 16821.74  16488.  ]
 [  6355.76   5983.  ]
 [  8475.71   8204.  ]
 [  7924.97   7859.  ]
 [ 12311.85  12731.  ]
 [  7764.09   7664.  ]
 [ 19408.2   24793.  ]
 [  4974.     4916.  ]
 [ 10289.06  10700.  ]
 [  7624.77   7882.  ]
 [ 12481.23  13593.  ]
 [110570.49 120947.  ]
 [ 19187.44  21000.  ]
 [  6388.38   6298.  ]
 [ 28829.43  30035.  ]
 [  5902.82   5942.  ]
 [ 14319.88  14979.  ]
 [ 10247.5   10986.  ]
 [  6580.9    6773.  ]
 [  7028.66   7203.  ]
 [  7978.8    8500.  ]
 [  8655.35   8891.  ]
 [101011.71 136971.  ]
 [ 43639.72  48550.  ]
 [  9025.77   9242.  ]
 [  9946.11   9690.  ]
 [ 80277.7   89381.  ]
 [ 18894.52  11217.  ]
 [  6581.37

Metrics

In [23]:
from sklearn.metrics import r2_score, mean_squared_error

In [24]:
# Decision Tree Regression Metrics
# Both scorers take the true values first and the predictions second.

dtr_r_squared = r2_score(y_test, dtr_y_pred)
dtr_mean_squared_error = mean_squared_error(y_test, dtr_y_pred)

# mean_squared_error averages the squared residuals over the test rows; it is not their
# sum, so the label says "mean squared error".
print(f'r^2: {dtr_r_squared}; mean squared error: {dtr_mean_squared_error}')

r^2: 0.7421438764558577; sum of squared residuals: 178630763.76380002


In [None]:
# Random Forest Regression Metrics
# Both scorers take the true values first and the predictions second.

rfr_r_squared = r2_score(y_test, rfr_y_pred)
rfr_mean_squared_error = mean_squared_error(y_test, rfr_y_pred)

# mean_squared_error averages the squared residuals over the test rows; it is not their
# sum, so the label says "mean squared error".
print(f'r^2: {rfr_r_squared}; mean squared error: {rfr_mean_squared_error}')