In [1]:
import numpy as np
import pandas as pd

# Print NumPy floats with two digits after the decimal point
np.set_printoptions(precision=2)

# Seed handed to the train/test splits and the tree-based estimators so reruns match
RANDOM_STATE = 0

Data Cleaning and Preprocessing

In [34]:
# Load the partially cleaned retailer table
data = pd.read_csv('partially_cleaned_dataset.csv')

# Columns that hold plain numbers: parse each one, turning anything
# unparseable into NaN (errors='coerce')
int_cols = ['FY2021_retail_revenue', 'FY2021_parent_company/ group revenue', 'Geographies_of_operation']
for column in int_cols:
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Columns that hold percentages: strip the '%' sign, parse to numbers
# (NaN where unparseable), then divide by 100 to store them as fractions
float_cols = ['FY2016- 2021_retail_revenueCAGR3', 'FY2020-2021_retail_revenue_growth', 'FY2021_Net_profit_margin']
percent_stripped = data[float_cols].replace('%', '', regex=True)
data[float_cols] = percent_stripped.apply(pd.to_numeric, errors='coerce') / 100.0

In [35]:
# Show the frame after the numeric conversions as a visual sanity check
data

Unnamed: 0,Rank,Name of Company,Location,FY2021_retail_revenue,FY2021_parent_company/ group revenue,Dominant_operational_format,Geographies_of_operation,FY2016- 2021_retail_revenueCAGR3,FY2020-2021_retail_revenue_growth,FY2021_Net_profit_margin
0,1,Walmart Inc,United States,572754,572754,Hypermarket/ supercenter,24,0.033,0.024,0.024
1,2,"Amazon.com, Inc.",United States,239150,469822,Non-store,21,0.204,0.120,0.071
2,3,Costco Wholesale Corporation,United States,195929,195929,Cash & carry/ warehouse club,12,0.105,0.175,0.026
3,4,Schwarz Group,Germany,153754,156209,Discount store,33,0.078,0.055,
4,5,"The Home Depot, Inc",United States,151157,151157,Home improvement,3,0.098,0.144,0.109
...,...,...,...,...,...,...,...,...,...,...
245,246,EG Group Limited,United Kingdom,4606,26420,Convenience/ forecourt store,10,0.751,0.043,
246,247,"Yaoko Co., Ltd.",Japan,4575,4771,Supermarket,1,0.094,0.055,0.029
247,248,"Daiso Industries Co., Ltd",Japan,4546,4889,Discount department store,26,0.055,0.044,
248,249,Shufersal Ltd.,Israel,4544,4561,Discount store,1,0.046,-0.031,0.027


In [3]:
# Regression target: the FY2021_retail_revenue column (position 3)
y = data.iloc[:, 3].to_numpy()

# Feature matrix that keeps the categorical columns: every column except
# Rank, Name of Company, and the target FY2021_retail_revenue
x = data.iloc[:, [2, 4, 5, 6, 7, 8, 9]].to_numpy()

# Numeric-only feature matrix: the columns of `x` minus Location and
# Dominant_operational_format (i.e. no categorical data)
no_cat_x = data.iloc[:, [4, 6, 7, 8, 9]].to_numpy()

Filling in Missing Data

In [5]:
from sklearn.impute import SimpleImputer

# Replace missing numeric values with the column mean.
# Positions 0 and 2 of `x` (Location and Dominant_operational_format) are
# categorical, so only the remaining positions are imputed.
numeric_positions = [1, 3, 4, 5, 6]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, numeric_positions])
x[:, numeric_positions] = imputer.transform(x[:, numeric_positions])

# `no_cat_x` has no categorical columns, so the whole matrix can be imputed.
# The same imputer object is refit here, replacing the means learned from `x`.
# NOTE(review): both fits see every row, including rows that later end up in
# the test split -- consider fitting on the training rows only.
imputer.fit(no_cat_x)
no_cat_x = imputer.transform(no_cat_x)

Encode the Categorical Variables

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two categorical positions of `x` (0: Location,
# 2: Dominant_operational_format); all other columns pass through unchanged.
# Expected result per the original notes: 40 Location columns and
# 14 Dominant Operational Format columns.
categorical_positions = [0, 2]
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_positions)],
    remainder='passthrough',
)
x_encoded = ct.fit_transform(x)
# Convert the transformer output to a dense NumPy array
x = x_encoded.toarray()

# `no_cat_x` skips this step because it contains no categorical variables

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_options = dict(test_size=0.2, random_state=RANDOM_STATE)

x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

# `no_cat_x` is split in a separate call, so it gets its own `no_cat_y_train`
# and `no_cat_y_test`. Both calls use the same seed on the same number of rows,
# so they are expected to pick the same rows -- TODO confirm if that is relied on.
no_cat_x_train, no_cat_x_test, no_cat_y_train, no_cat_y_test = train_test_split(no_cat_x, y, **split_options)

In [8]:
from sklearn.preprocessing import StandardScaler
# Standardize each feature set with its own scaler. Refitting a single shared
# instance would throw away the statistics learned from `x_train`, leaving
# `scaler` unusable for any later data shaped like `x`.
# Each scaler is fit on the training split only and then applied to the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

no_cat_scaler = StandardScaler()
no_cat_x_train = no_cat_scaler.fit_transform(no_cat_x_train)
no_cat_x_test = no_cat_scaler.transform(no_cat_x_test)

Decision Tree Regression

In [9]:
from sklearn.tree import DecisionTreeRegressor

# Two tree configurations for each feature set:
#   *_2  -> min_samples_split=2 (the scikit-learn default, spelled out here)
#   *_20 -> min_samples_split=20
dtr_2 = DecisionTreeRegressor(min_samples_split=2, random_state=RANDOM_STATE)
dtr_20 = DecisionTreeRegressor(min_samples_split=20, random_state=RANDOM_STATE)
no_cat_dtr_2 = DecisionTreeRegressor(min_samples_split=2, random_state=RANDOM_STATE)
no_cat_dtr_20 = DecisionTreeRegressor(min_samples_split=20, random_state=RANDOM_STATE)

# Fit on the features that include the one-hot encoded categorical columns
dtr_2.fit(x_train, y_train)
dtr_20.fit(x_train, y_train)

# Fit on the numeric-only features
no_cat_dtr_2.fit(no_cat_x_train, no_cat_y_train)
no_cat_dtr_20.fit(no_cat_x_train, no_cat_y_train)

DecisionTreeRegressor(min_samples_split=20, random_state=0)

In [10]:
# Test-set predictions from the trees trained with the categorical columns
dtr_2_y_pred, dtr_20_y_pred = (tree.predict(x_test) for tree in (dtr_2, dtr_20))

# Test-set predictions from the trees trained on numeric-only features
no_cat_dtr_2_y_pred, no_cat_dtr_20_y_pred = (
    tree.predict(no_cat_x_test) for tree in (no_cat_dtr_2, no_cat_dtr_20)
)

In [None]:
# Two columns side by side: predicted value (left), actual value (right)
print(np.column_stack((dtr_2_y_pred, y_test)))

In [None]:
# Two columns side by side: predicted value (left), actual value (right)
print(np.column_stack((dtr_20_y_pred, y_test)))

In [None]:
# Two columns side by side: predicted value (left), actual value (right)
print(np.column_stack((no_cat_dtr_2_y_pred, no_cat_y_test)))

In [None]:
# Two columns side by side: predicted value (left), actual value (right)
print(np.column_stack((no_cat_dtr_20_y_pred, no_cat_y_test)))

Random Forest Regression

In [15]:
from sklearn.ensemble import RandomForestRegressor

# One seeded random forest per feature set; all other hyperparameters are left
# at the library defaults
rfr = RandomForestRegressor(random_state=RANDOM_STATE)
no_cat_rfr = RandomForestRegressor(random_state=RANDOM_STATE)

# Train on the features with categorical columns, then on the numeric-only features
rfr.fit(x_train, y_train)
no_cat_rfr.fit(no_cat_x_train, no_cat_y_train)

RandomForestRegressor(random_state=0)

In [None]:
# Test-set predictions from each random forest, using its matching feature set
rfr_y_pred, no_cat_rfr_y_pred = rfr.predict(x_test), no_cat_rfr.predict(no_cat_x_test)

In [None]:
# Two columns side by side: predicted value (left), actual value (right)
print(np.column_stack((rfr_y_pred, y_test)))

In [None]:
# Two columns side by side: predicted value (left), actual value (right)
print(np.column_stack((no_cat_rfr_y_pred, no_cat_y_test)))

Multiple Linear Regression

In [18]:
from sklearn.linear_model import LinearRegression

# One ordinary linear regression per feature set, with default settings
lr = LinearRegression()
no_cat_lr = LinearRegression()

# Train on the features with categorical columns, then on the numeric-only features
lr.fit(x_train, y_train)
no_cat_lr.fit(no_cat_x_train, no_cat_y_train)

LinearRegression()

In [None]:
# Test-set predictions from each linear model, using its matching feature set
lr_y_pred, no_cat_lr_y_pred = lr.predict(x_test), no_cat_lr.predict(no_cat_x_test)

In [None]:
# Two columns side by side: predicted value (left), actual value (right)
print(np.column_stack((lr_y_pred, y_test)))

In [None]:
# Two columns side by side: predicted value (left), actual value (right).
# The numeric-only predictions are paired with `no_cat_y_test`, the targets that
# came out of the same split as `no_cat_x_test`, matching the other no-cat cells.
print(np.column_stack((no_cat_lr_y_pred, no_cat_y_test)))

Metrics

In [31]:
from sklearn.metrics import r2_score, mean_squared_error

def _score_models(models, features, targets):
    """Evaluate fitted regressors on one test set.

    Args:
        models: Mapping of display name -> fitted estimator exposing ``predict``.
        features: Test feature matrix passed to each estimator's ``predict``.
        targets: True target values, row-aligned with ``features``.

    Returns:
        Dict mapping each display name to ``(r_squared, mean_squared_error)``.
    """
    results = {}
    for name, model in models.items():
        predicted_y = model.predict(features)
        results[name] = (r2_score(targets, predicted_y),
                         mean_squared_error(targets, predicted_y))
    return results


# Display name -> (r^2, mean squared error), both measured on the test split
scores: dict[str, tuple[float, float]] = {}

# Models trained on the features that include the categorical columns
cat_models = {
    'Decision Tree Regression with Minimum Sample Split of 2': dtr_2,
    'Decision Tree Regression with Minimum Sample Split of 20': dtr_20,
    'Random Forest Regression': rfr,
    'Multiple Linear Regression': lr
}
scores.update(_score_models(cat_models, x_test, y_test))

# Models trained on the numeric-only features (without categorical columns)
no_cat_models = {
    'Decision Tree Regression with Minimum Sample Split of 2 w/o Categorical Variables': no_cat_dtr_2,
    'Decision Tree Regression with Minimum Sample Split of 20 w/o Categorical Variables': no_cat_dtr_20,
    'Random Forest Regression w/o Categorical Variables': no_cat_rfr,
    'Multiple Linear Regression w/o Categorical Variables': no_cat_lr
}
scores.update(_score_models(no_cat_models, no_cat_x_test, no_cat_y_test))

print('Ranked models based on r^2 score')
# Highest r^2 first
ranked_scores = sorted(scores.items(), key=lambda item: item[1][0], reverse=True)
for i, (key, (r_squared, mse)) in enumerate(ranked_scores, start=1):
    # The second metric comes from mean_squared_error, so it is labelled as such
    # (it was previously printed as "Sum of Squared Residuals")
    print(f'{i}) Model: {key}', f'r^2 score: {round(r_squared * 100, 2)}%', f'Mean Squared Error: {round(mse, 2)}', sep='\n\t')
    print()

Ranked models based on r^2 score
1) Model: Random Forest Regression w/o Categorical Variables
	r^2 score: 95.32%
	Sum of Squared Residuals: 32433193.58

2) Model: Random Forest Regression
	r^2 score: 95.18%
	Sum of Squared Residuals: 33363722.1

3) Model: Decision Tree Regression with Minimum Sample Split of 20 w/o Categorical Variables
	r^2 score: 94.38%
	Sum of Squared Residuals: 38916698.99

4) Model: Decision Tree Regression with Minimum Sample Split of 20
	r^2 score: 94.37%
	Sum of Squared Residuals: 38973886.77

5) Model: Decision Tree Regression with Minimum Sample Split of 2 w/o Categorical Variables
	r^2 score: 93.77%
	Sum of Squared Residuals: 43186139.22

6) Model: Linear Regression w/o Categorical Variables
	r^2 score: 90.17%
	Sum of Squared Residuals: 68132002.35

7) Model: Multiple Linear Regression
	r^2 score: 78.49%
	Sum of Squared Residuals: 149040477.81

8) Model: Decision Tree Regression with Minimum Sample Split of 2
	r^2 score: 76.94%
	Sum of Squared Residuals: 159