In [1]:
import numpy as np
import pandas as pd

# Print NumPy arrays with two digits after the decimal point.
np.set_printoptions(precision=2)

# Single seed shared by the train/test split and the seeded estimators below,
# so a fresh Restart & Run All reproduces the same numbers.
RANDOM_STATE = 0

Data Cleaning

In [2]:
# Load the raw table; the file is decoded as latin-1.
data = pd.read_csv('output_file.csv', encoding='latin-1')

# Remove commas from these columns and parse them as numbers.
# Anything that still cannot be parsed becomes NaN (errors='coerce'), so the
# resulting dtype may be float rather than int.
int_cols = [
    'FY2021_retail_revenue',
    'FY2021_parent_company/ group revenue',
    'Geographies_of_operation',
]
data[int_cols] = (
    data[int_cols]
    .replace(',', '', regex=True)
    .apply(pd.to_numeric, errors='coerce')
)

# Remove '%' signs, parse as numbers, and divide by 100 so percentages are
# stored as fractions. Unparseable entries become NaN.
float_cols = [
    'FY2016- 2021_retail_revenueCAGR3',
    'FY2020-2021_retail_revenue_growth',
    'FY2021_Net_profit_margin',
]
data[float_cols] = (
    data[float_cols]
    .replace('%', '', regex=True)
    .apply(pd.to_numeric, errors='coerce')
    .div(100.0)
)

# Trim leading/trailing whitespace from the text columns.
str_cols = ['Location', 'Dominant_operational_format']
data[str_cols] = data[str_cols].apply(lambda column: column.str.strip())

In [None]:
# Sanity-check the cleaned frame. head() shows only the first rows, which keeps
# the rendered output (and the saved notebook) small.
data.head()

In [3]:
# Target: the FY2021_retail_revenue column (position 3).
y = data.iloc[:, 3].to_numpy()

# Features: every column except Rank, Name of Company, and the target,
# i.e. position 2 plus positions 4 through 9.
x = data.iloc[:, [2, *range(4, 10)]].to_numpy()

In [4]:
from sklearn.impute import SimpleImputer
# Columns of x that get mean-imputed: everything except columns 0 and 2, which
# the encoding cell below treats as the categorical features (Location and
# Dominant Operational Format).
numeric_feature_idx = [1, 3, 4, 5, 6]

# Replace NaNs in the numeric features with the column mean.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split further down, so test rows contribute to the imputed means.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, numeric_feature_idx])
x[:, numeric_feature_idx] = imputer.transform(x[:, numeric_feature_idx])

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two categorical features (x columns 0 and 2: Location and
# Dominant Operational Format); all remaining columns are passed through as-is.
# Per the original author's note, this dataset yields 40 Location indicators
# and 14 Dominant Operational Format indicators.
one_hot_spec = ('encoder', OneHotEncoder(), [0, 2])
ct = ColumnTransformer(transformers=[one_hot_spec], remainder='passthrough')

# .toarray() converts the transformer output to a dense array
# (presumably it is a sparse matrix for this data -- .toarray() requires one).
x = ct.fit_transform(x).toarray()

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; seeded so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [7]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only, then apply that same fitted
# scaling to both splits so no test-set statistics leak into it.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [8]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree regressor, seeded for reproducibility, with min_samples_split=20.
dtr = DecisionTreeRegressor(min_samples_split=20, random_state=RANDOM_STATE)
# Kept as the cell's last expression so the fitted estimator's repr is displayed.
dtr.fit(x_train, y_train)

DecisionTreeRegressor(min_samples_split=20, random_state=0)

In [9]:
# Decision-tree predictions for the held-out rows, printed side by side with
# the true targets (left column: predicted, right column: actual).
dtr_y_pred = dtr.predict(x_test)
print(np.column_stack((dtr_y_pred, y_test)))

[[  5385.21052632   5233.        ]
 [ 10433.33333333   9509.        ]
 [ 13640.5         13242.        ]
 [  8238.83333333   7650.        ]
 [  8238.83333333   7757.        ]
 [  8238.83333333   7372.        ]
 [  6077.77777778   6151.        ]
 [ 13640.5         14500.        ]
 [ 19880.38888889  18089.        ]
 [ 10433.33333333   7718.        ]
 [  6077.77777778   5436.        ]
 [  5385.21052632   5150.        ]
 [ 10433.33333333  11200.        ]
 [  6077.77777778   6426.        ]
 [ 10433.33333333  10039.        ]
 [ 15726.54545455  16488.        ]
 [  6077.77777778   5983.        ]
 [  7542.75         8204.        ]
 [  8238.83333333   7859.        ]
 [ 11793.85714286  12731.        ]
 [  8238.83333333   7664.        ]
 [ 19880.38888889  24793.        ]
 [  4732.85714286   4916.        ]
 [ 10433.33333333  10700.        ]
 [  8238.83333333   7882.        ]
 [ 13640.5         13593.        ]
 [101811.66666667 120947.        ]
 [ 19880.38888889  21000.        ]
 [  6077.77777778   

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor; only the seed is set, every other hyperparameter is
# left at the library default.
rfr = RandomForestRegressor(random_state=RANDOM_STATE)
# Kept as the cell's last expression so the fitted estimator's repr is displayed.
rfr.fit(x_train, y_train)

RandomForestRegressor(random_state=0)

In [11]:
# Random-forest predictions for the held-out rows, printed side by side with
# the true targets (left column: predicted, right column: actual).
rfr_y_pred = rfr.predict(x_test)
print(np.column_stack((rfr_y_pred, y_test)))

[[  5243.73   5233.  ]
 [  9273.88   9509.  ]
 [ 13688.82  13242.  ]
 [  7336.17   7650.  ]
 [  7844.79   7757.  ]
 [  8098.6    7372.  ]
 [  5950.1    6151.  ]
 [ 14406.48  14500.  ]
 [ 19272.76  18089.  ]
 [ 10728.76   7718.  ]
 [  6314.36   5436.  ]
 [  5300.42   5150.  ]
 [ 11017.99  11200.  ]
 [  6342.86   6426.  ]
 [ 10145.81  10039.  ]
 [ 16821.74  16488.  ]
 [  6355.76   5983.  ]
 [  8475.71   8204.  ]
 [  7924.97   7859.  ]
 [ 12311.85  12731.  ]
 [  7764.09   7664.  ]
 [ 19408.2   24793.  ]
 [  4974.     4916.  ]
 [ 10289.06  10700.  ]
 [  7624.77   7882.  ]
 [ 12481.23  13593.  ]
 [110570.49 120947.  ]
 [ 19187.44  21000.  ]
 [  6388.38   6298.  ]
 [ 28829.43  30035.  ]
 [  5902.82   5942.  ]
 [ 14319.88  14979.  ]
 [ 10247.5   10986.  ]
 [  6580.9    6773.  ]
 [  7028.66   7203.  ]
 [  7978.8    8500.  ]
 [  8655.35   8891.  ]
 [101011.71 136971.  ]
 [ 43639.72  48550.  ]
 [  9025.77   9242.  ]
 [  9946.11   9690.  ]
 [ 80277.7   89381.  ]
 [ 18894.52  11217.  ]
 [  6581.37

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression fitted on the same scaled training data as the tree models;
# constructed with the library defaults.
lr = LinearRegression()
# Kept as the cell's last expression so the fitted estimator's repr is displayed.
lr.fit(x_train, y_train)

In [None]:
# Linear-regression predictions for the held-out rows, printed side by side
# with the true targets (left column: predicted, right column: actual).
lr_y_pred = lr.predict(x_test)
print(np.column_stack((lr_y_pred, y_test)))

Metrics

In [12]:
from sklearn.metrics import r2_score, mean_squared_error

In [13]:
# Decision Tree Regression Metrics: R^2 and mean squared error on the test set

dtr_r_squared = r2_score(y_test, dtr_y_pred)
dtr_mean_squared_error = mean_squared_error(y_test, dtr_y_pred)

# mean_squared_error returns the *average* squared residual, so the value is
# labelled as MSE rather than as a sum of squared residuals.
print(f'r^2: {dtr_r_squared}; mean squared error: {dtr_mean_squared_error}')

r^2: 0.9437406236715923; sum of squared residuals: 38973886.77177612


In [14]:
# Random Forest Regression Metrics: R^2 and mean squared error on the test set

rfr_r_squared = r2_score(y_test, rfr_y_pred)
rfr_mean_squared_error = mean_squared_error(y_test, rfr_y_pred)

# mean_squared_error returns the *average* squared residual, so the value is
# labelled as MSE rather than as a sum of squared residuals.
print(f'r^2: {rfr_r_squared}; mean squared error: {rfr_mean_squared_error}')

r^2: 0.9518389785358993; sum of squared residuals: 33363722.10025999


In [None]:
# Linear Regression Metrics: R^2 and mean squared error on the test set

lr_r_squared = r2_score(y_test, lr_y_pred)
lr_mean_squared_error = mean_squared_error(y_test, lr_y_pred)

# mean_squared_error returns the *average* squared residual, so the value is
# labelled as MSE rather than as a sum of squared residuals.
print(f'r^2: {lr_r_squared}; mean squared error: {lr_mean_squared_error}')