In [1]:
import numpy as np
import pandas as pd

# Fixed integer seed constant.
# NOTE(review): presumably meant to be passed as `random_state` to stochastic steps;
# no use of it is visible in this part of the notebook - confirm it is consumed later.
RANDOM_STATE = 0

Data Cleaning

In [2]:
# Load the raw retailer table; the file is decoded as latin-1.
data = pd.read_csv('leading_retailers_2021.csv', encoding='latin-1')

# Revenue / geography-count columns: remove the commas from the text, then parse
# the result as numbers. Entries that still fail to parse become NaN (errors='coerce').
int_cols = [
    'FY2021_retail_revenue',
    'FY2021_parent_company/ group revenue',
    'Geographies_of_operation',
]
comma_free = data[int_cols].replace(',', '', regex=True)
data[int_cols] = comma_free.apply(pd.to_numeric, errors='coerce')

# Percentage columns: remove the '%' sign, parse as numbers (NaN on failure) and
# rescale from percent to a fraction by dividing by 100.
float_cols = [
    'FY2016- 2021_retail_revenueCAGR3',
    'FY2020-2021_retail_revenue_growth',
    'FY2021_Net_profit_margin',
]
percent_free = data[float_cols].replace('%', '', regex=True)
data[float_cols] = percent_free.apply(pd.to_numeric, errors='coerce') / 100.0

# Text columns: trim leading/trailing whitespace from every value.
str_cols = ['Location', 'Dominant_operational_format']
data[str_cols] = data[str_cols].apply(lambda column: column.str.strip())

In [3]:
# Feature matrix: columns 2 and 4-9 of `data`, picked by position and converted to a
# NumPy array. The selection depends on the CSV's column order; columns 0, 1 and 3 are
# left out of the features.
x = data.iloc[:, [2, 4, 5, 6, 7, 8, 9]].to_numpy()

# Target vector: column 3 of `data`.
# NOTE(review): this is not the Dominant_operational_format column - the format strings
# ('Hypermarket/ supercenter', 'Non-store', ...) appear in x[:, 2], which is data column 5.
# Column 3 is presumably FY2021_retail_revenue - confirm against data.columns.
y = data.iloc[:, 3].to_numpy()

In [26]:
# Number of distinct operational-format labels; dropna=False counts a missing value
# as its own category.
data['Dominant_operational_format'].nunique(dropna=False)

14

In [14]:
from sklearn.impute import SimpleImputer

# Positions of the columns of `x` that are imputed (positions 0 and 2 are left untouched).
numeric_idx = [1, 3, 4, 5, 6]
numeric_block = x[:, numeric_idx]

# Replace every NaN with the mean of its column, writing the result back into `x`.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
x[:, numeric_idx] = imputer.fit(numeric_block).transform(numeric_block)

In [22]:
# Display the feature array as this cell's output (NumPy abbreviates long arrays with `...`).
# NOTE(review): the saved output shows an object-dtype array that still contains the
# category strings, i.e. `x` before one-hot encoding - confirm it matches a fresh Run All.
x

array([['United States', 572754.0, 'Hypermarket/ supercenter', ...,
        0.033, 0.024, 0.024],
       ['United States', 469822.0, 'Non-store', ..., 0.204, 0.12, 0.071],
       ['United States', 195929.0, 'Cash & carry/ warehouse club', ...,
        0.105, 0.175, 0.026000000000000002],
       ...,
       ['Japan', 4889.0, 'Discount department store', ..., 0.055,
        0.044000000000000004, 0.04248947368421053],
       ['Israel', 4561.0, 'Discount store', ..., 0.046, -0.031,
        0.027000000000000003],
       ['United Kingdom', 4478.0, 'Other specialty', ..., 0.158, 0.196,
        0.095]], dtype=object)

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two categorical positions of `x` (0 and 2); every other column is
# passed through unchanged and placed after the encoded columns.
# sparse_threshold=0 makes the transformer always return a dense array. With the default
# threshold the result is sparse only when its density is low enough, so an unconditional
# `.toarray()` raises AttributeError whenever a dense ndarray comes back.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 2])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Encodes Location Column into 40 separate columns and the Dominant Operational Format into 14.
# The dense stack inherits object dtype from the passthrough columns, so cast to float to get
# a purely numeric matrix.
# NOTE(review): this rebinds `x`, so the cell is not idempotent - re-running it without
# re-running the cells that build `x` would encode the already-encoded matrix.
x = np.asarray(ct.fit_transform(x), dtype=float)

array([0.024, 0.071, 0.026000000000000002, 0.04248947368421053, 0.109],
      dtype=object)