# Data Preparation

Due to the limited data (2142 instances), we decided to use 10-fold cross-validation: each algorithm (Linear Regression, KNN, Random Forest and XGBoost) is trained 10 times, each time with a 90:10 train/test split. This way, we can make predictions for all cities in Switzerland with unbiased models (a city is never part of the training set of the model that is used to predict it).

In [None]:
import pandas as pd

# Load the full data set (features and target column) from the CSV next to the notebook.
data = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# Feature matrix: every column except the target.
X = data.drop(columns=["Anzahl Filialen Migros"])
# Target vector: the "Anzahl Filialen Migros" column.
y = data.loc[:, "Anzahl Filialen Migros"]

In [None]:
from sklearn.model_selection import KFold

# 10 shuffled folds with a fixed seed, so every run produces the same splits.
kf = KFold(
  n_splits=10,
  shuffle=True,
  random_state=0,
)

In [None]:
# Sanity check: print how many rows end up in the train and test part of every fold.
for train_index, test_index in kf.split(data):
  n_train, n_test = len(train_index), len(test_index)
  print("TRAIN:", n_train, "TEST:", n_test)

TRAIN: 1927 TEST: 215
TRAIN: 1927 TEST: 215
TRAIN: 1928 TEST: 214
TRAIN: 1928 TEST: 214
TRAIN: 1928 TEST: 214
TRAIN: 1928 TEST: 214
TRAIN: 1928 TEST: 214
TRAIN: 1928 TEST: 214
TRAIN: 1928 TEST: 214
TRAIN: 1928 TEST: 214


In [None]:
# Materialise the (train indices, test indices) pairs once so every model below iterates over the same folds.
splits = [fold for fold in kf.split(X)]

# Model Implementation

In [None]:
from sklearn.model_selection import GridSearchCV
import time

In [None]:
# Start the result table with an independent copy of the target column only.
predictions = data.loc[:, ["Anzahl Filialen Migros"]].copy()

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Out-of-fold predictions of a plain linear regression.
# For every fold the model is fitted on the train rows and predicts the held-out
# test rows; the per-fold results are collected in a list and concatenated once
# (DataFrame.append is no longer available in pandas 2.x and re-copies the
# accumulated frame on every call).
linreg_fold_preds = []

for train, test in splits:
  X_train = X.iloc[train]
  y_train = y.iloc[train]
  X_test = X.iloc[test]
  linregModel = LinearRegression().fit(X_train, y_train)
  raw_pred = pd.Series(linregModel.predict(X_test), index=test, name="linregModel")
  # Clip BEFORE rounding: a store count cannot be negative, and rounding a small
  # negative value first yields -0.0, which clip(lower=0) does not change.
  # Clipping on a new Series (no inplace on a column view) also avoids the
  # chained in-place update, which is not guaranteed to write back to the frame.
  predLinregModel_temp = raw_pred.clip(lower=0).round(0).to_frame()
  linreg_fold_preds.append(predLinregModel_temp)

# Rows are indexed by their position in `data` (the fold's test indices).
predLinregModel = pd.concat(linreg_fold_preds)

In [None]:
# Bring the out-of-fold predictions back into the original row order, then display them.
predLinregModel = predLinregModel.sort_index()
predLinregModel

Unnamed: 0,linregModel
0,1.0
1,5.0
2,1.0
3,2.0
4,1.0
...,...
2137,1.0
2138,0.0
2139,-0.0
2140,1.0


## KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Hyper-parameter grid searched for the KNN regressor.
knnparams = dict(
  n_neighbors=[2, 3, 5, 10, 20, 50, 75, 100, 200, 500],
  weights=['distance', 'uniform'],
  algorithm=['auto'],
)

In [None]:
# Empty one-column frame for the KNN out-of-fold predictions.
predKnnModel = pd.DataFrame(
  data=None,
  columns=["knnModel"],
)

In [None]:
# Out-of-fold predictions of a grid-searched KNN regressor.
# Per-fold results are gathered in a list and concatenated once at the end, so
# re-running this cell rebuilds predKnnModel from scratch instead of appending
# duplicate rows (and it does not rely on the removed DataFrame.append).
knn_fold_preds = []

for train, test in splits:
  start_time = time.time()
  X_train = X.iloc[train]
  y_train = y.iloc[train]
  X_test = X.iloc[test]
  knnModel = KNeighborsRegressor()
  # One fit is sufficient: GridSearchCV (refit=True by default) already refits
  # the best parameter combination on the whole training fold. Fitting a second
  # time only repeats the complete search and doubles the runtime.
  knnModelGS = GridSearchCV(knnModel, param_grid=knnparams).fit(X_train, y_train)
  predKnnModel_temp = pd.DataFrame(index=test, columns=["knnModel"], data=knnModelGS.predict(X_test).round(0))
  knn_fold_preds.append(predKnnModel_temp)
  # Elapsed seconds for this fold and the winning hyper-parameters.
  print(time.time() - start_time, knnModelGS.best_params_)

# Rows are indexed by their position in `data` (the fold's test indices).
predKnnModel = pd.concat(knn_fold_preds)

10.799211263656616 {'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform'}
12.614235401153564 {'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform'}
9.049874544143677 {'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'distance'}
5.610302686691284 {'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform'}
5.700590372085571 {'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'distance'}
5.785508632659912 {'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform'}
5.547427654266357 {'algorithm': 'auto', 'n_neighbors': 10, 'weights': 'distance'}
6.047724962234497 {'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform'}
5.80324649810791 {'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform'}
6.018804550170898 {'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform'}


In [None]:
# Bring the out-of-fold predictions back into the original row order, then display them.
predKnnModel = predKnnModel.sort_index()
predKnnModel

Unnamed: 0,knnModel
0,1.0
1,5.0
2,1.0
3,1.0
4,1.0
...,...
2137,0.0
2138,0.0
2139,0.0
2140,0.0


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Hyper-parameter grid searched for the Random Forest regressor.
# max_features=1.0 (use all features at every split) is the documented
# equivalent of the former 'auto' setting for regressors; 'auto' itself was
# deprecated in scikit-learn 1.1 and is rejected by current releases.
rfparams = {'max_depth': [3, 5, 10, 20, 50, 100],
 'max_features': [1.0],
 'min_samples_leaf': [1, 2, 4, 8]}

In [None]:
# Empty one-column frame for the Random Forest out-of-fold predictions.
predRfModel = pd.DataFrame(
  data=None,
  columns=["rfModel"],
)

In [None]:
# Out-of-fold predictions of a grid-searched Random Forest regressor.
# Per-fold results are gathered in a list and concatenated once at the end, so
# re-running this cell rebuilds predRfModel from scratch instead of appending
# duplicate rows (and it does not rely on the removed DataFrame.append).
rf_fold_preds = []

for train, test in splits:
  start_time = time.time()
  X_train = X.iloc[train]
  y_train = y.iloc[train]
  X_test = X.iloc[test]
  # Fixed seed (same value as the KFold seed) so the forests, and therefore the
  # selected hyper-parameters and predictions, are reproducible between runs.
  rfModel = RandomForestRegressor(random_state=0)
  # One fit is sufficient: GridSearchCV (refit=True by default) already refits
  # the best parameter combination on the whole training fold. Fitting a second
  # time only repeats the complete search and doubles the runtime.
  rfModelGS = GridSearchCV(rfModel, param_grid=rfparams).fit(X_train, y_train)
  predRfModel_temp = pd.DataFrame(index=test, columns=["rfModel"], data=rfModelGS.predict(X_test).round(0))
  rf_fold_preds.append(predRfModel_temp)
  # Elapsed seconds for this fold and the winning hyper-parameters.
  print(time.time() - start_time, rfModelGS.best_params_)

# Rows are indexed by their position in `data` (the fold's test indices).
predRfModel = pd.concat(rf_fold_preds)

339.0304181575775 {'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 1}
342.4888114929199 {'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 1}
332.06808280944824 {'max_depth': 3, 'max_features': 'auto', 'min_samples_leaf': 2}
332.9603178501129 {'max_depth': 3, 'max_features': 'auto', 'min_samples_leaf': 1}
330.5932309627533 {'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 1}
323.11641001701355 {'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 1}
335.99875020980835 {'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 1}
336.6722731590271 {'max_depth': 3, 'max_features': 'auto', 'min_samples_leaf': 1}
335.9699671268463 {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1}
332.9346339702606 {'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 1}


In [None]:
# Bring the out-of-fold predictions back into the original row order, then display them.
predRfModel = predRfModel.sort_index()
predRfModel

Unnamed: 0,rfModel
0,1.0
1,5.0
2,1.0
3,1.0
4,1.0
...,...
2137,1.0
2138,1.0
2139,0.0
2140,0.0


## XGBRegressor

In [None]:
from xgboost import XGBRegressor

In [None]:
# Hyper-parameter grid searched for the XGBoost regressor.
# objective, subsample and colsample_bytree are held fixed (single-value lists);
# n_estimators, max_depth and eta are varied.
xgbrparams = dict(
  objective=['reg:squarederror'],
  n_estimators=[10, 20, 50, 100, 200],
  max_depth=[3, 5, 10, 20, 50],
  eta=[0.005, 0.01, 0.02, 0.05],
  subsample=[0.6],
  colsample_bytree=[0.7],
)

In [None]:
# Empty one-column frame for the XGBoost out-of-fold predictions.
predXgbrModel = pd.DataFrame(
  data=None,
  columns=["xgbrModel"],
)

In [None]:
# Out-of-fold predictions of a grid-searched XGBoost regressor.
# Per-fold results are gathered in a list and concatenated once at the end, so
# re-running this cell rebuilds predXgbrModel from scratch instead of appending
# duplicate rows (and it does not rely on the removed DataFrame.append).
xgbr_fold_preds = []

for train, test in splits:
  start_time = time.time()
  X_train = X.iloc[train]
  y_train = y.iloc[train]
  X_test = X.iloc[test]
  xgbrModel = XGBRegressor()
  # One fit is sufficient: GridSearchCV (refit=True by default) already refits
  # the best parameter combination on the whole training fold. Fitting a second
  # time only repeats the complete search and doubles the runtime.
  xgbrModelGS = GridSearchCV(xgbrModel, param_grid=xgbrparams).fit(X_train, y_train)
  predXgbrModel_temp = pd.DataFrame(index=test, columns=["xgbrModel"], data=xgbrModelGS.predict(X_test).round(0))
  xgbr_fold_preds.append(predXgbrModel_temp)
  # Elapsed seconds for this fold and the winning hyper-parameters.
  print(time.time() - start_time, xgbrModelGS.best_params_)

# Rows are indexed by their position in `data` (the fold's test indices).
predXgbrModel = pd.concat(xgbr_fold_preds)

529.4494888782501 {'colsample_bytree': 0.7, 'eta': 0.005, 'max_depth': 3, 'n_estimators': 50, 'objective': 'reg:squarederror', 'subsample': 0.6}
510.94258284568787 {'colsample_bytree': 0.7, 'eta': 0.005, 'max_depth': 5, 'n_estimators': 20, 'objective': 'reg:squarederror', 'subsample': 0.6}
517.4552698135376 {'colsample_bytree': 0.7, 'eta': 0.005, 'max_depth': 3, 'n_estimators': 50, 'objective': 'reg:squarederror', 'subsample': 0.6}
510.6790828704834 {'colsample_bytree': 0.7, 'eta': 0.005, 'max_depth': 3, 'n_estimators': 50, 'objective': 'reg:squarederror', 'subsample': 0.6}
511.8976106643677 {'colsample_bytree': 0.7, 'eta': 0.005, 'max_depth': 5, 'n_estimators': 20, 'objective': 'reg:squarederror', 'subsample': 0.6}
511.81496596336365 {'colsample_bytree': 0.7, 'eta': 0.005, 'max_depth': 10, 'n_estimators': 200, 'objective': 'reg:squarederror', 'subsample': 0.6}
513.7959661483765 {'colsample_bytree': 0.7, 'eta': 0.005, 'max_depth': 3, 'n_estimators': 50, 'objective': 'reg:squarederror',

In [None]:
# Bring the out-of-fold predictions back into the original row order, then display them.
predXgbrModel = predXgbrModel.sort_index()
predXgbrModel

Unnamed: 0,xgbrModel
0,1.0
1,5.0
2,1.0
3,2.0
4,1.0
...,...
2137,1.0
2138,0.0
2139,0.0
2140,0.0


## Combining all results

In [None]:
# Combine the out-of-fold predictions of all models into one table.
# The table is rebuilt from the target column of `data` every time, so the cell
# can be re-run safely: merging onto an already merged `predictions` would create
# suffixed duplicate columns (linregModel_x / linregModel_y, ...) and the
# ensemble line below would then fail with a KeyError.
model_columns = ['linregModel', 'knnModel', 'rfModel', 'xgbrModel']
predictions = data[["Anzahl Filialen Migros"]].copy()
for model_predictions in (predLinregModel, predKnnModel, predRfModel, predXgbrModel):
  # Left join on the row index keeps exactly one row per row of `data`.
  predictions = predictions.merge(model_predictions, how='left', left_index=True, right_index=True)
# Ensemble = row-wise mean of the four model predictions, rounded to a whole number.
# astype(float) guards against object-dtype prediction columns (possible when they
# were accumulated starting from an empty frame) - NOTE(review): confirm dtypes.
predictions["ensemble"] = predictions[model_columns].astype(float).mean(axis=1).round(0)

In [None]:
# Display the combined table: the actual target next to each model's prediction and the ensemble.
predictions

Unnamed: 0,Anzahl Filialen Migros,linregModel,knnModel,rfModel,xgbrModel,ensemble
0,1.0,1.0,1.0,1.0,1.0,1.0
1,4.0,5.0,5.0,5.0,5.0,5.0
2,0.0,1.0,1.0,1.0,1.0,1.0
3,1.0,2.0,1.0,1.0,2.0,2.0
4,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...
2137,2.0,1.0,0.0,1.0,1.0,1.0
2138,1.0,0.0,0.0,1.0,0.0,0.0
2139,0.0,-0.0,0.0,0.0,0.0,0.0
2140,1.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# Persist the combined predictions; the row index is not written to the file.
predictions.to_csv(path_or_buf="predictions.csv", index=False)