In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
submission_df = pd.read_csv("sample_submission.csv")
train_df.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


# Clean data

In [3]:
def clean_data(df) -> pd.DataFrame:
    df = df.copy()
    df.drop('row_id',axis=1, inplace=True) # Not relevant

    df['date'] = pd.to_datetime(df['date'])

    # Encode countries
    le = LabelEncoder().fit(df['country'])
    df['country'] = le.transform(df['country'])
    
    # Encode stores
    le = LabelEncoder().fit(df['store'])
    df['store'] = le.transform(df['store'])
    
    # Encode product
    le = LabelEncoder().fit(df['product'])
    df['product'] = le.transform(df['product'])

    df['date'] = df['date'].values.astype(float)

    return df

# Split and train model
Current nr 1 on Kaggle has score of 4.7, Goal: get under 5

In [4]:
train = clean_data(train_df)

train_X_df = train.drop('num_sold', axis=1)
train_y_df = train['num_sold']
X_train, X_test, y_train, y_test = train_test_split(train_X_df, train_y_df, test_size=0.2)

### Accuracy is calculated using SMAPE

In [5]:
def accuracy(real,preds):
    """Function to calculate accuracy using SMAPE"""
    return 1/len(real) * np.sum(2 * np.abs(real - preds) / (np.abs(real) + np.abs(preds)) * 100)

### Test linear regression as baseline

In [8]:
%%time
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

clf = Pipeline([
        ('scaler', RobustScaler()),
        ('knn', LinearRegression())
    ])
clf.fit(X_train, y_train)

print(f"Linear regression score on training data: {accuracy(clf.predict(X_train), y_train)}")
print(f"Linear regression score on testing data: {accuracy(clf.predict(X_test), y_test)}")

Linear regression score on training data: 35.78019325932542
Linear regression score on testing data: 35.16904783647501
CPU times: user 47.3 ms, sys: 13.2 ms, total: 60.5 ms
Wall time: 49.3 ms


### Test of Nearest neighbors regression
Did a lot better than Linear regression with a score of 12.35 but long way to go...

In [10]:
%%time
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

best_test = float('inf')
best_train = float('inf')
best_k = 0
for neighbor in range(1, 101):
    clf = Pipeline([
        ('scaler', RobustScaler()),
        ('knn', KNeighborsRegressor(n_neighbors=neighbor))
    ])
    clf.fit(X_train, y_train)
    test_score = accuracy(clf.predict(X_test), y_test)
    train_score = accuracy(clf.predict(X_train), y_train)
    if test_score < best_test:
        best_test = test_score
        best_k = neighbor
        best_train = train_score
        
print(f"Accuracy at best k on training data: {train_score}")
print(f"Best accuracy at k = {best_k} with testing accuracy of {test_score}")

Accuracy at best k on training data: 12.162531429984686
Best accuracy at k = 2 with testing accuracy of 12.347593150801833
CPU times: user 57 s, sys: 2.54 s, total: 59.6 s
Wall time: 1min 3s


### Test of Random Forest Regressor
Got a testing accuracy of 8.9 without any hyperparameter tuning, getting alot closer

In [11]:
%%time
from sklearn.ensemble import RandomForestRegressor

test_estimators = False
best_acc = float('inf')
best_estimator = 0
train_acc_at_best_k = float('inf')
for estimator in range(10, 200, 10):
    model = RandomForestRegressor(n_estimators=estimator, n_jobs=-1)
    model.fit(X_train, y_train)
    curr_accuracy = accuracy(model.predict(X_test), y_test)
    if curr_accuracy < best_acc:
        best_acc = curr_accuracy
        best_estimator = estimator
        train_acc_at_best_k = accuracy(model.predict(X_train), y_train)

print(f"Accuracy at best estimator on training data: {train_acc_at_best_k}")
print(f"Best accuracy at estimator = {best_estimator} with testing accuracy of {best_acc}")

Accuracy at best estimator on training data: 3.3250639093185232
Best accuracy at estimator = 180 with testing accuracy of 9.141126743853231
CPU times: user 1min 9s, sys: 3.19 s, total: 1min 12s
Wall time: 27.2 s


### Test of sklearns GradientBoostingRegressor
Best testing score yet of 6.9, more hyperparameter tuning available to perfect

In [14]:
%%time
from sklearn.ensemble import GradientBoostingRegressor

clf = Pipeline([
        ('scaler', RobustScaler()),
        ('gbr', GradientBoostingRegressor(n_estimators=10000))
    ])
clf.fit(X_train, y_train)

print(f"Gradient boost regression score on training data: {accuracy(clf.predict(X_train), y_train)}")
print(f"Gradient boost regression score on testing data: {accuracy(clf.predict(X_test), y_test)}")

Gradient boost regression score on training data: 5.624766747555697
Gradient boost regression score on testing data: 6.915119405181871
CPU times: user 1min 18s, sys: 1.23 s, total: 1min 20s
Wall time: 1min 32s


### Test of XGBoost

In [14]:
%%time
from xgboost import XGBRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.metrics import make_scorer

params = {
            'regressor__learning_rate': np.linspace(0.03, 0.5, 10), 
            'regressor__max_depth': np.linspace(5,7,3, dtype=int),
            'regressor__min_child_weight': [1, 3],
            'regressor__subsample': [0.8, 1],
            'regressor__colsample_bytree': [0.7, 1],
            'regressor__n_estimators': [500, 1000]
}


pipe = clf = Pipeline(steps=[
        ('scaler', RobustScaler()),
        ('regressor', XGBRegressor())
    ])
    
search = HalvingGridSearchCV(pipe, params, n_jobs=-1, scoring=make_scorer(accuracy, greater_is_better=False))
search.fit(X_train, y_train)

print(search.best_params_)

print(f"Gradient boost regression score on training data: {accuracy(search.predict(X_train), y_train)}")
print(f"Gradient boost regression score on testing data: {accuracy(search.predict(X_test), y_test)}")