In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer


%matplotlib inline

In [None]:
# Read the cleaned dataset; the path is resolved relative to the current
# working directory.
df = pd.read_csv(filepath_or_buffer="../data/clean_data.csv")

# Preview the first rows of the loaded frame.
df.head(n=5)

In [None]:
# Two-stage split: hold out 20% of the rows for test, then take 25% of the
# remaining rows for validation. Both splits are seeded for reproducibility.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# reset_index returns a NEW frame; without assigning the result back the call
# has no effect and the shuffled original index is kept.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Pull the target out of each frame: pop() returns the "amount" column and
# removes it from the frame in one step, so it cannot leak into the features.
y_train = df_train.pop("amount").values
y_val = df_val.pop("amount").values
y_test = df_test.pop("amount").values

# Feature columns fed to the vectorizer, grouped as categorical / numerical.
cat = [  "provider", "countrycode", "market"]
num = ["usercurrencyamount", "coins"]

In [None]:
# Dense output so the matrices can be passed straight to the estimators below.
dv = DictVectorizer(sparse=False)

# One dict per row, restricted to the selected feature columns.
train_records = df_train[cat + num].to_dict(orient="records")
val_records = df_val[cat + num].to_dict(orient="records")

# Fit the vectorizer on the training rows only; validation rows are
# transformed with the already-fitted vectorizer.
X_train = dv.fit_transform(train_records)
X_val = dv.transform(val_records)

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: plain least-squares regression evaluated on the validation split.
lr = LinearRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_val)
# mean_squared_error is an error measure (lower is better), not an accuracy,
# so it is labelled as MSE.
print(f"MSE: {mean_squared_error(y_val,y_pred)}")
print(f"R^2: {r2_score(y_val,y_pred)}")

In [None]:
from sklearn.linear_model import Ridge

alphas = [0, 0.01, 0.1, 1, 10]
solvers = ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]

# Fit one Ridge model per (solver, alpha) pair and record its validation
# metrics as a (solver, alpha, mse, r2) tuple.
scores = []
for solver_name in solvers:
    for alpha_value in alphas:
        lr = Ridge(alpha=alpha_value, solver=solver_name)
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_val)
        val_mse = mean_squared_error(y_val, y_pred)
        val_r2 = r2_score(y_val, y_pred)
        scores.append((solver_name, alpha_value, val_mse, val_r2))

df_scores = pd.DataFrame(scores, columns=["solver", "alpha", "score", "r2"])

# "score" holds the validation MSE, so the ascending sort lists the
# lowest-error combinations first.
df_scores.sort_values(by=["score", "r2"], ascending=True).head(10)

In [None]:
# Plot validation MSE against alpha, restricted to the svd, cholesky and
# lsqr solvers.
for s in solvers:
    if s not in ["svd", "cholesky", "lsqr"]:
        continue
    solver_rows = df_scores[df_scores["solver"] == s]
    plt.plot(solver_rows["alpha"], solver_rows["score"], label=s)

plt.legend()

# Best combo for Ridge seems solver="svd", alpha=1.0

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Hyper-parameter grid; max_depth=None lets the tree grow without a depth limit.
max_depths = [1, 2, 3, 4, 5, 6, 7, None]
min_sample_leafs = [1,2, 5, 10,20]

# Fit one tree per (min_samples_leaf, max_depth) pair and record validation
# MSE and R^2. The inner loop iterates over `max_depths` (instead of a second
# hard-coded copy of the list) so that editing the grid above takes effect.
scores = []
for s in min_sample_leafs:
    for d in max_depths:
        dt = DecisionTreeRegressor(max_depth=d, random_state=1, min_samples_leaf=s)
        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_val)
        scores.append((d,s,mean_squared_error(y_val,y_pred),r2_score(y_val,y_pred)))
df_scores = pd.DataFrame(scores,columns=["max_depth","min_samples_leaf","score","r2"])

# "score" is the validation MSE: ascending sort shows the best combinations first.
df_scores.sort_values(by=["score","r2"], ascending=True).head(10)

# Best combo for DecisionTreeRegressor seems max_depth=7, min_samples_leaf=1
# Leaving max_depth=None will cause overfitting

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Sweep the forest size from 10 to 200 trees in steps of 10, scoring each
# forest on the validation split.
scores = []
for n_trees in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n_trees, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    val_mse = mean_squared_error(y_val, y_pred)
    val_r2 = r2_score(y_val, y_pred)
    scores.append((n_trees, val_mse, val_r2))

df_scores = pd.DataFrame(scores, columns=['n_estimators', 'score', 'r2'])

# Lowest validation MSE first.
df_scores.sort_values(by=["score", "r2"], ascending=True).head(10)

In [None]:
# Validation MSE as a function of the number of trees.
n_trees_axis = df_scores["n_estimators"]
val_mse_axis = df_scores["score"]
plt.plot(n_trees_axis, val_mse_axis)

# Best performance starts around n_estimators=110

In [None]:
# Joint grid over tree depth and forest size.
max_depths = [1, 2, 3, 4, 5, 6, 7, None]
n_estimators = list(range(100,180,10))

# Depth varies in the outer position and forest size in the inner one, so the
# rows are recorded in the same order as a nested depth/size loop.
grid = [(depth, n_trees) for depth in max_depths for n_trees in n_estimators]

scores = []
for depth, n_trees in grid:
    rf = RandomForestRegressor(n_estimators=n_trees, max_depth=depth, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    val_mse = mean_squared_error(y_val, y_pred)
    val_r2 = r2_score(y_val, y_pred)
    scores.append((n_trees, depth, val_mse, val_r2))

df_scores = pd.DataFrame(scores, columns=['n_estimators','max_depth', 'score', 'r2'])

# Lowest validation MSE first.
df_scores.sort_values(by=["score", "r2"], ascending=True).head(10)

In [None]:
# One curve per depth: validation MSE against the number of trees.
for depth in [5, 6, 7]:
    depth_rows = df_scores[df_scores["max_depth"] == depth]
    plt.plot(depth_rows["n_estimators"], depth_rows["score"], label=f"max_depth={depth}")

plt.legend()

# Best performance starts around n_estimators=110 with a max_depth=7

In [None]:
import xgboost as xgb

# Maps a run label (e.g. "eta=0.3") to the evaluation history of that run.
scores = {}

def parse_xgboost_output(s):
    """Parse XGBoost's textual evaluation log into a DataFrame.

    Each evaluation line is expected to hold three tab-separated fields:
    the boosting round in square brackets, then a ``name:value`` pair for the
    training set and one for the validation set.

    Blank lines and lines that do not have this shape (for example warnings
    mixed into the captured output) are skipped, so an empty or noisy log
    yields fewer rows instead of raising on tuple unpacking.

    Parameters
    ----------
    s : str
        Raw text of the evaluation log.

    Returns
    -------
    pandas.DataFrame
        Columns ``tree`` (int), ``train_score`` (float) and ``val_score``
        (float), one row per parsed evaluation line. Empty (but with the
        same columns) when no line could be parsed.

    Raises
    ------
    ValueError
        If a line has the expected shape but its round number or metric
        value is not numeric.
    """
    eval_data = []
    # splitlines() also copes with "\r\n" line endings.
    for line in s.strip().splitlines():
        fields = line.strip().split('\t')
        # Only "[round]<TAB>train...<TAB>val..." rows carry metrics.
        if len(fields) != 3 or not fields[0].startswith('['):
            continue
        num_iter, train_score, val_score = fields
        num_iter = int(num_iter.strip('[]'))
        # The value is whatever follows the last colon of "name:value".
        train_score = float(train_score.rsplit(':', 1)[1])
        val_score = float(val_score.rsplit(':', 1)[1])
        eval_data.append((num_iter, train_score, val_score))

    df_eval = pd.DataFrame(eval_data, columns=['tree', 'train_score', 'val_score'])
    return df_eval

# Column names produced by the fitted vectorizer, as a plain Python list.
features = dv.get_feature_names_out().tolist()

# Wrap the train / validation matrices for XGBoost, attaching feature names.
dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(data=X_val, label=y_val, feature_names=features)

# Datasets evaluated during training, together with their display names.
watchlist = [
    (dtrain, 'train'),
    (dval, 'val'),
]

In [None]:
%%capture output


xgb_params = {
    'eta': 1.0,
    'max_depth': 7,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}

# With `%%capture output`, IPython binds `output` only AFTER the cell body has
# finished. Reading `output.stdout` inside this cell therefore fails on a fresh
# kernel or returns the log of the previous captured cell, which would then be
# stored under the wrong key. The per-round metrics are taken from
# `evals_result` instead, which xgb.train fills in for every entry of `evals`.
evals_result = {}
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=200,
    evals=watchlist,
    evals_result=evals_result,
    verbose_eval=5,
)
key = f'eta={str(xgb_params["eta"])}'

# evals_result is keyed by the watchlist names ('train', 'val') and then by
# metric name; each entry has one value per boosting round.
metric = xgb_params['eval_metric']
train_history = evals_result['train'][metric]
val_history = evals_result['val'][metric]
scores[key] = pd.DataFrame({
    'tree': range(len(val_history)),
    'train_score': train_history,
    'val_score': val_history,
})

In [None]:
# Plot the validation curve of every recorded run. The loop variable is not
# called `df`, so the dataset DataFrame loaded at the top of the notebook is
# not overwritten by an evaluation table.
for k, df_eval in scores.items():
    plt.plot(df_eval.tree, df_eval.val_score, label=k)
plt.legend()

In [None]:
%%capture output


xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}

# With `%%capture output`, IPython binds `output` only AFTER the cell body has
# finished. Reading `output.stdout` inside this cell therefore fails on a fresh
# kernel or returns the log of the previous captured cell, which would then be
# stored under the wrong key. The per-round metrics are taken from
# `evals_result` instead, which xgb.train fills in for every entry of `evals`.
evals_result = {}
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=200,
    evals=watchlist,
    evals_result=evals_result,
    verbose_eval=5,
)
key = f'max_depth={str(xgb_params["max_depth"])}'

# evals_result is keyed by the watchlist names ('train', 'val') and then by
# metric name; each entry has one value per boosting round.
metric = xgb_params['eval_metric']
train_history = evals_result['train'][metric]
val_history = evals_result['val'][metric]
scores[key] = pd.DataFrame({
    'tree': range(len(val_history)),
    'train_score': train_history,
    'val_score': val_history,
})

In [None]:
# Plot the validation curve of every recorded run. The loop variable is not
# called `df`, so the dataset DataFrame loaded at the top of the notebook is
# not overwritten by an evaluation table.
for k, df_eval in scores.items():
    plt.plot(df_eval.tree, df_eval.val_score, label=k)
plt.legend()

In [None]:
%%capture output


xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 10,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}

# With `%%capture output`, IPython binds `output` only AFTER the cell body has
# finished. Reading `output.stdout` inside this cell therefore fails on a fresh
# kernel or returns the log of the previous captured cell, which would then be
# stored under the wrong key. The per-round metrics are taken from
# `evals_result` instead, which xgb.train fills in for every entry of `evals`.
evals_result = {}
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=200,
    evals=watchlist,
    evals_result=evals_result,
    verbose_eval=5,
)
key = f'min_child_weight={str(xgb_params["min_child_weight"])}'

# evals_result is keyed by the watchlist names ('train', 'val') and then by
# metric name; each entry has one value per boosting round.
metric = xgb_params['eval_metric']
train_history = evals_result['train'][metric]
val_history = evals_result['val'][metric]
scores[key] = pd.DataFrame({
    'tree': range(len(val_history)),
    'train_score': train_history,
    'val_score': val_history,
})

In [None]:
# Plot the validation curve of every recorded run. The loop variable is not
# called `df`, so the dataset DataFrame loaded at the top of the notebook is
# not overwritten by an evaluation table.
for k, df_eval in scores.items():
    plt.plot(df_eval.tree, df_eval.val_score, label=k)
plt.legend()

Best models:

- RandomForestRegressor with *n_estimators=110* and *max_depth=7*
- XGBoost with the following settings:
  
  ```python
  xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 10,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}
  ```

In [None]:
# reset_index returns a new frame; assign it back, otherwise the call is a no-op.
df_full_train = df_full_train.reset_index(drop=True)

# pop() returns the target column and removes it from the frame in one step.
y_full_train = df_full_train.pop("amount").values


# Refit the vectorizer on the combined train+validation rows; the test rows
# are only transformed with the refitted vectorizer.
X_full_train = dv.fit_transform(df_full_train[cat + num].to_dict(orient="records"))
X_test = dv.transform(df_test[cat + num].to_dict(orient="records"))

In [None]:
# Train the selected random-forest configuration on the full training data
# and evaluate it on the held-out test split.
rf = RandomForestRegressor(random_state=1, max_depth=7, n_estimators=110)
rf.fit(X_full_train, y_full_train)
y_pred = rf.predict(X_test)

# Displayed as a (MSE, R^2) tuple.
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
test_mse, test_r2

In [None]:
# Feature names come from the vectorizer, read here right before building
# the matrices.
features = dv.get_feature_names_out().tolist()
dfulltrain = xgb.DMatrix(data=X_full_train, label=y_full_train, feature_names=features)
dtest = xgb.DMatrix(data=X_test, label=y_test, feature_names=features)

# Same settings as listed under "Best models".
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 10,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}

# Train for 200 boosting rounds with no evaluation sets attached, then
# predict the test split.
model = xgb.train(xgb_params, dfulltrain, num_boost_round=200)
y_pred = model.predict(dtest)

# Displayed as a (MSE, R^2) tuple.
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
test_mse, test_r2

In [None]:
# Save random forest as pickle
import pickle

# The vectorizer is stored alongside the model in a single (dv, rf) tuple, so
# both can be restored from the same file.
artifacts = (dv, rf)
with open('../model/rf.bin', 'wb') as f_out:
    pickle.dump(artifacts, f_out)