In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
import math
import gc
import seaborn as sns
import lightgbm as lgbm 
import xgboost as xgb

from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler,PolynomialFeatures
from sklearn.preprocessing import StandardScaler


In [None]:
# Read the input CSV into the working DataFrame `df` used by the cells below.
df = pd.read_csv(filepath_or_buffer='./data/df_stat.csv')

def metric(y, x, decimals=3):
    """Return the rounded root-mean-squared error (RMSE) between ``y`` and ``x``.

    The squared error is symmetric, so the argument order does not affect the
    result. ``mean_squared_error`` is the scikit-learn function imported in the
    notebook's import cell.

    Parameters
    ----------
    y, x : array-like
        The two sequences of values to compare (e.g. targets and predictions).
    decimals : int, default 3
        Number of decimal places the RMSE is rounded to.

    Returns
    -------
    float
        ``sqrt(mean_squared_error(x, y))`` rounded to ``decimals`` places.
    """
    return round(np.sqrt(mean_squared_error(x, y)), decimals)

# One-hot encode the categorical `location` column; drop_first=True leaves out
# the indicator of the first category.
df = pd.get_dummies(data=df, columns=['location'], drop_first=True)

# Score accumulators: each model cell appends its train / test RMSE here.
RMSE_train, RMSE_test = [], []

In [None]:
# Target vector, and the feature matrix without the target and the ID column.
y = df['target']
X = df.drop(columns=['target', 'ID'])

# Hold out 20 % of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Remember the feature labels: scaling returns bare arrays without them.
columns = X_train.columns

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so no test statistics leak into training.
scalerStand = StandardScaler()

# The scaler returns plain arrays, so wrap them back into DataFrames.
# Each frame is rebuilt in a single expression that
#   * keeps the ORIGINAL row index, so rows still line up with y_train / y_test
#     in any index-aligned pandas operation, and
#   * only rebinds X_train / X_test once everything succeeded, so a failure
#     cannot leave a half-rebuilt frame behind.
# 'Unnamed: 0' is dropped after scaling; presumably it is the row-number column
# written when the CSV was saved with its index -- TODO confirm.
X_train = pd.DataFrame(
    scalerStand.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
).drop(columns='Unnamed: 0')

X_test = pd.DataFrame(
    scalerStand.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
).drop(columns='Unnamed: 0')

## Random Forest Regressor

In [None]:
# Fit a random forest on the scaled training features.
# random_state pins the forest's internal randomness so the RMSE values and the
# feature importances are reproducible across runs (same seed as the split).
rfr = RandomForestRegressor(random_state=42)
rfr = rfr.fit(X_train, y_train)

# Predictions on both splits
y_pred_rfr = rfr.predict(X_test)
y_pred_train_rfr = rfr.predict(X_train)

# Score each split once and reuse the value for printing and for the summary lists.
rmse_train_rfr = metric(y_train, y_pred_train_rfr)
rmse_test_rfr = metric(y_test, y_pred_rfr)
print("Train RMSE:", rmse_train_rfr)
print("Test RMSE:", rmse_test_rfr)

# NOTE: re-running this cell appends a second entry; re-create the lists first.
RMSE_train.append(rmse_train_rfr)
RMSE_test.append(rmse_test_rfr)

In [None]:
# Per-feature importances reported by the fitted forest.
importances = rfr.feature_importances_

# Label each importance with the columns of the frame the model was fitted on,
# instead of assuming that the dropped 'Unnamed: 0' column was the first one.
# A length mismatch raises here rather than silently mislabelling bars.
forest_importances = pd.Series(importances, index=X_train.columns)
fi = pd.DataFrame(forest_importances)

# The frame has a single unnamed column, so its legend entry carries no information.
ax = fi.plot.bar(figsize=(15,10), legend=False)
ax.set(xlabel='Feature', ylabel='Importance', title='Random forest feature importances')
plt.show()

## K-Nearest Neighbors (KNN) Regressor

In [None]:
# k-nearest-neighbours regressor with the library's default settings,
# trained on the scaled training features.
knn = KNeighborsRegressor().fit(X_train, y_train)

# Predictions for the training rows and for the held-out rows
y_pred_train_knn = knn.predict(X_train)
y_pred_knn = knn.predict(X_test)

# RMSE per split: report it, then record it for the model comparison
knn_train_rmse = metric(y_train, y_pred_train_knn)
knn_test_rmse = metric(y_test, y_pred_knn)
print("Train RMSE:", knn_train_rmse)
print("Test RMSE:", knn_test_rmse)
RMSE_train.append(knn_train_rmse)
RMSE_test.append(knn_test_rmse)

## Compare train and test RMSE across models

In [None]:
# Model labels, in the order in which the model cells appended their scores.
model_names = ['Random Forest','KNN']

# Re-running a model cell appends a duplicate score; fail with an actionable
# message instead of the generic length error raised by the DataFrame constructor.
if not (len(RMSE_train) == len(RMSE_test) == len(model_names)):
    raise ValueError(
        f"Expected {len(model_names)} train/test scores, found "
        f"{len(RMSE_train)} train and {len(RMSE_test)} test; "
        "re-create RMSE_train / RMSE_test and run each model cell exactly once."
    )

# One row per model, one column per split.
RMSE = {'RMSE_train': RMSE_train, 'RMSE_test': RMSE_test, 'Model': model_names}
baseline = pd.DataFrame(RMSE)
baseline = baseline.set_index('Model')

# Grouped bars: train vs. test RMSE for every model.
ax = baseline.plot.bar(rot=0)
ax.set(ylabel='RMSE', title='Train vs. test RMSE by model')
plt.show()