In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
import math
import gc
import seaborn as sns
#import lightgbm as lgbm 
#import xgboost as xgb

from tqdm import tqdm
from sklearn.linear_model import LinearRegression
#from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler,PolynomialFeatures
from sklearn.preprocessing import StandardScaler

#import catboost as cat

In [None]:
# Load the modelling table from the project-relative CSV file.
df = pd.read_csv(filepath_or_buffer='./data/df_stat.csv')

In [None]:
from sklearn.metrics import mean_squared_error
def metric(y,x):
    """Return the root-mean-squared error between ``y`` and ``x``, rounded to 3 decimals.

    The two arguments are forwarded to ``mean_squared_error`` as ``(x, y)``;
    the unweighted squared error is symmetric, so their order does not change
    the result.
    """
    mse = mean_squared_error(x, y)
    rmse = np.sqrt(mse)
    return round(rmse, 3)

In [None]:
# One-hot encode the categorical `location` column. drop_first=True omits the
# indicator of the first category, so that level is represented implicitly by
# all remaining indicators being zero.
df = pd.get_dummies(data=df, columns=['location'], drop_first=True)

In [None]:
# Peek at the first row to check the columns produced by the encoding step.
df.head(1)

In [None]:
# Score accumulators: every model cell below appends one train and one test
# RMSE, in the order the models are fitted.
RMSE_train, RMSE_test = [], []

In [None]:
# Target vector, and the feature matrix without the target and the ID column.
y = df['target']
X = df.drop(columns=['target', 'ID'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Remember the feature names now: they are re-attached after scaling, when
# X_train / X_test are wrapped in DataFrames again.
columns = X_train.columns
columns

In [None]:
# Step 1: standardise. The statistics are fitted on the training split only
# and then applied to both splits, so nothing from the test rows leaks in.
scalerStand = StandardScaler().fit(X_train)
X_train = scalerStand.transform(X_train)
X_test = scalerStand.transform(X_test)

# Step 2: min-max scale the standardised values, again fitted on train only.
# NOTE(review): min-max scaling a non-constant column should give the same
# result with or without the preceding standardisation, so step 1 looks
# redundant - confirm before removing.
scalerMinMax = MinMaxScaler().fit(X_train)
X_train = scalerMinMax.transform(X_train)
X_test = scalerMinMax.transform(X_test)

In [None]:
# Wrap the scaled training values in a DataFrame again, re-attach the feature
# names saved in `columns`, and drop the 'Unnamed: 0' column (presumably a
# row index written into the CSV - confirm).
X_train = pd.DataFrame(X_train, columns=columns).drop(columns='Unnamed: 0')

In [None]:
# Wrap the scaled test values in a DataFrame again, re-attach the feature
# names saved in `columns`, and drop the 'Unnamed: 0' column (presumably a
# row index written into the CSV - confirm).
X_test = pd.DataFrame(X_test, columns=columns).drop(columns='Unnamed: 0')

In [None]:
# Baseline 1: ordinary least squares on the scaled features.
lr = LinearRegression().fit(X_train, y_train)

# Predictions for both splits.
y_pred_train_lr = lr.predict(X_train)
y_pred_lr = lr.predict(X_test)

# Score each split once, then report the value and record it for the summary.
rmse_train_lr = metric(y_train, y_pred_train_lr)
rmse_test_lr = metric(y_test, y_pred_lr)
print("Train RMSE:", rmse_train_lr)
print("Test RMSE:", rmse_test_lr)
RMSE_train.append(rmse_train_lr)
RMSE_test.append(rmse_test_lr)

In [None]:
# Display the scaled training features. X_train was already converted to a
# DataFrame in an earlier cell, so this only re-wraps it for display.
pd.DataFrame(X_train)

## KNN

In [None]:
# Baseline 2: k-nearest-neighbours regression with the library defaults.
knn = KNeighborsRegressor().fit(X_train, y_train)

# Predictions for both splits.
y_pred_train_knn = knn.predict(X_train)
y_pred_knn = knn.predict(X_test)

# Score each split once, then report the value and record it for the summary.
rmse_train_knn = metric(y_train, y_pred_train_knn)
rmse_test_knn = metric(y_test, y_pred_knn)
print("Train RMSE:", rmse_train_knn)
print("Test RMSE:", rmse_test_knn)
RMSE_train.append(rmse_train_knn)
RMSE_test.append(rmse_test_knn)


## Random Forest Regressor

In [None]:
# Baseline 3: random forest with the library defaults. The seed is fixed (same
# value as the train/test split) so the fitted forest - and with it the scores,
# feature importances and residual plot derived from it below - is reproducible
# across runs instead of changing on every execution.
rfr = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predictions for both splits.
y_pred_train_rfr = rfr.predict(X_train)
y_pred_rfr = rfr.predict(X_test)

# Score each split once, then report the value and record it for the summary.
rmse_train_rfr = metric(y_train, y_pred_train_rfr)
rmse_test_rfr = metric(y_test, y_pred_rfr)
print("Train RMSE:", rmse_train_rfr)
print("Test RMSE:", rmse_test_rfr)
RMSE_train.append(rmse_train_rfr)
RMSE_test.append(rmse_test_rfr)

In [None]:
# Impurity-based importances, one value per feature the forest was fitted on.
importances = rfr.feature_importances_

# Label them with the columns of the frame actually passed to fit(), rather
# than slicing the pre-scaling column list by position: that only lines up if
# the dropped 'Unnamed: 0' column happened to be first. A length mismatch now
# raises instead of silently mislabelling the bars.
forest_importances = pd.Series(importances, index=X_train.columns)

# One-column frame (feature name -> importance) used for the plot and ranking.
fi = forest_importances.to_frame()
fi.plot.bar(figsize=(15,10))


In [None]:

# Rank the features from most to least important and keep the ordered names.
fi_sort = (
    fi.reset_index()
      .set_axis(['feature', 'importance'], axis=1)
      .sort_values('importance', ascending=False)
)
feat = fi_sort['feature'].tolist()
feat

In [None]:
# Recover a single location label per test row from the one-hot indicators.
# Work on an explicit copy: assigning into a slice of X_test triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
location = X_test[['location_B','location_C','location_D','location_E']].copy()

# The encoding used drop_first=True, so the omitted level (treated here as
# location A) shows up as "all four indicators are zero". Build its indicator
# explicitly: 1.0 for those rows, 0.0 otherwise. The mask is evaluated on the
# four existing columns before the new column is added.
location['location_A'] = (location == 0).all(axis=1).astype(float)

# Collapse the indicators to the name of the column holding the row maximum.
location = location.idxmax(axis=1)


In [None]:
# Residuals of the random-forest test predictions (prediction minus actual).
# y_test is a Series, so the result carries y_test's row labels.
residuen = y_pred_rfr - y_test

# seaborn aligns pandas inputs on their index before plotting. `location` was
# derived from the re-built X_test, whose rows are in the same order as y_test
# but may carry a fresh 0..n-1 index. Re-label it positionally with y_test's
# index so that every point gets the hue of its own row instead of a
# mismatched or missing one.
location_by_row = pd.Series(location.to_numpy(), index=y_test.index, name='location')
sns.scatterplot(x=y_test, y=residuen, hue=location_by_row).set(ylabel='Residuals')

In [None]:
# Sanity check: number of residuals (one per test row is expected).
residuen.shape

In [None]:
#X, y = make_regression(n_features=48)
#rfr_b = RandomForestRegressor(random_state=42)
#rfr_b.fit(X, y)
#y_pred_rfr_b = rfr_b.predict(X_test)
#y_pred_train_rfr_b = rfr_b.predict(X_train)
#print("Train RMSE:", metric(y_train, y_pred_train_rfr_b))
#print("Test RMSE:", metric(y_test, y_pred_rfr_b))
#RMSE_train.append(metric(y_train, y_pred_train_rfr_b))
#RMSE_test.append(metric(y_test, y_pred_rfr_b))

## Support Vector Machine

In [None]:
# Baseline 4: support vector regression with the library defaults.
svr = SVR().fit(X_train, y_train)

# Predictions for both splits.
y_pred_train_svr = svr.predict(X_train)
y_pred_svr = svr.predict(X_test)

# Score each split once, then report the value and record it for the summary.
rmse_train_svr = metric(y_train, y_pred_train_svr)
rmse_test_svr = metric(y_test, y_pred_svr)
print("Train RMSE:", rmse_train_svr)
print("Test RMSE:", rmse_test_svr)
RMSE_train.append(rmse_train_svr)
RMSE_test.append(rmse_test_svr)

In [None]:
# Build the comparison table from each model's stored predictions rather than
# from the RMSE_train / RMSE_test lists. Those lists grow on every execution
# of a model cell, so re-running one (or running cells out of order) either
# breaks the DataFrame construction with a length mismatch or pairs scores
# with the wrong model name. Computing the scores here ties every value to
# its label and makes this cell safe to re-run.
model_predictions = {
    'Linear Regression': (y_pred_train_lr, y_pred_lr),
    'KNN': (y_pred_train_knn, y_pred_knn),
    'Random Forest': (y_pred_train_rfr, y_pred_rfr),
    'SVR': (y_pred_train_svr, y_pred_svr),
}
RMSE = {
    'RMSE_train': [metric(y_train, train_pred) for train_pred, _ in model_predictions.values()],
    'RMSE_test': [metric(y_test, test_pred) for _, test_pred in model_predictions.values()],
    'Model': list(model_predictions),
}

# One row per model, train and test RMSE side by side.
baseline = pd.DataFrame(RMSE).set_index('Model')
baseline.plot.bar()
