# @author : ODD team
## @ randomforest version
  
#### You can use this notebook as a template for training.

In [None]:
##Load module
from tqdm import tqdm
import os
import pandas as pd
import numpy as np
import time
import torch

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn import ensemble
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.metrics import make_scorer
# Wrap mean_squared_error as a scikit-learn scorer. greater_is_better=False
# marks it as a loss, so the resulting scorer sign-flips the MSE.
# NOTE(review): `mse` is not referenced anywhere else in the visible notebook.
mse = make_scorer(mean_squared_error,greater_is_better=False)

In [None]:
# Create the local output directory up front; exist_ok=True makes re-running
# this cell harmless when the directory is already there.
os.makedirs('./weights', exist_ok=True)

##Dataset

In [None]:
# Load the train / validation / test splits (read in that order).
df_train, df_valid, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/iou1_train.csv',
        '../datasets/iou1_valid.csv',
        '../datasets/iou1_test.csv',
    )
)

In [None]:
df_train['class'].unique()

In [None]:
df_train.info()

In [None]:
# Remove the columns that are not fed to the model as numeric predictors.
# The same list is applied to all three splits so they keep identical columns.
non_predictors = ['filename', 'class', 'weather', 'angle', 'depth_min', 'depth_x', 'depth_y']
train, valid, test = (
    frame.drop(non_predictors, axis=1)
    for frame in (df_train, df_valid, df_test)
)

In [None]:
# Data split: predictors are every column except 'zloc'; the target is 'zloc'.
# Selecting with a boolean column mask keeps each target a one-column DataFrame.
is_target = train.columns == 'zloc'
X_train, y_train = train.loc[:, ~is_target], train.loc[:, is_target]

is_target = valid.columns == 'zloc'
X_valid, y_valid = valid.loc[:, ~is_target], valid.loc[:, is_target]

is_target = test.columns == 'zloc'
X_test, y_test = test.loc[:, ~is_target], test.loc[:, is_target]

In [None]:
# Fit the standardizer on the training predictors only, so the validation and
# test splits are transformed with training statistics.
scalerX = StandardScaler().fit(X_train)

In [None]:
# Standardize all three splits with the scaler fitted on the training data.
X_train_scale, X_valid_scale, X_test_scale = (
    scalerX.transform(split) for split in (X_train, X_valid, X_test)
)

In [None]:
# onehot encoding
# The class levels are taken from the training split and reused for every
# split, so train / valid / test always end up with the same dummy columns in
# the same order, even when a split is missing a class. A class unseen in
# training becomes an all-zero row instead of a new column.
class_levels = sorted(df_train['class'].dropna().unique())
class_dtype = pd.CategoricalDtype(categories=class_levels)


def _with_class_dummies(scaled, classes):
  """Join scaled features with one-hot class columns for one split.

  Args:
    scaled: 2-D array of standardized features, one row per sample.
    classes: Series of class labels for the same rows.

  Returns:
    DataFrame with string-named feature columns ('feat_0', ...) followed by
    one 'class_<level>' column per training class level.
  """
  # Use string column names throughout: scikit-learn refuses frames whose
  # column labels mix ints and strings. Reusing the label index keeps the
  # concat row-aligned regardless of how the source frame is indexed.
  features = pd.DataFrame(
      scaled,
      index=classes.index,
      columns=['feat_{}'.format(i) for i in range(scaled.shape[1])],
  )
  dummies = pd.get_dummies(classes.astype(class_dtype), prefix='class')
  return pd.concat([features, dummies], axis=1)


X_train = _with_class_dummies(X_train_scale, df_train['class'])
X_valid = _with_class_dummies(X_valid_scale, df_valid['class'])
X_test = _with_class_dummies(X_test_scale, df_test['class'])

In [None]:
X_train

##RandomForest

In [None]:
# Baseline random forest with fixed hyperparameters.
# warm_start is omitted: it only matters when the same estimator object is
# refitted, which never happens for this freshly built model.
model = RandomForestRegressor(random_state=1,
                              n_estimators=500,
                              max_depth=20,
                              min_samples_split=2,
                              max_features=2)
# y_train is a one-column DataFrame; pass a 1-D target so scikit-learn does
# not have to reshape it (and warn about a column-vector y).
model.fit(X_train, y_train.values.ravel())

In [None]:
def accuracy(pred, actual):
  """Build a per-sample error table for a set of predictions.

  Both inputs are flattened to 1-D and paired by position, so a target that
  carries its own (non-default) index cannot be misaligned into NaN rows.

  Args:
    pred: predicted values (array-like).
    actual: ground-truth values (array-like, Series or one-column DataFrame)
      with the same number of elements as `pred`.

  Returns:
    DataFrame with the columns
      'pred'       - the predictions,
      'actual'     - the ground truth,
      'distance_g' - bucket id, actual // 10,
      'accuracy'   - relative absolute error |pred - actual| / actual
                     (despite its name; callers report 1 - mean of it).

  Raises:
    ValueError: if `pred` and `actual` differ in length.
  """
  pred = np.asarray(pred).ravel()
  actual = np.asarray(actual).ravel()

  df = pd.DataFrame({'pred': pred, 'actual': actual})
  df['distance_g'] = df['actual'] // 10
  df['accuracy'] = (df['pred'] - df['actual']).abs() / df['actual']

  return df

In [None]:
def evaluate_metric(model):
  """Print RMSE / MAE / accuracy for the train, valid and test splits.

  Reads the module-level X_train / y_train, X_valid / y_valid and
  X_test / y_test. The printed 'Accuracy' is 1 minus the mean of the
  'accuracy' column of the table returned by accuracy().

  Args:
    model: fitted estimator exposing predict().

  Returns:
    Tuple (train_acc, test_acc): the accuracy() tables of the train and
    test splits.
  """
  splits = (
      ('train', X_train, y_train),
      ('valid', X_valid, y_valid),
      ('test', X_test, y_test),
  )

  # Run every prediction before computing any metric.
  predictions = {name: model.predict(features) for name, features, _ in splits}

  summary = {}
  for name, _, target in splits:
    rmse = mean_squared_error(target, predictions[name]) ** 0.5
    mae = mean_absolute_error(target, predictions[name])
    table = accuracy(predictions[name], target)
    summary[name] = (rmse, mae, table, np.mean(table['accuracy']))

  report_lines = (
      ('train', "Train - RMSE: {:6.5} / MAE: {:6.5} / Accuracy: {:6.5}"),
      ('valid', "Valid - RMSE: {:6.5} / MAE: {:6.5} / Accuracy: {:6.5}"),
      ('test', "Test  - RMSE: {:6.5} / MAE: {:6.5} / Accuracy: {:6.5}"),
  )
  for name, template in report_lines:
    rmse, mae, _, mean_error = summary[name]
    print(template.format(rmse, mae, 1-mean_error))

  return summary['train'][2], summary['test'][2]

In [None]:
import warnings
# Silence every warning category from here on.
# NOTE(review): this blanket filter hides all library warnings as well -
# consider narrowing it to the specific category that is noisy.
warnings.filterwarnings(action='ignore')

# Evaluate the baseline model; keep the per-sample tables of the train and
# test splits for the per-distance breakdown.
train_acc, test_acc = evaluate_metric(model)

In [None]:
train_acc.head()

In [None]:
test_acc.head()

In [None]:
# Mean of the 'accuracy' (relative error) column per distance_g bucket.
test_err_rate = test_acc.groupby(['distance_g'])[['accuracy']].mean()

In [None]:
test_err_rate

In [None]:
test_acc[['accuracy', 'distance_g']].groupby(['distance_g']).count()

In [None]:
# Report accuracy per distance bucket. The interval is derived from the bucket
# label itself (distance_g == actual // 10) instead of the row position, so a
# bucket without samples can neither shift the printed intervals nor cause a
# failed label lookup.
for bucket, mean_error in test_err_rate['accuracy'].items():
  lower = int(bucket) * 10
  print('Distance Interval: {0} ~ {1} - Accuracy: {2}'.format(lower, lower + 10, 1-mean_error))

###RandomForest Tuning

In [None]:
# Hyperparameter grid for the random-forest search. The insertion order of the
# mapping defines the order of `params`, which the search loop unpacks as
# (n_estimators, max_depth, min_samples_split, max_features).
search_space = {
    'n_estimators': [50, 100, 200, 300, 500],
    'max_depth': range(5, 25, 5),
    'min_samples_split': [2, 5, 10, 20],
    'max_features': range(2, 12, 2),
}
n_estimators, max_depth, min_samples_split, max_features = search_space.values()
params = list(search_space.values())

In [None]:
import warnings
warnings.filterwarnings(action='ignore')
import itertools
from tqdm import tqdm 


# Exhaustive search over the hyperparameter grid, scored on the validation
# split. Tracks the best combination by RMSE and by MAE separately.
best_rmse = np.inf
best_mae = np.inf
rmse_best_param = pd.DataFrame()
mae_best_param = pd.DataFrame()

result_columns = ['train_mae','valid_mae','train_rmse', 'valid_rmse','n_estimators', 'max_depth', 'min_samples', 'max_features']
# Rows are collected in a list and turned into a DataFrame once: concatenating
# inside the loop copies the whole table on every iteration and leaves every
# row with index 0.
result_rows = []

# scikit-learn expects a 1-D target; y_train is a one-column DataFrame.
y_train_flat = y_train.values.ravel()

param_list = list(itertools.product(*params))
try:
  for n, d, s, f in tqdm(param_list):
    # A new estimator is built for every combination, so warm_start would
    # have no effect and is left out.
    model = RandomForestRegressor(random_state=1,
                                  n_estimators=n,
                                  max_depth=d,
                                  min_samples_split=s,
                                  max_features=f)
    model.fit(X_train, y_train_flat)

    train_pred = model.predict(X_train)
    valid_pred = model.predict(X_valid)

    train_mse = mean_squared_error(y_train, train_pred)
    train_rmse = train_mse ** 0.5
    train_mae = mean_absolute_error(y_train, train_pred)
    val_mse = mean_squared_error(y_valid, valid_pred)
    val_rmse = val_mse ** 0.5
    val_mae = mean_absolute_error(y_valid, valid_pred)

    row = {'train_mae':train_mae, 'valid_mae': val_mae, 'train_rmse' : train_rmse, 'valid_rmse': val_rmse,
           'n_estimators':n, 'max_depth':d, 'min_samples':s, 'max_features':f}
    result_rows.append(row)

    if best_rmse > val_rmse:
      rmse_best_param = pd.DataFrame([row])
      best_rmse = val_rmse
    if best_mae > val_mae:
      mae_best_param = pd.DataFrame([row])
      best_mae = val_mae
    print("Train_RMSE : {0}, Valid_RMSE: {1}, Train_MAE: {2}, Valid_MAE: {3}, n_estimators:{4}, max_depth: {5}, min_samples: {6}, max_features: {7}".format(train_rmse, val_rmse, train_mae, val_mae, n, d, s, f))
finally:
  # Build the table even if the (long-running) search is interrupted, so the
  # combinations evaluated so far are not lost.
  res_df = pd.DataFrame(result_rows, columns=result_columns)

In [None]:
print(rmse_best_param)

In [None]:
print(mae_best_param)

In [None]:
# Persist the tuning table. to_csv does not create missing directories, and
# only './weights' is created earlier in the notebook, so make sure the
# target directory exists before writing.
tuning_csv_path = "odd/weights/RandomForest/RandomForset_Tuning22.csv"
os.makedirs(os.path.dirname(tuning_csv_path), exist_ok=True)
res_df.to_csv(tuning_csv_path, index=False)

In [None]:
import matplotlib.pyplot as plt

# Predicted vs. true target on the test split, with the identity line as a
# reference. The original cell used an undefined `pred` and non-existent
# `res_df.pred_scale` / `res_df.actual_scale` columns; both are computed here.
# NOTE(review): `model` is whichever estimator was fitted last. After the
# tuning loop that is the final grid combination, not necessarily the best
# one - refit with the chosen parameters first if that is what should be shown.
actual = y_test.values.ravel()
pred = model.predict(X_test)

plt.figure(figsize=(10,10))
# True values go on the x axis and predictions on the y axis, matching the labels.
plt.scatter(actual, pred, c='crimson')

p1 = max(pred.max(), actual.max())
p2 = min(pred.min(), actual.min())
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()

In [None]:
# Reload the train / validation / test splits (read in that order).
# NOTE(review): these paths are relative to 'datasets/', whereas the first
# load in this notebook uses '../datasets/' - confirm which one is intended.
df_train, df_valid, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/iou1_train.csv',
        'datasets/iou1_valid.csv',
        'datasets/iou1_test.csv',
    )
)

In [None]:
# Remove the columns that are not fed to the model as numeric predictors.
# The same list is applied to all three splits so they keep identical columns.
non_predictors = ['filename', 'class', 'weather', 'angle', 'depth_min', 'depth_max', 'depth_median']
train, valid, test = (
    frame.drop(non_predictors, axis=1)
    for frame in (df_train, df_valid, df_test)
)

In [None]:
# Predictors are every column except 'zloc'; the target is 'zloc'.
# Selecting with a boolean column mask keeps each target a one-column DataFrame.
is_target = train.columns == 'zloc'
X_train, y_train = train.loc[:, ~is_target], train.loc[:, is_target]

is_target = valid.columns == 'zloc'
X_valid, y_valid = valid.loc[:, ~is_target], valid.loc[:, is_target]

is_target = test.columns == 'zloc'
X_test, y_test = test.loc[:, ~is_target], test.loc[:, is_target]

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the standardizer on the training predictors only.
scalerX = StandardScaler().fit(X_train)
# Scaling of the target is left disabled.
#scalery = StandardScaler().fit(y_train)

In [None]:
# Standardize all three splits with the scaler fitted on the training data.
X_train_scale, X_valid_scale, X_test_scale = (
    scalerX.transform(split) for split in (X_train, X_valid, X_test)
)

In [None]:
# onehot encoding
# The class levels are taken from the training split and reused for every
# split, so train / valid / test always end up with the same dummy columns in
# the same order, even when a split is missing a class. A class unseen in
# training becomes an all-zero row instead of a new column.
class_levels = sorted(df_train['class'].dropna().unique())
class_dtype = pd.CategoricalDtype(categories=class_levels)


def _with_class_dummies(scaled, classes):
  """Join scaled features with one-hot class columns for one split.

  Args:
    scaled: 2-D array of standardized features, one row per sample.
    classes: Series of class labels for the same rows.

  Returns:
    DataFrame with string-named feature columns ('feat_0', ...) followed by
    one 'class_<level>' column per training class level.
  """
  # Use string column names throughout: scikit-learn refuses frames whose
  # column labels mix ints and strings. Reusing the label index keeps the
  # concat row-aligned regardless of how the source frame is indexed.
  features = pd.DataFrame(
      scaled,
      index=classes.index,
      columns=['feat_{}'.format(i) for i in range(scaled.shape[1])],
  )
  dummies = pd.get_dummies(classes.astype(class_dtype), prefix='class')
  return pd.concat([features, dummies], axis=1)


X_train = _with_class_dummies(X_train_scale, df_train['class'])
X_valid = _with_class_dummies(X_valid_scale, df_valid['class'])
X_test = _with_class_dummies(X_test_scale, df_test['class'])

In [None]:
from sklearn.preprocessing import StandardScaler
# NOTE(review): at this point X_train already holds the standardized features
# plus the one-hot class columns, so this fits a second scaler over both.
# Confirm that re-scaling (including the dummy columns) is intended.
scalerX = StandardScaler().fit(X_train)
# Scaling of the target is left disabled.
#scalery = StandardScaler().fit(y_train)

In [None]:
# Replace each split with its transformed array, using the scaler fitted above.
X_train, X_valid, X_test = (
    scalerX.transform(split) for split in (X_train, X_valid, X_test)
)

In [None]:
X_train