In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
import seaborn as sns
from sklearn import metrics, model_selection, preprocessing, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import shuffle

## Load the dataset and split it into train and test sets

In [None]:
# Load the CSV, shuffle the rows reproducibly, and hold out the trailing 20% for testing.
data_dir = './'
file_name = 'dataset_10bins.csv'

SPLIT_SEED = 42        # fixed seed so the shuffle (and therefore the split) is reproducible
TRAIN_FRACTION = 0.8   # leading share of the shuffled rows used for training

data = pd.read_csv(os.path.join(data_dir, file_name))
data = data.fillna(0)  # missing values are replaced by 0

# NOTE(review): `features` / `labels` are taken BEFORE the shuffle and are only used for
# the row count in this cell. `features` also lists 'Rt_AC_3000', which the model inputs
# (`temp_data`) omit -- confirm which column set is intended.
features = data[['Rt_AC_250', 'Rt_AC_500', 'Rt_AC_1000', 'Rt_AC_2000', 'Rt_AC_3000', 'Rt_AC_4000', 'Rt_BC_250', 'Rt_BC_500', 'Rt_BC_1000', 'Rt_BC_2000', 'Rt_BC_4000']]
labels   = data[['Rt_SDS']]
data     = shuffle(data, random_state=SPLIT_SEED)
data_len = len(labels)

# Model inputs and target, taken from the shuffled frame.
model_input_columns = ['Rt_AC_250', 'Rt_AC_500', 'Rt_AC_1000', 'Rt_AC_2000', 'Rt_AC_4000',
                       'Rt_BC_250', 'Rt_BC_500', 'Rt_BC_1000', 'Rt_BC_2000', 'Rt_BC_4000']
temp_data  = data[model_input_columns]
temp_label = data[['Rt_SDS']]

# Positional split: compute the boundary once and reuse it for inputs and labels.
split_at = int(data_len * TRAIN_FRACTION)

X_train       = temp_data[:split_at]
Y_train_label = temp_label[:split_at]
X_test        = temp_data[split_at:]
Y_test_label  = temp_label[split_at:]

## Encode labels and apply standardization

In [None]:
# Report split sizes, encode the labels as integers, and standardize the features.
print("Dimension of Train set: ", X_train.shape)
print("Dimension of Test set : ", X_test.shape,"\n")

# LabelEncoder expects 1-D input, so flatten the single-column label frames first.
y_train_raw = Y_train_label.to_numpy().ravel()
y_test_raw  = Y_test_label.to_numpy().ravel()

# Fit ONE encoder on the union of train and test labels. Re-fitting a second time on
# the test labels would give the two splits different integer codes, so predictions
# (expressed in the training codes) would be decoded and scored against the wrong
# classes. Fitting on the union also covers labels that occur only in the test split,
# so no manual patching of `encoder.classes_` is needed.
encoder = preprocessing.LabelEncoder()
encoder.fit(np.concatenate([y_train_raw, y_test_raw]))

Y_train = encoder.transform(y_train_raw)
Y_test  = encoder.transform(y_test_raw)

# Number of numeric (including boolean) features in the training set, via the public
# dtype selector rather than a private DataFrame method.
num_cols = X_train.select_dtypes(include=["number", "bool"]).columns
print("Number of numeric features: ", num_cols.size)

names_of_predictors = list(X_train.columns.values)

# Standardize features: statistics are learned on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Train and find a best-effort RF model using a parameter grid search

In [None]:
# Search forest size x split criterion (6 candidates) with 10-fold cross-validation.
params_grid = [{'n_estimators': [400, 500, 600], 'criterion': ['gini', 'entropy']}]

# `n_jobs` controls parallelism, not model behaviour, so it is set on the estimator
# instead of being listed as a grid dimension. A fixed `random_state` makes the fitted
# forests (and hence the selected configuration) reproducible across runs.
rf = RandomForestClassifier(n_jobs=4, random_state=42)
rf_model = GridSearchCV(rf, params_grid, cv=10)
rf_model.fit(X_train_scaled, Y_train)

## Best RF model's configuration

In [None]:
# Report the grid-search winner, predict on the held-out split, and build plain-int
# label lists for the metrics that follow.
print('Best score for training data: ', rf_model.best_score_,"\n") 
print('Best N estimators           : ',rf_model.best_estimator_.n_estimators,"\n") 
print('Best Criterion              : ',rf_model.best_estimator_.criterion,"\n")

final_model  = rf_model.best_estimator_
Y_pred       = final_model.predict(X_test_scaled)
Y_pred_label = list(encoder.inverse_transform(Y_pred))  # integer codes -> original label values

# `np.float` was removed in NumPy 1.24; the builtin `float` selects the same dtype.
Y_test_label_2   = np.array(Y_test_label, dtype=float)
abs_Y_test_label = Y_test_label_2                      # 2-D float array, kept before flattening
Y_test_label_2   = list(Y_test_label_2.reshape(-1))
Y_test_label_3   = list(map(int, Y_test_label_2))

# Floor each predicted label in [0, 100) to the start of its decade (e.g. 37 -> 30);
# values outside that range are left untouched.
# NOTE(review): only the predictions are floored; the test labels are compared as-is,
# presumably because they are already multiples of 10 -- confirm against the dataset.
BIN_WIDTH = 10
Y_pred_label_2 = [
    (value // BIN_WIDTH) * BIN_WIDTH if 0 <= value < 100 else value
    for value in map(int, Y_pred_label)
]

## Result - accuracy with confusion matrix

In [None]:
# Confusion matrix and per-class report on the original (decoded) label values.
# `metrics` comes from the notebook's import cell rather than a mid-notebook import.
conf_matrix = confusion_matrix(Y_test_label_3, Y_pred_label_2)
print(conf_matrix)
print(classification_report(Y_test_label_3, Y_pred_label_2))

# Mean accuracy of the selected forest on the encoded labels of each split.
train_score = final_model.score(X_train_scaled, Y_train)
test_score  = final_model.score(X_test_scaled, Y_test)
print(f"Training set score for RF: {train_score:f}")
print(f"Testing  set score for RF: {test_score:f}")

# Cross-check: accuracy is the confusion-matrix diagonal (correct predictions)
# divided by the number of test samples; it should match the library value.
n_samples = len(Y_pred_label_2)
n_correct = np.trace(conf_matrix)
print(n_samples)
print(n_correct)
print('[Manual]  calculate accuracy: ', n_correct/n_samples)
print('[Library] calculate accuracy: ', metrics.accuracy_score(Y_test_label_3,Y_pred_label_2))