# Random Forest Without Hyperparameter Tuning

In [16]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
from sklearn import metrics 
import csv
import pickle as pk

column_map_file_path = "../parameters/column_map.txt"
categorical_column_map_file_path = "../parameters/categorical_column_map.txt"


def train_test_split(item):
    # training=pd.read_csv('../data/test_training_data/'+item+'/final_training_data.csv',sep=':')
    # y_train=training.filter(regex=item)
    # X_train=training.drop(y_train, axis=1)
    # test=pd.read_csv('../data/test_training_data/'+item+'/final_test_data.csv',sep=':')
    # y_test=test.filter(regex=item)
    # X_test=test.drop(y_test, axis=1)
    y_train=pd.read_csv('../data/test_training_data/'+item+'/y_train.csv',sep=':')
    X_train=pd.read_csv('../data/test_training_data/'+item+'/x_train.csv',sep=':')
    y_test=pd.read_csv('../data/test_training_data/'+item+'/y_test.csv',sep=':')
    X_test=pd.read_csv('../data/test_training_data/'+item+'/x_test.csv',sep=':')
    return X_train,X_test, y_train,y_test


def print_save_metrics(y_test,y_pred,item):
    r2_score=metrics.r2_score(y_test,y_pred)
    MAE=metrics.mean_absolute_error(y_test, y_pred)
    MSE=metrics.mean_squared_error(y_test, y_pred)
    RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    with open('../data/random_forest/RandomForest_metrics_'+item+'.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["r2_score","Mean Absolute Error (MAE)",'Mean Squared Error (MSE)','Root Mean Squared Error (RMSE)'])
        writer.writerow([r2_score,MAE,MSE,RMSE])
    print("r2_score:"+item,r2_score)
    print('Mean Absolute Error (MAE):'+item, MAE)
    print('Mean Squared Error (MSE):'+item, MSE)
    print('Root Mean Squared Error (RMSE):'+item, RMSE)

def init_column_maps():
    column_maps = []
    with open(column_map_file_path) as f:
        parameters = f.readlines()
        for p in parameters:
            p = p.replace("\n", "")
            p = tuple(p.split(":"))
            column_maps.append(p)
    with open(categorical_column_map_file_path) as cf:
        parameters = cf.readlines()
        for p in parameters:
            p = p.replace("\n", "")
            p = p.split(":")
            p[0] = p[0].replace(" ", "_")
            p = tuple(p)
            column_maps.append(p)
    return column_maps

def feature_importance(important_list):
    lst_column_map=init_column_maps()
    lst_final=[]
    # important_list=sorted(list(zip(regressor.feature_importances_,X_test.columns)),key =lambda x: x[0] ,reverse=True)[:10]
    for import_item in important_list:
        import_item_rem=import_item[1].replace('_imputed','')
        for column_item in lst_column_map:
            if import_item_rem == column_item[0]:
                import_renamed=column_item[2]
                l = list(import_item)
                l[1] = import_renamed
                lst_final.append(tuple(l))
                break
    return lst_final
def random_forest():
    output_variable= ['School_Code','OP1','OP2','OP6','OP3','OP4','OP5','OP7','OP8','OP9','OP10','OP11','OP12','OP13','OP14']
    output_variable.remove('School_Code')
    for item in output_variable:
        X_train,X_test, y_train,y_test=train_test_split(item)
        regressor = RandomForestRegressor()
        regressor.fit(X_train,y_train)
        y_pred=regressor.predict(X_test)
        with open('../data/random_forest/'+item+'.pkl', 'wb') as pickle_file:
            pk.dump(y_pred, pickle_file)
        
        important_list=sorted(list(zip(regressor.feature_importances_,X_test.columns)),key =lambda x: x[0] ,reverse=True)[:10]
        import_lst=feature_importance(important_list)
        file = open('../data/random_forest/feature_importance_'+item+'.csv', 'w+', newline ='')
        with file:    
            write = csv.writer(file)
            write.writerows(import_lst)
        print_save_metrics(y_test,y_pred,item)
random_forest() 

  regressor.fit(X_train,y_train)
r2_score:OP1 0.6382597378823351
Mean Absolute Error (MAE):OP1 9.63016326530612
Mean Squared Error (MSE):OP1 191.41322199999996
Root Mean Squared Error (RMSE):OP1 13.835216731226149
  regressor.fit(X_train,y_train)
r2_score:OP2 0.682614887004803
Mean Absolute Error (MAE):OP2 7.291624999999997
Mean Squared Error (MSE):OP2 86.40520831249987
Root Mean Squared Error (RMSE):OP2 9.295440189280972
  regressor.fit(X_train,y_train)
r2_score:OP6 0.516449434018891
Mean Absolute Error (MAE):OP6 12.29190625
Mean Squared Error (MSE):OP6 229.05767171875004
Root Mean Squared Error (RMSE):OP6 15.134651357687433
  regressor.fit(X_train,y_train)
r2_score:OP3 0.8217375985419227
Mean Absolute Error (MAE):OP3 18.349062500000002
Mean Squared Error (MSE):OP3 557.8046406250003
Root Mean Squared Error (RMSE):OP3 23.617888149133915
  regressor.fit(X_train,y_train)
r2_score:OP4 0.7792398505525356
Mean Absolute Error (MAE):OP4 22.787647058823534
Mean Squared Error (MSE):OP4 883.1874

# Random Forest Hyperparameter Tuning

In [79]:
import numpy as np
import pandas as pd
from sklearn import metrics 
import csv
import pickle as pk
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
def print_save_metrics_tuned_manually(y_test,y_pred,item):
    r2_score=metrics.r2_score(y_test,y_pred)
    MAE=metrics.mean_absolute_error(y_test, y_pred)
    MSE=metrics.mean_squared_error(y_test, y_pred)
    RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    with open('../data/random_forest/RandomForest_metrics_tuned_'+item+'.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["r2_score","Mean Absolute Error (MAE)",'Mean Squared Error (MSE)','Root Mean Squared Error (RMSE)'])
        writer.writerow([r2_score,MAE,MSE,RMSE])
    print("r2_score:"+item,r2_score)
    print('Mean Absolute Error (MAE):'+item, MAE)
    print('Mean Squared Error (MSE):'+item, MSE)
    print('Root Mean Squared Error (RMSE):'+item, RMSE)
def init_column_maps():
    column_maps = []
    with open(column_map_file_path) as f:
        parameters = f.readlines()
        for p in parameters:
            p = p.replace("\n", "")
            p = tuple(p.split(":"))
            column_maps.append(p)
    with open(categorical_column_map_file_path) as cf:
        parameters = cf.readlines()
        for p in parameters:
            p = p.replace("\n", "")
            p = p.split(":")
            p[0] = p[0].replace(" ", "_")
            p = tuple(p)
            column_maps.append(p)
    return column_maps

def print_save_regressor_params(regressor, technique, ov):
    with open(f"../results/{technique}/regressor_params_{ov}.csv", 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["parameter", "value"])
        for param, value in regressor.get_params(deep=True).items():
            writer.writerow([param, value])

def feature_importance(important_list):
    lst_column_map=init_column_maps()
    lst_final=[]
    # important_list=sorted(list(zip(regressor.feature_importances_,X_test.columns)),key =lambda x: x[0] ,reverse=True)[:10]
    for import_item in important_list:
        import_item_rem=import_item[1].replace('_imputed','')
        for column_item in lst_column_map:
            if import_item_rem == column_item[0]:
                import_renamed=column_item[2]
                l = list(import_item)
                l[1] = import_renamed
                lst_final.append(tuple(l))
                break
    return lst_final
def train_test_split(item):
    # training=pd.read_csv('../data/test_training_data/'+item+'/final_training_data.csv',sep=':')
    # y_train=training.filter(regex=item)
    # X_train=training.drop(y_train, axis=1)
    # test=pd.read_csv('../data/test_training_data/'+item+'/final_test_data.csv',sep=':')
    # y_test=test.filter(regex=item)
    # X_test=test.drop(y_test, axis=1)
    y_train=pd.read_csv('../data/test_training_data/'+item+'/y_train.csv',sep=':')
    X_train=pd.read_csv('../data/test_training_data/'+item+'/x_train.csv',sep=':')
    y_test=pd.read_csv('../data/test_training_data/'+item+'/y_test.csv',sep=':')
    X_test=pd.read_csv('../data/test_training_data/'+item+'/x_test.csv',sep=':')
    return X_train,X_test, y_train,y_test

n_estimators=[int(x) for x in np.linspace(start = 80, stop = 200, num = 5)]
max_depth=[int(x) for x in np.linspace(5, 15, num = 3)]
min_samples_split=[1,5,10]
min_samples_leaf=[1,5,10]
param_grid={
            'n_estimators' : n_estimators,
            # 'max_depth' : max_depth
            'min_samples_leaf': min_samples_leaf
            }
# print(param_grid)
item='OP10'
X_train,X_test, y_train,y_test=train_test_split(item)
model = RandomForestRegressor()
rf_Grid=GridSearchCV(estimator=model,param_grid=param_grid,verbose=2,cv=3,n_jobs=1)
rf_Grid.fit(X_train,y_train)
print(rf_Grid.best_params_)
rf_regressor=rf_Grid.best_estimator_
print_save_regressor_params(rf_regressor, 'rf_baseline', item)
y_predict=rf_regressor.predict(X_test)
# r2_score=metrics.r2_score(y_test,y_predict)
with open('../data/random_forest/tuned'+item+'.pkl', 'wb') as pickle_file:
    pk.dump(y_predict, pickle_file)
important_list=sorted(list(zip(rf_regressor.feature_importances_,X_test.columns)),key =lambda x: x[0] ,reverse=True)[:10]
import_lst=feature_importance(important_list)
file = open('../data/random_forest/feature_importance_tuned'+item+'.csv', 'w+', newline ='')
with file:    
    write = csv.writer(file)
    write.writerows(import_lst)
print_save_metrics_tuned_manually(y_test,y_predict,item)

Please change the shape of y to (n_samples,), for example using ravel().
  estimator.fit(X_train, y_train, **fit_params)
[CV] .............. min_samples_leaf=1, n_estimators=80, total=   0.8s
[CV] min_samples_leaf=1, n_estimators=80 .............................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)
[CV] .............. min_samples_leaf=1, n_estimators=80, total=   0.8s
[CV] min_samples_leaf=1, n_estimators=80 .............................
  estimator.fit(X_train, y_train, **fit_params)
[CV] .............. min_samples_leaf=1, n_estimators=80, total=   1.1s
[CV] min_samples_leaf=1, n_estimators=110 ............................
  estimator.fit(X_train, y_train, **fit_params)
[CV] ............. min_samples_leaf=1, n_estimators=110, total=   1.0s
[CV] min_samples_leaf=1, n_estimators=110 ............................
  estimator.fit(X_train, y_train, **fit_params)
[CV] ............. min_samples_leaf=1, 

# Random Forest With PCA

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import pickle as pk
import csv
import pandas as pd

def train_test_split_PCA(item):
    
    # # item=item.replace('%','')
    # # item=item.replace(' ','')
    # # item=item.strip()
    # training=pd.read_csv('../data/test_training_data/'+item+'/final_training_data.csv')
    # # training=training.drop(['Town'], axis=1)
    # y_train=training.filter(regex=item)
    # X_train=training.drop(y_train, axis=1)
    # test=pd.read_csv('../data/test_training_data/'+item+'/final_test_data.csv')
    # # test=test.drop(['Town'], axis=1)
    # y_test=test.filter(regex=item)
    # X_test=test.drop(y_test, axis=1)
    # return X_train,X_test, y_train,y_test 
    training=pd.read_csv('../data/test_training_data/'+item+'/final_training_data.csv',sep=':')
    y_train=training.filter(regex=item)
    X_train=training.drop(y_train, axis=1)
    test=pd.read_csv('../data/test_training_data/'+item+'/final_test_data.csv',sep=':')
    y_test=test.filter(regex=item)
    X_test=test.drop(y_test, axis=1)
    return X_train,X_test, y_train,y_test



def print_save_metrics(y_test,y_pred):
    r2_score=metrics.r2_score(y_test,y_pred)
    MAE=metrics.mean_absolute_error(y_test, y_pred)
    MSE=metrics.mean_squared_error(y_test, y_pred)
    RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    with open('../data/random_forest/RandomForest_metrics_PCA_'+item+'.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["r2_score","Mean Absolute Error (MAE)",'Mean Squared Error (MSE)','Root Mean Squared Error (RMSE)'])
        writer.writerow([r2_score,MAE,MSE,RMSE])
        

    print("r2_score:"+item,r2_score)
    print('Mean Absolute Error (MAE):'+item, MAE)
    print('Mean Squared Error (MSE):'+item, MSE)
    print('Root Mean Squared Error (RMSE):'+item, RMSE)

# output=output_variable.drop(['School Code'], axis=1)
output_variable= ['School_Code','OP1','OP2','OP6','OP3','OP4','OP5','OP7','OP8','OP9','OP10','OP11','OP12','OP13','OP14']
output_variable.remove('School_Code')
for item in output_variable:
    X_train,X_test, y_train,y_test=train_test_split_PCA(item)
    processed_input=pd.read_csv('../data/PCA/pca_'+item+'.csv')
    with open('../data/PCA/pca_'+item+'.pkl', 'rb') as pickle_file:
        pca = pk.load(pickle_file)
        scaled_data_train = pca.transform(processed_input)
        X_train = pd.DataFrame(data = scaled_data_train)
        # print(X_test)
        scaled_data_test = pca.transform(X_test)
        X_test = pd.DataFrame(data = scaled_data_test)

        regressor = RandomForestRegressor(n_estimators=100)
        regressor.fit(X_train,y_train)
        y_pred=regressor.predict(X_test)
        with open('../data/random_forest/PCA/PCA_'+item+'.pkl', 'wb') as pickle_file:
            pk.dump(y_pred, pickle_file)
        important_list=sorted(list(zip(regressor.feature_importances_,X_test.columns)),key =lambda x: x[0] ,reverse=True)[:10]
        file = open('../data/random_forest/PCA/feature_importance_PCA_'+item+'.csv', 'w+', newline ='')
        with file:    
            write = csv.writer(file)
            write.writerows(important_list)
        print_save_metrics(y_test,y_pred)
        

3                     1.2                 439                 430   
24                    14.8                  47                  53   
25                     0.4                 315                 284   
26                     0.0                 104                  95   
27                     0.4                 359                 325   
28                     0.5                  81                  40   
29                     1.1                 234                 220   
30                     0.7                   9                  15   
31                     0.1                   0                   0   
32                     0.0                 121                 142   
33                     4.7                 255                 255   
34                     0.8                 312                 308   
35                     1.5                 284                 303   
36                     0.1                  27                  22   
37                   

ValueError: operands could not be broadcast together with shapes (49,537) (176,) 

In [None]:
def train_test_split(item):
    training=pd.read_csv('../data/test_training_data/'+item+'/final_training_data.csv',sep=':')
    y_train=training.filter(regex=item)
    X_train=training.drop(y_train, axis=1)
    test=pd.read_csv('../data/test_training_data/'+item+'/final_test_data.csv',sep=':')
    y_test=test.filter(regex=item)
    X_test=test.drop(y_test, axis=1)
    return X_train,X_test, y_train,y_test