## Note
Trying out different models and paramaters to see which performs the best

In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [261]:
# Loading the evaluation data
df = pd.read_csv("../csv_files/development.csv")

In [262]:
## Reducing the dataset to X percent of original size to speed up model testing
df = df.sample(frac=0.01)

In [263]:
## Filter out values outside 0.03 and 0.97 percentile 

# def filter_by_percentile(df, lower_quantile=0.03, upper_quantile=0.97):
#     for col in df.columns:
#         if col == "x" or col == "y":
#             continue
#         # Calculate the quantiles for each column
#         lower_bound = df[col].quantile(lower_quantile)
#         upper_bound = df[col].quantile(upper_quantile)

#         # Filter the DataFrame by the quantile range
#         df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

#     return df

# df = filter_by_percentile(df) ## Doesn't seem to work as is

# def limit_by_percentile(df, lower_quantile=0.03, upper_quantile=0.97):
#     for col in df.columns:
#         if col == "x" or col == "y":
#             continue

#         # Calculate the quantiles for each column
#         lower_bound = df[col].quantile(lower_quantile)
#         upper_bound = df[col].quantile(upper_quantile)

#         # Limit the DataFrame values by the quantile range
#         df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
#     return df

# df = limit_by_percentile(df)

In [264]:
# Extracting the positions, removes the x and y column and splitting the data into train and validation set
def extracting_positions(df):
    pos = []
    for i in range(len(df)):
        pos.append([df["x"].iloc[i], df["y"].iloc[i]])
    return pos

pos = extracting_positions(df)

## Dropping data from x and y 
df.drop(["x", "y"], axis=1, inplace=True)


In [265]:
## Removing pads with format: pads = ["0", "7", "12", ..]
def drop_pads(df, pads):
    cols_to_drop = [col for col in df.columns if any(idx in col for idx in pads)]
    df_removed = df.drop(cols_to_drop, axis=1)    
    return df_removed

remove_pads = ["0", "7", "12", "15", "16", "17"]
df_removed_noise = drop_pads(df, remove_pads)

In [266]:
## Removing rms feature
def drop_rms_features(df):
    # Extract columns that start with 'rms'
    rms_columns = [col for col in df.columns if not col.startswith('rms')]

    # Create a new DataFrame without 'rms' columns
    df_without_rms = df[rms_columns] 
    return df_without_rms

df_interesting_data = drop_rms_features(df_removed_noise)

In [267]:
## Splitting into train and validation set
X_train, X_val, pos_train, pos_val = train_test_split(df_interesting_data, pos, test_size=0.2, random_state=42)

In [268]:
x_train_data = []
y_train_data = []
for i in range(len(pos_train)):
    x_train_data.append(pos_train[i][0])
    y_train_data.append(pos_train[i][1])

## RandomForestRegressor

In [273]:
# RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
import math
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV

def avg_euc_dist(pos_val, pos_pred):
    sum_square = 0
    for i in range(len(pos_val)):
        sum_square += math.sqrt((pos_val[i][0]-pos_pred[i][0])**2 + (pos_val[i][1]-pos_pred[i][1])**2)
    return sum_square/len(pos_val) 

param_grid = {
    #'n_estimators': [25],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split':[2, 4, 6], # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4], # Minimum number of samples required to be at a leaf node
    }

numb_trees = 25
# base_regressor = RandomForestRegressor(n_estimators=numb_trees, criterion="poisson") 
# mult_regr = MultiOutputRegressor(base_regressor)

mult_regr_x = RandomForestRegressor(n_estimators=numb_trees, criterion="poisson", random_state=42)
mult_regr_y = RandomForestRegressor(n_estimators=numb_trees, criterion="poisson", random_state=42)


grid_search_x = GridSearchCV(
    estimator=mult_regr_x,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1, 
    verbose=2)

grid_search_y = GridSearchCV(
    estimator=mult_regr_x,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1, 
    verbose=2)

# scaler = RobustScaler(quantile_range=(0.03, 0.97))

# pipeline = Pipeline([
#     # ('scaler', scaler),
#     ('regressor', mult_regr),
# ])

grid_search_x.fit(X_train, x_train_data)
grid_search_y.fit(X_train, y_train_data)

best_params_x = grid_search_x.best_params_
best_params_y = grid_search_y.best_params_

#best_model = grid_search.best_estimator_

#pos_pred = best_model.predict(X_val)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=4; total time=   3.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=4; total time=   3.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   3.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   3.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   3.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   3.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   3.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=4; total time=   3.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=6; total time=   2.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=6; total time=   2.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2; total time=   2.8s
[CV

In [274]:
print(best_params_x)
print(best_params_y)

{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
{'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [241]:
# Metrics to evaluating model 
import sklearn.metrics as sm
import math
import numpy as np

def avg_euc_dist(pos_val, pos_pred):
    sum_square = 0
    for i in range(len(pos_val)):
        sum_square += math.sqrt((pos_val[i][0]-pos_pred[i][0])**2 + (pos_val[i][1]-pos_pred[i][1])**2)
    return sum_square/len(pos_val)        

def metrics_on_model(pos_val, pos_pred):
    print("Mean absolute error =", round(sm.mean_absolute_error(pos_val, pos_pred), 2)) 
    print("Mean squared error =", round(sm.mean_squared_error(pos_val, pos_pred), 2)) 
    print("Median absolute error =", round(sm.median_absolute_error(pos_val, pos_pred), 2)) 
    print("Explain variance score =", round(sm.explained_variance_score(pos_val, pos_pred), 2)) 
    print("R2 score =", round(sm.r2_score(pos_val, pos_pred), 2))
    print("Mean eucledian distance =", round(avg_euc_dist(pos_val, pos_pred), 2))

metrics_on_model(pos_val, pos_pred)

Mean absolute error = 4.14
Mean squared error = 34.51
Median absolute error = 3.1
Explain variance score = 1.0
R2 score = 1.0
Mean eucledian distance = 6.58


## Results:

Test 1:  
    Data: reomving pads (0, 7, 12, 15, 16, 17)  
    Number of trees: 10   
    Mean euc dist: 5.77  

Test 2:  
    Data: reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature  
    Number of trees: 10  
    Mean euc dist: 5.69

Test 3:    
    Data: Sampeled 25% of the data to speed up model training. Reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature.  
    Number of trees: 10  
    Mean euc dist: 6.56  

Test 4:    
    Data: Sampeled 25% of the data to speed up model training. Reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature.  
    Number of trees: 30  
    Mean euc dist: 6.16  

Test 4:    
    Data: With PCA. Sampeled 25% of the data to speed up model training. Reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature.  
    Number of trees: 30  
    Mean euc dist: 10.29

Test 5:        
    Data: Reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature. Added RobustScaling.   
    Number of trees: 50  
    Mean euc dist: 5.31  

Test 6:  
    Data: Reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature. Added column: pmax[5]*area[5] 
    Number of trees: 30  
    Mean euc dist: 6.11

Test 7:   
    Data: Reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature.  
    Number of trees: 25  
    Mean euc dist: 6.58  

 

## Using the multiple_reg_model on the evaluation set

In [217]:
ev_data = pd.read_csv("../csv_files/evaluation.csv")

In [218]:
# Extracting the ID
eval_id = ev_data["Id"]

# Dropping the Id column from the ev_data
ev_data.drop(["Id"], axis=1, inplace=True)

In [219]:
# Formatting the position array to a string to be used in the .csv file 
def pred_to_string(prediction_array):
    pred_column = []
    for i in range(len(prediction_array)):
        pos_string = (str(prediction_array[i][0]) + "|" + str(prediction_array[i][1]))
        pred_column.append(pos_string)
    return pred_column
        

In [223]:
# Predicting the evaluation results
mult_regr_eval = pipeline.predict(ev_data)
pos_pred = pred_to_string(mult_regr_eval) # Formatting the predictions 

In [224]:
# Creating a df and .csv file to be submitted. Saved in submission_file folder
mult_reg_submission = pd.DataFrame({'Id': eval_id, 'Predicted': pos_pred})
mult_reg_submission.to_csv("../DataScienceLab_Project/submission_files/mult_reg_rand_forest_with_added_feature.csv", index=False)