## Note
Trying out different models and paramaters to see which performs the best

In [150]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

In [151]:
# Loading the evaluation data
dev = pd.read_csv("../csv_files/development.csv")

In [152]:
## sum all areas to one col
def sum_area(dframe):
    area_columns = [col for col in dframe.columns if col.startswith('area')]
    sum_of_areas = dframe[area_columns].sum(axis=1)
    dframe['area_sum'] = sum_of_areas

In [153]:
# Extracting the positions and removes the x and y column.
import numpy as np
pos_dev = dev[["x", "y"]]

## Dropping data from x and y 
dev = dev.drop(["x", "y"], axis=1)

In [154]:
## Removing pads with format: pads = [0, 7, 12, ..]
def drop_pads(input_list, df):
    for i in input_list:
        columns_to_remove = df.filter(like=f'[{i}]').columns
        df = df.drop(columns=columns_to_remove)
    return df

remove_pads = [0, 7, 12, 15, 16, 17]
dev_removed_noise = drop_pads(remove_pads, dev)

In [155]:
## Removing rms feature
def drop_rms_features(df):
    # Extract columns that start with 'rms'
    rms_columns = [col for col in df.columns if not col.startswith('rms')]

    # Create a new DataFrame without 'rms' columns
    df_without_rms = df[rms_columns] 
    return df_without_rms

dev_interesting_data = drop_rms_features(dev_removed_noise)

In [156]:
## Removing tmax feature
def drop_tmax_features(df):
    # Extract columns that start with 'rms'
    tmax_columns = [col for col in df.columns if not col.startswith('tmax')]

    # Create a new DataFrame without 'rms' columns
    df_without_tmax = df[tmax_columns] 
    return df_without_tmax

dev_interesting_data = drop_tmax_features(dev_interesting_data)

In [63]:
## Z-transformation of the data. Remember to scale accordingly to training data for eval data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(dev_interesting_data)
 
dev_interesting_data = pd.DataFrame(scaler.transform(dev_interesting_data), columns=dev_interesting_data.columns)

In [157]:
def quantile2(dframe, lw=0.05, up=0.95, drop=True):
    tresholds = {}
    for col_name in dframe.columns:
        lw_tresh = dframe[col_name].quantile(lw)
        up_tresh = dframe[col_name].quantile(up)
        tresholds[col_name] = [lw_tresh, up_tresh]
    print(f"tresholds for {lw}, {up}: {tresholds}")
    initial_dim = dframe.shape
    for col_name in dframe.columns:
        if drop:
            dframe.drop(dframe[dframe[col_name] < tresholds[col_name][0]].index, inplace=True)
            dframe.drop(dframe[dframe[col_name] > tresholds[col_name][1]].index, inplace=True)
        else:
            dframe.loc[dframe[col_name] < tresholds[col_name][0], col_name] = tresholds[col_name][0]
            dframe.loc[dframe[col_name] > tresholds[col_name][1], col_name] = tresholds[col_name][1]

    new_dim = dframe.shape
    print(f"""
          initial dim:   {initial_dim}
          new dim:       {new_dim}
          a reduction of {((initial_dim[0]-new_dim[0])/initial_dim[0])*100}% of rows
          """)
    

In [139]:
# ## Reducing the dataset to X percent of original size to speed up model testing
# dev_interesting_data_sample = dev_interesting_data.sample(frac=0.25)
#pos_dev_sample = pos_dev.loc[dev_interesting_data.index]

In [158]:
## Splitting into train and validation set
X_train, X_val, pos_train, pos_val = train_test_split(dev_interesting_data, pos_dev, test_size=0.001, random_state=42)

In [114]:
## drop the outliers
quantile2(X_train, 0.01, 0.99,drop=False)
pos_train = pos_train.loc[X_train.index]

tresholds for 0.01, 0.99: {'pmax[1]': [3.438953704833962, 78.60699914550783], 'negpmax[1]': [-46.69329519653318, -2.974553710937477], 'area[1]': [1.5165100158690865, 39.688086157226614], 'pmax[2]': [3.191000152587868, 44.261093963623104], 'negpmax[2]': [-23.78017808261275, -2.947936126708984], 'area[2]': [1.2261787353515805, 24.626779455566098], 'pmax[3]': [3.9980726928710486, 105.3254884033204], 'negpmax[3]': [-63.156826782226545, -3.0886613023651734], 'area[3]': [2.1880904113770163, 51.3920994689941], 'pmax[4]': [3.4978090026881783, 63.28823794555664], 'negpmax[4]': [-35.70557189941402, -3.0864365650258394], 'area[4]': [1.5376550354004723, 32.69773432617193], 'pmax[5]': [6.581597076416038, 111.8521913757324], 'negpmax[5]': [-65.75915573120119, -3.5292998046875446], 'area[5]': [4.538289691162173, 54.45958193359374], 'pmax[6]': [3.430690277099609, 84.86641152954105], 'negpmax[6]': [-53.70983749389651, -3.0848340213749554], 'area[6]': [1.46019553833006, 42.110686480712936], 'pmax[8]': [

## RandomForestRegressor

In [168]:
# RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
import math
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV

numb_trees = 50
base_regressor = RandomForestRegressor(n_estimators=numb_trees, criterion="poisson", max_depth=30, max_features=0.3, bootstrap=True, verbose=1) 
mult_regr = MultiOutputRegressor(base_regressor)

mult_regr.fit(X_train, pos_train)

pos_pred = mult_regr.predict(X_val)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  8.5min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  8.6min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s


## Note on GridSearchCV
By splitting the problem into a regression problem for x-coordinate and y-coordinate I used GridSearchCV with these paramaters:
   
    param_grid = {  
        'max_depth': [20, 30, 40],  # Maximum depth of the tree
        'min_samples_split':[2, 4, 6], # Minimum number of samples required to split an internal node
        'min_samples_leaf': [1, 2, 4], # Minimum number of samples required to be at a leaf node
        'max_features': ["sqrt", "log2", None]
        }  
         
The best paramaters for x-coordinate and y-coordinate were: 
  
{'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}  
{'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 4}  

 
 This GridSearch is run on only 1% of the dataset in order to speed up the process.



In [169]:
# Metrics to evaluating model 
import sklearn.metrics as sm
import math
import numpy as np

def avg_euc_dist(pos_val, pos_pred):
    sum_square = 0
    for i in range(len(pos_val)):
        sum_square += math.sqrt((pos_val[i][0]-pos_pred[i][0])**2 + (pos_val[i][1]-pos_pred[i][1])**2)
    return sum_square/len(pos_val)    

def metrics_on_model(pos_val, pos_pred):
    print("Mean absolute error =", round(sm.mean_absolute_error(pos_val, pos_pred), 2)) 
    print("Mean squared error =", round(sm.mean_squared_error(pos_val, pos_pred), 2)) 
    print("Median absolute error =", round(sm.median_absolute_error(pos_val, pos_pred), 2)) 
    print("Explain variance score =", round(sm.explained_variance_score(pos_val, pos_pred), 2)) 
    print("R2 score =", round(sm.r2_score(pos_val, pos_pred), 2))
    print("Mean eucledian distance =", round(avg_euc_dist(pos_val, pos_pred), 2))

metrics_on_model(pos_val.to_numpy(), pos_pred)

Mean absolute error = 2.7
Mean squared error = 12.96
Median absolute error = 2.19
Explain variance score = 1.0
R2 score = 1.0
Mean eucledian distance = 4.27


## Results:

Test 1:  
    Data: reomving pads (0, 7, 12, 15, 16, 17)  
    Number of trees: 10   
    Mean euc dist: 5.77  

Test 2:  
    Data: reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature  
    Number of trees: 10  
    Mean euc dist: 5.69

Test 3:    
    Data: Sampeled 25% of the data to speed up model training. Reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature.  
    Number of trees: 10  
    Mean euc dist: 6.56  

Test 4:    
    Data: Sampeled 25% of the data to speed up model training. Reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature.  
    Number of trees: 30  
    Mean euc dist: 6.16  

Test 4:    
    Data: With PCA. Sampeled 25% of the data to speed up model training. Reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature.  
    Number of trees: 30  
    Mean euc dist: 10.29

Test 5:        
    Data: Reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature. Added RobustScaling.   
    Number of trees: 50  
    Mean euc dist: 5.31  

Test 6:  
    Data: Reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature. Added column: pmax[5]*area[5] 
    Number of trees: 30  
    Mean euc dist: 6.11

Test 7:   
    Data: Reomving pads (0, 7, 12, 15, 16, 17) and removing rms feature.  
    Number of trees: 25  
    Mean euc dist: 6.58  

 

## Using the multiple_reg_model on the evaluation set

In [170]:
ev_data = pd.read_csv("../csv_files/evaluation.csv")

In [171]:
# Extracting the ID
eval_id = ev_data["Id"]

# Dropping the Id column from the ev_data
ev_data = ev_data.drop(["Id"], axis=1)

In [172]:
# Formatting the position array to a string to be used in the .csv file 
def pred_to_string(prediction_array):
    pred_column = []
    for i in range(len(prediction_array)):
        pos_string = (str(prediction_array[i][0]) + "|" + str(prediction_array[i][1]))
        pred_column.append(pos_string)
    return pred_column
        

In [173]:
## Preprocessing 
remove_pads = [0, 7, 12, 15, 16, 17]
ev_data = drop_pads(remove_pads, ev_data) # Remove pads
ev_data = drop_rms_features(ev_data) # Remove rms feature 
ev_data = drop_tmax_features(ev_data)
#sum_area(ev_data)
#ev_data = pd.DataFrame(scaler.transform(ev_data), columns=ev_data.columns) # Z-transform with mean and std from training data

In [174]:
# Predicting the evaluation results
mult_regr_eval = mult_regr.predict(ev_data)
pos_pred = pred_to_string(mult_regr_eval) # Formatting the predictions 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.1s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    4.2s


In [175]:
# Creating a df and .csv file to be submitted. Saved in submission_file folder
mult_reg_submission = pd.DataFrame({'Id': eval_id, 'Predicted': pos_pred})
mult_reg_submission.to_csv("../DataScienceLab_Project/submission_files/vanilla_all_n50.csv", index=False)