## Note
Trying out different models and paramaters to see which performs the best

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# Loading the evaluation data
df = pd.read_csv("../csv_files/development.csv")

In [3]:
# Extracting the positions, removes the x and y column and splitting the data into train and validation set
def extraxting_positions(df):
    pos = []
    for i in range(len(df)):
        pos.append([df["x"].iloc[i], df["y"].iloc[i]])
    return pos

pos = extraxting_positions(df)

## Dropping data from x and y 
df.drop(["x", "y"], axis=1, inplace=True)


In [4]:
## Removing pads with format: pads = ["0", "7", "12", ..]
def drop_pads(df, pads):
    cols_to_drop = [col for col in df.columns if any(idx in col for idx in pads)]
    df_removed = df.drop(cols_to_drop, axis=1)    
    return df_removed

remove_pads = ["0", "7", "12", "15", "16", "17"]
df_removed_noise = drop_pads(df, remove_pads)

In [5]:
## Splitting into train and validation set
X_train, X_val, pos_train, pos_val = train_test_split(df_removed_noise, pos, test_size=0.2, random_state=42)

In [6]:
# Using sklearn multioutputregressor 

mult_regr = MultiOutputRegressor(LinearRegression())
mult_regr.fit(X_train, pos_train)
pos_pred = mult_regr.predict(X_val)

In [7]:
# Metrics to evaluating model 
import sklearn.metrics as sm

def metrics_on_model(pos_val, pos_pred):
    print("Mean absolute error =", round(sm.mean_absolute_error(pos_val, pos_pred), 2)) 
    print("Mean squared error =", round(sm.mean_squared_error(pos_val, pos_pred), 2)) 
    print("Median absolute error =", round(sm.median_absolute_error(pos_val, pos_pred), 2)) 
    print("Explain variance score =", round(sm.explained_variance_score(pos_val, pos_pred), 2)) 
    print("R2 score =", round(sm.r2_score(pos_val, pos_pred), 2))
    #print("Mean eucledian distance =", round(sm.euclidean_distances(pos_val, pos_pred), 2))

metrics_on_model(pos_val, pos_pred)

Mean absolute error = 12.32
Mean squared error = 300.38
Median absolute error = 8.82
Explain variance score = 0.98
R2 score = 0.98


## Using the multiple_reg_model on the evaluation set

In [8]:
ev_data = pd.read_csv("../csv_files/evaluation.csv")

In [9]:
# Extracting the ID
eval_id = ev_data["Id"]

# Dropping the Id column from the ev_data
ev_data.drop(["Id"], axis=1, inplace=True)

In [10]:
# Formatting the position array to a string to be used in the .csv file 
def pred_to_string(prediction_array):
    pred_column = []
    for i in range(len(prediction_array)):
        pos_string = (str(prediction_array[i][0]) + "|" + str(prediction_array[i][1]))
        pred_column.append(pos_string)
    return pred_column
        

In [12]:
# Predicting the evaluation results
mult_regr_eval = mult_regr.predict(ev_data)
pos_pred = pred_to_string(mult_regr_eval) # Formatting the predictions 

In [None]:
# Creating a df and .csv file to be submitted. Saved in submission_file folder
mult_reg_submission = pd.DataFrame({'Id': eval_id, 'Predicted': pos_pred})
mult_reg_submission.to_csv("../DataScienceLab_Project/submission_files/mult_reg_first_try.csv", index=False)

## Gradient Boost

In [15]:
GB = MultiOutputRegressor(GradientBoostingRegressor(n_estimators=30))

# instantiate multi output model
GBR = GradientBoostingRegressor(n_estimators=100)
model = MultiOutputRegressor(GBR)

# Train, test and eval
model.fit(X_train, pos_train)
pred = model.predict(X_val)
mse = mean_squared_error(pos_val, pred)
print(f'MSE: {mse}')


KeyboardInterrupt: 

In [None]:
# make submission
# Predicting the evaluation results
sub_pred = mult_regr.predict(ev_data)
pos_pred = pred_to_string(sub_pred) # Formatting the predictions 
GBR_submission = pd.DataFrame({'Id': eval_id, 'Predicted': pos_pred})
GBR_submission.to_csv("../DataScienceLab_Project/submission_files/GBR.csv", index=False)