In [2]:
import pandas as pd
import numpy as np

from INL1.train_test_split import engineer_features
from train_test_split import split_data
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import naivebayes
import metrics

original_df = pd.read_csv("./FIFA18_players_database/CompleteDataset.csv", low_memory=False)
original_df = engineer_features(original_df)
original_df = original_df.sample(frac=1)
X_train, X_test, y_train, y_test = split_data(original_df, 0)
scaler_minmax = MinMaxScaler()
scaler_standard = StandardScaler()
X_train_minmax = scaler_minmax.fit_transform(X_train)
X_test_minmax = scaler_minmax.fit_transform(X_test)
X_train_standard = scaler_standard.fit_transform(X_train)
X_test_standard = scaler_standard.fit_transform(X_test)

print("Running models with outliers present: ")

# Run gaussian naive bayes
y_pred, model = naivebayes.run_naive_bayes(X_train, X_test, y_train)
y_pred_minmax, model = naivebayes.run_naive_bayes(X_train_minmax, X_test_minmax, y_train)
y_pred_standard, model = naivebayes.run_naive_bayes(X_train_standard, X_test_standard, y_train)
acc = metrics.accuracy_scorer(y_test, y_pred)
acc_minmax = metrics.accuracy_scorer(y_test, y_pred_minmax)
acc_standard = metrics.accuracy_scorer(y_test, y_pred_standard)
print(f"Accuracy Gaussian Naive Bayes: {acc}")
print(f"Accuracy Gaussian Naive Bayes min max normalized: {acc_minmax}")
print(f"Accuracy Gaussian Naive Bayes standardized: {acc_standard}\n")


# Remove outliers
train_set = np.concatenate([X_train, y_train[:,None]], axis=1) # Add y_train array as last column of X_train 2d array
outlier_free_df = pd.DataFrame(train_set)                                   # Create pandas dataframe of features and labels
length = len(train_set[0]) - 1
for i in range(length):                                        # For each feature in the set, remove each row that has 
    outlier_free_df[i] = outlier_free_df[i].astype(int)                                  # at least one feature that is below the 1-percentile or
    q_u = outlier_free_df[i].quantile(0.99)                                 # it is above the 99th percentile
    q_l = outlier_free_df[i].quantile(0.01)
    outlier_free_df = outlier_free_df[outlier_free_df[i] <= q_u]
    outlier_free_df = outlier_free_df[outlier_free_df[i] >= q_l]

X_train = outlier_free_df.iloc[:, :-1].to_numpy()
y_train = outlier_free_df.iloc[:, -1].to_numpy()
X_train_minmax = scaler_minmax.fit_transform(X_train)
X_test_minmax = scaler_minmax.fit_transform(X_test)
X_train_standard = scaler_standard.fit_transform(X_train)
X_test_standard = scaler_standard.fit_transform(X_test)

print("Running models without outliers present: ")

# Run gaussian naive bayes
y_pred, model = naivebayes.run_naive_bayes(X_train, X_test, y_train)
y_pred_minmax, model = naivebayes.run_naive_bayes(X_train_minmax, X_test_minmax, y_train)
y_pred_standard, model = naivebayes.run_naive_bayes(X_train_standard, X_test_standard, y_train)
acc = metrics.accuracy_scorer(y_test, y_pred)
acc_minmax = metrics.accuracy_scorer(y_test, y_pred_minmax)
acc_standard = metrics.accuracy_scorer(y_test, y_pred_standard)
print(f"Accuracy Gaussian Naive Bayes: {acc}")
print(f"Accuracy Gaussian Naive Bayes min max normalized: {acc_minmax}")
print(f"Accuracy Gaussian Naive Bayes standardized: {acc_standard}\n")

# Choose best model with cross-validation
max_acc = 0
for i in range(5):
    X_train, X_test, y_train, y_test = split_data(original_df, i)
    best_model = []
    
    # Remove outliers
    train_set = np.concatenate([X_train, y_train[:,None]], axis=1) # Add y_train array as last column of X_train 2d                                                                 # array
    outlier_free_df = pd.DataFrame(train_set)                      # Create pandas dataframe of features and labels
    length = len(train_set[0]) - 1
    for j in range(length):                                        # For each feature in the set, remove each row that 
                                                                   # has at least one feature that is below the 
                                                                   # 1-percentile or it is above the 99th percentile
        outlier_free_df[j] = outlier_free_df[j].astype(int)                                  
        q_u = outlier_free_df[j].quantile(0.99)                                 
        q_l = outlier_free_df[j].quantile(0.01)
        outlier_free_df = outlier_free_df[outlier_free_df[j] <= q_u]
        outlier_free_df = outlier_free_df[outlier_free_df[j] >= q_l]
    X_train = outlier_free_df.iloc[:, :-1].to_numpy()
    y_train = outlier_free_df.iloc[:, -1].to_numpy()
    
    y_pred, model = naivebayes.run_naive_bayes(X_train, X_test, y_train)
    acc = metrics.accuracy_scorer(y_test, y_pred)
    print("Split", i + 1, "accuracy:", acc)
    if acc > max_acc:
        max_acc = acc
        best_model = model
        
print("Best accuracy model after cross-validation: ", max_acc)    

Running models with outliers present: 
Accuracy Gaussian Naive Bayes: 0.6258226261360075
Accuracy Gaussian Naive Bayes min max normalized: 0.6261360075211533
Accuracy Gaussian Naive Bayes standardized: 0.6114070824193043

Running models without outliers present: 
Accuracy Gaussian Naive Bayes: 0.6505797555625196
Accuracy Gaussian Naive Bayes min max normalized: 0.6443121278596051
Accuracy Gaussian Naive Bayes standardized: 0.6255092447508618

Split 1 accuracy: 0.6505797555625196
Split 2 accuracy: 0.6527734252585397
Split 3 accuracy: 0.6546537135694139
Split 4 accuracy: 0.6367909746161078
Split 5 accuracy: 0.6562107904642409
Best accuracy model after cross-validation:  0.6562107904642409
