In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import Functions
from Functions import accuracy_scorer

# Fixed seed so the row shuffle below gives the same order on every
# "Restart Kernel -> Run All"; without it the reported accuracies drift per run.
RANDOM_SEED = 42

# Load the player CSV, run the project's feature-engineering step on it, then
# shuffle all rows (frac=1 keeps every row) and renumber the index from 0.
original_df = pd.read_csv("../FIFA18_players_database/CompleteDataset.csv", low_memory=False)
original_df = Functions.engineer_features(original_df)
original_df = original_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Give each model its own independent copy so that the per-model preprocessing
# below (normalization, outlier removal) cannot affect the other models' data.
NB_df = original_df.copy()
RF_df = original_df.copy()
SVM_df = original_df.copy()

# Only the SVM data is normalized ("minmax"). Every column except the last is
# passed to the normalizer; the last column is left untouched
# (presumably it holds the label -- TODO confirm against engineer_features).
# NOTE(review): if those columns are integer-typed, writing floats back through
# .iloc can trigger pandas' incompatible-dtype warning -- confirm the dtypes.
SVM_df.iloc[:,:-1] = Functions.normalize(SVM_df.iloc[:,:-1], "minmax", False)

# Outlier removal: Naive Bayes and Random Forest use remove_outliers with
# argument 1, while the SVM data uses the IQR-based variant.
# NOTE(review): the SVM data is normalized *before* outliers are dropped, so the
# scaling range may be set by rows that are removed here -- confirm this order
# is intended.
NB_df = Functions.remove_outliers(NB_df, 1)
RF_df = Functions.remove_outliers(RF_df, 1)
SVM_df = Functions.remove_outliers_iqr(SVM_df)

# Classifiers to evaluate, in the order their results are printed.
models = ["Gaussian Naive Bayes", "Random Forest", "Support Vector Machine"]

# One entry per classifier: the dataframe prepared for it and a callable that
# fits the model and returns (test predictions, train predictions).
# Lambdas keep the Functions.run_* lookups lazy, i.e. resolved only when the
# corresponding model is actually run.
model_setups = {
    "Gaussian Naive Bayes": (
        NB_df,
        lambda X_tr, X_te, y_tr: Functions.run_naive_bayes(X_tr, X_te, y_tr),
    ),
    "Random Forest": (
        RF_df,
        lambda X_tr, X_te, y_tr: Functions.run_RF(X_tr, X_te, y_tr, 200, 10, 5, "sqrt", 30, False),
    ),
    "Support Vector Machine": (
        SVM_df,
        lambda X_tr, X_te, y_tr: Functions.run_SVM_classifier(X_tr, X_te, y_tr, 'rbf', 100, False),
    ),
}

# For every classifier, train/evaluate on five splits and report the split with
# the highest test accuracy together with that same split's train accuracy.
# NOTE(review): the reported test score is the best of five, which is an
# optimistic estimate; consider reporting the mean across splits as well.
# NOTE(review): the saved output shows train accuracy *below* test accuracy for
# both printed models -- confirm the return order of the Functions.run_* helpers.
for model in models:
    model_df, fit_and_predict = model_setups[model]
    max_acc_test = 0
    best_model_acc_train = 0

    # `i` is forwarded to split_data (presumably a fold index or seed -- TODO confirm).
    for i in range(5):
        X_train, X_test, y_train, y_test = Functions.split_data(model_df, i)
        y_test_pred, y_train_pred = fit_and_predict(X_train, X_test, y_train)

        acc_test = accuracy_scorer(y_test, y_test_pred)
        acc_train = accuracy_scorer(y_train, y_train_pred)

        # Strict comparison: on ties the earliest split is kept.
        if acc_test > max_acc_test:
            max_acc_test, best_model_acc_train = acc_test, acc_train

    print(
        f"Accuracy of best {model} model:",
        f"Test accuracy: {max_acc_test:.4f}",
        f"Train accuracy: {best_model_acc_train:.4f}",
        sep="\n",
    )

Accuracy of best Gaussian Naive Bayes model:
Test accuracy: 0.6552
Train accuracy: 0.3735
Accuracy of best Random Forest model:
Test accuracy: 0.7620
Train accuracy: 0.5713
