In [4]:
import pandas as pd
import Functions

original_df = pd.read_csv("./FIFA18_players_database/CompleteDataset.csv", low_memory=False)
original_df = Functions.engineer_features(original_df)
original_df = original_df.sample(frac=1).reset_index(drop=True)
X_train, X_test, y_train, y_test = Functions.split_data(original_df, 0)
X_train_minmax = Functions.normalize(X_train, "minmax")
X_test_minmax = Functions.normalize(X_test, "minmax")
X_train_standard = Functions.normalize(X_train, "standard")
X_test_standard = Functions.normalize(X_test, "standard")

print("Running models with outliers present: ")

# Run gaussian naive bayes
y_pred = Functions.run_naive_bayes(X_train, X_test, y_train)
y_pred_minmax = Functions.run_naive_bayes(X_train_minmax, X_test_minmax, y_train)
y_pred_standard = Functions.run_naive_bayes(X_train_standard, X_test_standard, y_train)
acc = Functions.accuracy_scorer(y_test, y_pred)
acc_minmax = Functions.accuracy_scorer(y_test, y_pred_minmax)
acc_standard = Functions.accuracy_scorer(y_test, y_pred_standard)
print(f"Accuracy Gaussian Naive Bayes: {acc}")
print(f"Accuracy Gaussian Naive Bayes min max normalized: {acc_minmax}")
print(f"Accuracy Gaussian Naive Bayes standardized: {acc_standard}\n")

# Remove outliers
outlier_free_df = Functions.remove_outliers(original_df, 1)

print("Running models without outliers present: ")

# Run gaussian naive bayes
X_train, X_test, y_train, y_test = Functions.split_data(outlier_free_df, 0)
X_train_minmax = Functions.normalize(X_train, "minmax")
X_test_minmax = Functions.normalize(X_test, "minmax")
X_train_standard = Functions.normalize(X_train, "standard")
X_test_standard = Functions.normalize(X_test, "standard")
y_pred = Functions.run_naive_bayes(X_train, X_test, y_train)
y_pred_minmax = Functions.run_naive_bayes(X_train_minmax, X_test_minmax, y_train)
y_pred_standard = Functions.run_naive_bayes(X_train_standard, X_test_standard, y_train)
acc = Functions.accuracy_scorer(y_test, y_pred)
acc_minmax = Functions.accuracy_scorer(y_test, y_pred_minmax)
acc_standard = Functions.accuracy_scorer(y_test, y_pred_standard)
print(f"Accuracy Gaussian Naive Bayes: {acc}")
print(f"Accuracy Gaussian Naive Bayes min max normalized: {acc_minmax}")
print(f"Accuracy Gaussian Naive Bayes standardized: {acc_standard}\n")

# Choose best model with cross-validation
max_acc = 0
for i in range(5):
    X_train, X_test, y_train, y_test = Functions.split_data(outlier_free_df, i)
    
    y_pred = Functions.run_naive_bayes(X_train, X_test, y_train)
    acc = Functions.accuracy_scorer(y_test, y_pred)
    
    print("Split", i + 1, "accuracy:", acc)
    
    if acc > max_acc:
        max_acc = acc
        
print("Best accuracy model after cross-validation: ", max_acc)    

Running models with outliers present: 
Accuracy Gaussian Naive Bayes: 0.6352240676903792
Accuracy Gaussian Naive Bayes min max normalized: 0.6305233469131933
Accuracy Gaussian Naive Bayes standardized: 0.6214352867439674

Running models without outliers present: 
Accuracy Gaussian Naive Bayes: 0.6557377049180327
Accuracy Gaussian Naive Bayes min max normalized: 0.6557377049180327
Accuracy Gaussian Naive Bayes standardized: 0.6492796820665673

Split 1 accuracy: 0.6557377049180327
Split 2 accuracy: 0.6373571783407849
Split 3 accuracy: 0.6289120715350224
Split 4 accuracy: 0.6313959264778937
Split 5 accuracy: 0.6383084577114427
Best accuracy model after cross-validation:  0.6557377049180327
