In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy

from keras.models import Sequential
from keras.layers import Dense
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the 15 CSV files, discarding duplicated rows and rows with missing values.
# df_list keeps each cleaned frame; index_clean keeps the set of values found in
# its "index" column so the datasets can be intersected afterwards.
index_clean = []
df_list = []
for i in range(15):
    df = (
        pd.read_csv(f"Datasets/Duplicated{i+1}MeanPastMatches.csv")
        .drop_duplicates()
        .dropna()
    )
    df_list.append(df)
    index_clean.append(set(df["index"]))

In [3]:
# Keep only the "index" values that are present in every cleaned dataset.
# NOTE: the order of final_set is arbitrary; it is only meant for membership tests.
init_set = index_clean[0].intersection(*index_clean[1:])
final_set = list(init_set)

In [4]:
# Number of distinct "index" values shared by all the loaded datasets.
len(final_set)

62117

In [5]:
# X_train, X_val and X_test are lists with one scaled feature matrix per dataset;
# y_train, y_val and y_test are lists with the matching label Series.
X_train, X_val, X_test = [], [], []
y_train, y_val, y_test = [], [], []

categorical_columns = ["surface", "player1_hand", "player2_hand", "best_of"]

for i in range(15):
    source = df_list[i]
    # Restrict every dataset to the rows whose "index" appears in all of them.
    df = source[source["index"].isin(final_set)]
    df = pd.get_dummies(df, columns=categorical_columns, dtype=int)
    X = df.drop(columns=["label"])
    y = df["label"]
    # NOTE(review): the "index" column is still part of X and therefore ends up
    # as a model feature -- confirm that this is intended.

    # Chronological split on tourney_date (assumed to be a numeric YYYYMMDD value
    # -- TODO confirm): train < 2015, validation 2015-2018, test 2019 onwards.
    dates = X["tourney_date"]
    in_train = dates < 20150101
    in_val = (dates >= 20150101) & (dates < 20190101)
    in_test = dates >= 20190101

    # tourney_date is only used to split; it is not passed to the models.
    features = X.drop(columns="tourney_date")
    train_part = features[in_train]
    val_part = features[in_val]
    test_part = features[in_test]

    # Standardisation: statistics are fitted on the training rows only and then
    # reused for validation and test.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_part)
    val_scaled = scaler.transform(val_part)
    test_scaled = scaler.transform(test_part)

    X_train.append(train_scaled)
    X_val.append(val_scaled)
    X_test.append(test_scaled)

    # Labels are selected through the row labels of each feature partition.
    y_train.append(y.loc[train_part.index])
    y_val.append(y.loc[val_part.index])
    y_test.append(y.loc[test_part.index])



    

In [6]:
# Relative size of each partition, measured on the dataset stored at position 1.
total = sum(len(partition[1]) for partition in (X_train, X_val, X_test))
print(f"Tamaño de los sets: \n Set training {len(X_train[1])/total} \n Set val {len(X_val[1])/total} \n Set test {len(X_test[1])/total}")


Tamaño de los sets: 
 Set training 0.6466506753384742 
 Set val 0.17185311589419966 
 Set test 0.18149620876732617


In [7]:
# Check the class balance of each partition (first dataset only); every split is
# expected to be perfectly balanced.
for labels in (y_train[0], y_val[0], y_test[0]):
    print(sum(labels == 1) / len(labels))

0.5
0.5
0.5


In [8]:
# Now that the best hyperparameters are known, build a model on top of them.
class MetaEnsamble:
    """Stacked ensemble: one AdaBoost model per dataset plus a meta-classifier.

    ``fitAdaBoost`` trains one AdaBoost classifier per entry of
    ``hiperparameters``. ``fit`` then trains a meta-classifier whose input
    features are the class predictions of those base models (one column per
    base model), and ``predict`` chains both levels.

    Parameters
    ----------
    hiperparameters : dict
        Maps the 1-based dataset number to ``(max_depth, n_estimators)`` for
        the AdaBoost model trained on that dataset.
    """

    # Values accepted by the ``mtype`` argument of ``fit``.
    _META_TYPES = ("logistic", "mlp", "svm-linear", "svm-rbf", "adaboost")

    def __init__(self, hiperparameters):
        self.hiperparameters = hiperparameters
        self.models = []    # base AdaBoost models, filled in by fitAdaBoost()
        self.model = None   # meta-classifier, set by fit()

    def fitAdaBoost(self, X_train, y_train):
        """Train one AdaBoost model per dataset.

        ``X_train[i]`` / ``y_train[i]`` are the features and labels of the
        dataset whose hyperparameters are stored under key ``i + 1``.
        """
        self.models = []
        for i in range(len(self.hiperparameters)):
            depth, t = self.hiperparameters[i+1]
            model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=depth), n_estimators=t, random_state=0)
            model.fit(X_train[i], y_train[i])
            self.models.append(model)

    def _stack_predictions(self, X_parts):
        """Return a DataFrame with one column of predictions per base model."""
        if not self.models:
            raise AttributeError("base models are not fitted; call fitAdaBoost() first")
        columns = [
            pd.DataFrame(base_model.predict(X_parts[i]))
            for i, base_model in enumerate(self.models)
        ]
        return pd.concat(columns, axis=1)

    def _build_meta_model(self, mtype, nu, depth, n_estimators):
        """Instantiate the (unfitted) meta-classifier selected by ``mtype``."""
        if mtype == "logistic":
            return LogisticRegression()
        if mtype == "mlp":
            return MLPClassifier(hidden_layer_sizes=(16, 16, 16), activation="relu", max_iter=2000, random_state=0)
        if mtype == "svm-linear":
            # NuSVC defaults to an RBF kernel, so the linear kernel must be
            # requested explicitly; otherwise this option equals "svm-rbf".
            return NuSVC(nu=nu, kernel="linear")
        if mtype == "svm-rbf":
            return NuSVC(nu=nu, kernel="rbf")
        if mtype == "adaboost":
            return AdaBoostClassifier(DecisionTreeClassifier(max_depth=depth), n_estimators=n_estimators, random_state=0)
        raise ValueError(f"unknown mtype {mtype!r}; expected one of {self._META_TYPES}")

    def fit(self, X_train, y_train, mtype = "logistic", nu = 0.1, depth = 2, n_estimators = 10):
        """Train the meta-classifier on the base models' predictions.

        Parameters
        ----------
        X_train : sequence
            One feature matrix per base model, in the same order used for
            ``fitAdaBoost``.
        y_train : sequence
            Label containers; only ``y_train[0]`` is used as the target, so the
            rows of every dataset are assumed to be aligned with it -- the
            class does not verify this.
        mtype : str
            Kind of meta-classifier: "logistic", "mlp", "svm-linear",
            "svm-rbf" or "adaboost".
        nu : float
            ``nu`` parameter for the two SVM variants.
        depth, n_estimators : int
            Tree depth and number of estimators for the "adaboost" variant.

        Raises
        ------
        ValueError
            If ``mtype`` is not one of the supported names.
        AttributeError
            If ``fitAdaBoost`` has not produced any base model yet.
        """
        # Validate mtype first so a typo fails fast instead of silently
        # leaving the previous (or no) meta-classifier in place.
        meta_model = self._build_meta_model(mtype, nu, depth, n_estimators)
        X = self._stack_predictions(X_train)
        meta_model.fit(X, y_train[0])
        self.model = meta_model

    def predict(self, X_test):
        """Predict with the meta-classifier from the base models' predictions.

        ``X_test`` holds one feature matrix per base model, in training order.
        """
        if self.model is None:
            raise AttributeError("meta-classifier is not fitted; call fit() first")
        return self.model.predict(self._stack_predictions(X_test))

In [9]:
# Precomputed hyperparameters, keyed by the 1-based dataset number.
# Each value is a (max_depth, n_estimators) pair.

# Without normalization
# hiperparameters = {1: (2, 27), 2: (3, 24), 3: (2, 21), 4: (3, 30), 
#                     5: (2, 18), 6: (3, 30), 7: (2, 30), 8: (2, 27), 
#                     9: (2, 27), 10: (3, 15), 11: (2, 24), 12: (3, 15), 
#                     13: (2, 30), 14: (2, 27), 15: (2, 27)}

# With normalization (pairs listed in dataset order, numbering starts at 1)
hiperparameters = dict(enumerate(
    [
        (2, 27), (3, 24), (2, 21), (3, 30),
        (2, 42), (2, 36), (2, 33), (2, 42),
        (2, 45), (3, 15), (2, 24), (3, 15),
        (2, 42), (2, 27), (2, 45),
    ],
    start=1,
))

In [10]:
# Build the meta-ensemble with a logistic-regression meta-classifier and
# evaluate it on the validation partition.
modelLogistic = MetaEnsamble(hiperparameters)
modelLogistic.fitAdaBoost(X_train, y_train)
modelLogistic.fit(X_train, y_train, mtype="logistic")

y_pred = modelLogistic.predict(X_val)
# Accuracy, confusion matrix and per-class report against the first label set.
for report in (accuracy_score, confusion_matrix, classification_report):
    print(report(y_val[0], y_pred))

0.6693208430913349
[[7098 3577]
 [3483 7192]]
              precision    recall  f1-score   support

           0       0.67      0.66      0.67     10675
           1       0.67      0.67      0.67     10675

    accuracy                           0.67     21350
   macro avg       0.67      0.67      0.67     21350
weighted avg       0.67      0.67      0.67     21350



In [11]:
# Build the meta-ensemble with an MLP meta-classifier and evaluate it on the
# validation partition. The base AdaBoost models are fitted again for this
# separate instance.
modelMLP = MetaEnsamble(hiperparameters)
modelMLP.fitAdaBoost(X_train, y_train)
modelMLP.fit(X_train, y_train, mtype="mlp")

y_pred = modelMLP.predict(X_val)
# Accuracy, confusion matrix and per-class report against the first label set.
for report in (accuracy_score, confusion_matrix, classification_report):
    print(report(y_val[0], y_pred))

0.6645433255269321
[[7057 3618]
 [3544 7131]]
              precision    recall  f1-score   support

           0       0.67      0.66      0.66     10675
           1       0.66      0.67      0.67     10675

    accuracy                           0.66     21350
   macro avg       0.66      0.66      0.66     21350
weighted avg       0.66      0.66      0.66     21350



# Testeo

In [12]:
# Test-set evaluation of the logistic meta-ensemble.

y_pred = modelLogistic.predict(X_test)
for report in (accuracy_score, confusion_matrix, classification_report):
    print(report(y_test[0], y_pred))

0.6462657441901721
[[7341 3933]
 [4043 7231]]
              precision    recall  f1-score   support

           0       0.64      0.65      0.65     11274
           1       0.65      0.64      0.64     11274

    accuracy                           0.65     22548
   macro avg       0.65      0.65      0.65     22548
weighted avg       0.65      0.65      0.65     22548



In [13]:
# Test-set evaluation of the MLP meta-ensemble.

y_pred = modelMLP.predict(X_test)
for report in (accuracy_score, confusion_matrix, classification_report):
    print(report(y_test[0], y_pred))

0.640722015256342
[[7283 3991]
 [4110 7164]]
              precision    recall  f1-score   support

           0       0.64      0.65      0.64     11274
           1       0.64      0.64      0.64     11274

    accuracy                           0.64     22548
   macro avg       0.64      0.64      0.64     22548
weighted avg       0.64      0.64      0.64     22548

