In [107]:
import pandas as pd
import numpy as np
from copy import deepcopy

from keras.models import Sequential
from keras.layers import Dense
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC

In [108]:
index_clean = []
df_list = []
for i in range(15):
    df = pd.read_csv(f"Datasets/Duplicated{i+1}MeanPastMatches.csv")
    df.drop_duplicates(inplace=True)
    df.dropna(inplace=True)
    index_limpio = set(df["index"])

    df_list.append(df)
    index_clean.append(index_limpio)

In [96]:
init_set = index_clean[0]
for i in index_clean:
    init_set = init_set.intersection(i)
final_set = list(init_set)

In [97]:
len(final_set)

62093

In [98]:
# En este caso X_train, X_val y X_test son listas de dataframes
X_train, X_val, X_test = [], [], []
y_train, y_val, y_test = [], [], []

for i in range(15):
    df = df_list[i][df_list[i]["index"].isin(final_set)]
    df = pd.get_dummies(df, columns=["surface", "player1_hand", "player2_hand", "best_of"], dtype=int)
    X = df.drop(["label"],axis = 1)
    y = df["label"]

    X_train_df = X[X["tourney_date"] < 20150101].drop("tourney_date", axis=1) # 2000 - 2014
    X_val_df = X[(X["tourney_date"]>= 20150101) & (X["tourney_date"] < 20190101)].drop("tourney_date", axis=1) # 2015 - 2018
    X_test_df = X[X["tourney_date"] >= 20190101].drop("tourney_date", axis=1) # 2018 - 2024

    y_train_aux, y_val_aux, y_test_aux = y[X_train_df.index], y[X_val_df.index], y[X_test_df.index]
    
    X_train.append(X_train_df)
    X_val.append(X_val_df)
    X_test.append(X_test_df)

    y_train.append(y_train_aux)
    y_val.append(y_val_aux)
    y_test.append(y_test_aux)



    

In [99]:
total = len(X_train[1]) + len(X_val[1]) + len(X_test[1])
print(f"Tamaño de los sets: \n Set training {len(X_train[1])/total} \n Set val {len(X_val[1])/total} \n Set test {len(X_test[1])/total}")


Tamaño de los sets: 
 Set training 0.6465140998180149 
 Set val 0.17191954004477156 
 Set test 0.18156636013721353


In [100]:
# Veamos el desbalance en cada particion, deberia ser totalmente balanceada

print(sum(y_train[0]==1) / len(y_train[0]))

print(sum(y_val[0]==1) / len(y_val[0]))

print(sum(y_test[0]==1) / len(y_test[0]))

0.5
0.5
0.5


# Modelos

In [106]:
range_T = [3*(i+1) for i in range(10)] # Modelos base
hiperparameters = {}
for i in range(len(X_train)):
    print(f"\nSet número {i+1}")
    # Por cada info
    set_X_train = X_train[i]
    set_X_val = X_val[i]

    set_y_train = y_train[i]
    set_y_val = y_val[i]

    best_tuple = ()
    best_acc = 0


    for t_ in range_T:
        clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=t_, random_state=0)
        clf.fit(set_X_train, set_y_train)
        y_pred = clf.predict(set_X_val)
        acc = accuracy_score(set_y_val, y_pred)
        if acc > best_acc:
            best_acc = acc
            best_tuple = (2 , t_)
            print(f"Nuevo mejor modelo {best_tuple}, con {best_acc}")
        
    for t_ in range_T:
        clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=t_, random_state=0)
        clf.fit(set_X_train, set_y_train)
        y_pred = clf.predict(set_X_val)
        acc = accuracy_score(set_y_val, y_pred)
        if acc > best_acc:
            best_acc = acc
            best_tuple = (3 , t_)
            print(f"Nuevo mejor modelo {best_tuple}, con {best_acc}")
    for t_ in range_T:
        clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4), n_estimators=t_, random_state=0)
        clf.fit(set_X_train, set_y_train)
        y_pred = clf.predict(set_X_val)
        acc = accuracy_score(set_y_val, y_pred)
        if acc > best_acc:
            best_acc = acc
            best_tuple = (4 , t_)
            print(f"Nuevo mejor modelo {best_tuple}, con {best_acc}")

    #Guardamos los hiperparámetros
    hiperparameters[i+1] = best_tuple



Set número 1
Nuevo mejor modelo (2, 3), con 0.6453864168618267
Nuevo mejor modelo (2, 6), con 0.6466510538641687
Nuevo mejor modelo (2, 9), con 0.6492740046838408
Nuevo mejor modelo (2, 12), con 0.6518032786885246
Nuevo mejor modelo (2, 15), con 0.6550351288056206

Set número 2
Nuevo mejor modelo (2, 3), con 0.6439812646370023
Nuevo mejor modelo (2, 6), con 0.6457611241217799
Nuevo mejor modelo (2, 9), con 0.6506791569086651
Nuevo mejor modelo (2, 12), con 0.6513348946135832
Nuevo mejor modelo (2, 15), con 0.6523185011709602
Nuevo mejor modelo (2, 21), con 0.6527400468384075
Nuevo mejor modelo (2, 24), con 0.6532084309133489
Nuevo mejor modelo (2, 27), con 0.6534426229508197

Set número 3
Nuevo mejor modelo (2, 3), con 0.6441686182669789
Nuevo mejor modelo (2, 6), con 0.645480093676815
Nuevo mejor modelo (2, 9), con 0.6533021077283372
Nuevo mejor modelo (2, 12), con 0.6540046838407494
Nuevo mejor modelo (2, 15), con 0.6543325526932084
Nuevo mejor modelo (2, 18), con 0.6553161592505855

KeyboardInterrupt: 

In [93]:
#Ahora que tenemos los mejores hiperparámetros, hacemos un modelo a partir de ese
class MetaEnsamble:

    def __init__(self, hiperparameters):
        self.hiperparameters = hiperparameters

    def fitAdaBoost(self, X_train, y_train):
        self.models = []
        for i in range(len(self.hiperparameters)):
            depth, t = self.hiperparameters[i+1]
            model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=depth), n_estimators=t, random_state=0)
            model.fit(X_train[i], y_train[i])
            self.models.append(model)

    def fit(self, X_train, y_train, mtype = "logistic", nu = 0.1, depth = 2, n_estimators = 10): # mtype es el tipo de MetaEnsamble a probar
        X = []
        for i in range(len(self.models)):
            prediction = self.models[i].predict(X_train[i])
            X.append(pd.DataFrame(prediction))
        X = pd.concat(X, axis=1)

        if mtype == "logistic":
            self.model = LogisticRegression()
            self.model.fit(X, y_train[0])

        if mtype == "mlp":
            self.model = MLPClassifier(hidden_layer_sizes=(64, 64, 64), activation= "relu", max_iter=1000, random_state=0)
            self.model.fit(X, y_train[0])
        
        if mtype == "svm-linear":
            self.model = NuSVC(nu = nu)
            self.model.fit(X, y_train[0])

        if mtype == "adaboost":
            self.model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=depth), n_estimators=n_estimators, random_state=0)
            self.model.fit(X, y_train[0])

    def predict(self, X_test):
        X = []
        for i in range(len(self.models)):
            prediction = self.models[i].predict(X_test[i])
            X.append(pd.DataFrame(prediction))
        X = pd.concat(X, axis=1)
        return self.model.predict(X)

In [None]:
# hiperparameters = {1:(2, 15), 2:(2, 35), 3:(2, 20),
#                    4:(2, 25), 5:(2, 15), 6:(2, 20),
#                    7:(2, 20), 8:(2, 25), 9:(2, 35),
#                    10:(2, 15)}

In [None]:
#Ahora, hacemos el meta-ensamble:
modelLogistic = MetaEnsamble(hiperparameters)
modelLogistic.fitAdaBoost(X_train, y_train)
modelLogistic.fit(X_train, y_train, mtype="logistic")

y_pred = modelLogistic.predict(X_val)
accuracy_score(y_val[0], y_pred)

0.6585948477751756

In [68]:
#Ahora, hacemos el meta-ensamble:
modelMLP = MetaEnsamble(hiperparameters)
modelMLP.fitAdaBoost(X_train, y_train)
modelMLP.fit(X_train, y_train, mtype="mlp")

y_pred = modelMLP.predict(X_val)
accuracy_score(y_val[0], y_pred)

0.6544730679156908

In [74]:
#Ahora, hacemos el meta-ensamble:
modelSVM = MetaEnsamble(hiperparameters)
modelSVM.fitAdaBoost(X_train, y_train)
modelSVM.fit(X_train, y_train, mtype="svm-linear", nu = 0.1)

y_pred = modelSVM.predict(X_val)
accuracy_score(y_val[0], y_pred)

0.6431850117096019

In [None]:
range_T = [3*(i+1) for i in range(10)] # Modelos base

for t_ in range_T:
    modelSVM = MetaEnsamble(hiperparameters)
    modelSVM.fitAdaBoost(X_train, y_train)
    modelSVM.fit(X_train, y_train, mtype="adaboost", depth = 2, n_estimators= t_)

    y_pred = modelSVM.predict(X_val)
    print(f"Depth {2}, estimators {t_}")
    print(accuracy_score(y_val[0], y_pred))

    modelSVM = MetaEnsamble(hiperparameters)
    modelSVM.fitAdaBoost(X_train, y_train)
    modelSVM.fit(X_train, y_train, mtype="adaboost", depth = 3, n_estimators= t_)

    y_pred = modelSVM.predict(X_val)
    print(f"Depth {3}, estimators {t_}")
    print(accuracy_score(y_val[0], y_pred))

    modelSVM = MetaEnsamble(hiperparameters)
    modelSVM.fitAdaBoost(X_train, y_train)
    modelSVM.fit(X_train, y_train, mtype="adaboost", depth = 4, n_estimators= t_)

    print(f"Depth {4}, estimators {t_}")
    y_pred = modelSVM.predict(X_val)
    print(accuracy_score(y_val[0], y_pred))