##  PUC - MG
##  Ciência de Dados e Big Data - 2019
##  TCC - Gilberto Klingen

In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets, tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder



## Aquisição de Dados

In [3]:
def obtem_base_reduzida(base):
    
    # Esta função transforma duas colunas de mesmo significado em apenas uma contendo a diferença do valor relativo
    # ao jogador 1 diminuída do valor relativo ao jogador 2
    
    base['pct_surface_victories_1'] = base['pct_surface_victories_1'] - base['pct_surface_victories_2']
    base['pct_best_of_victories_1'] = base['pct_best_of_victories_1'] - base['pct_best_of_victories_2']
    base['avg_ace_1'] = base['avg_ace_1'] - base['avg_ace_2']
    base['avg_df_1'] = base['avg_df_1'] - base['avg_df_2']
    base['avg_svpt_1'] = base['avg_svpt_1'] - base['avg_svpt_2']
    base['avg_1stIn_1'] = base['avg_1stIn_1'] - base['avg_1stIn_2']
    base['avg_1stWon_1'] = base['avg_1stWon_1'] - base['avg_1stWon_2']
    base['avg_2ndWon_1'] = base['avg_2ndWon_1'] - base['avg_2ndWon_2']
    base['avg_SvGms_1'] = base['avg_SvGms_1'] - base['avg_SvGms_2']
    base['avg_bpSaved_1'] = base['avg_bpSaved_1'] - base['avg_bpSaved_2']
    base['avg_bpFaced_1'] = base['avg_bpFaced_1'] - base['avg_bpFaced_2']
    base['age_1'] = base['age_1'] - base['age_2']
    base['rank_points_1'] = base['rank_points_1'] - base['rank_points_2']
    base['pct_victories_grand_slam_1'] = base['pct_victories_grand_slam_1'] - base['pct_victories_grand_slam_2']
    base['pct_victories_master_1000_1'] = base['pct_victories_master_1000_1'] - base['pct_victories_master_1000_2']
    base['pct_victories_ATP_250_500_1'] = base['pct_victories_ATP_250_500_1'] - base['pct_victories_ATP_250_500_2']
    base['pct_victories_finals_1'] = base['pct_victories_finals_1'] - base['pct_victories_finals_2']
    base['qty_grand_slam_titles_1'] = base['qty_grand_slam_titles_1'] - base['qty_grand_slam_titles_2']
    base['qty_master_1000_titles_1'] = base['qty_master_1000_titles_1'] - base['qty_master_1000_titles_2']
    base['qty_ATP_250_500_titles_1'] = base['qty_ATP_250_500_titles_1'] - base['qty_ATP_250_500_titles_2']
    base['qty_finals_titles_1'] = base['qty_finals_titles_1'] - base['qty_finals_titles_2']

    base['pct_player_1_surface_victories'] = base['pct_player_1_surface_victories'] - base['pct_player_2_surface_victories']
    base['pct_player_1_best_of_victories'] = base['pct_player_1_best_of_victories'] - base['pct_player_2_best_of_victories']
    base['pct_player_1_grand_slam_victories'] = base['pct_player_1_grand_slam_victories'] - base['pct_player_2_grand_slam_victories']
    base['pct_player_1_master_1000_victories'] = base['pct_player_1_master_1000_victories'] - base['pct_player_2_master_1000_victories']
    base['pct_player_1_finals_victories'] = base['pct_player_1_finals_victories'] - base['pct_player_2_finals_victories']
    base['pct_player_1_ATP_250_500_victories'] = base['pct_player_1_ATP_250_500_victories'] - base['pct_player_2_ATP_250_500_victories']

    base = base.drop(['pct_surface_victories_2','pct_best_of_victories_2','avg_ace_2','avg_df_2','avg_svpt_2','avg_1stIn_2',
                      'avg_1stWon_2','avg_2ndWon_2','avg_SvGms_2','avg_bpSaved_2','avg_bpFaced_2','age_2','rank_points_2',
                      'pct_victories_grand_slam_2','pct_victories_master_1000_2','pct_victories_ATP_250_500_2',
                      'pct_victories_finals_2','qty_grand_slam_titles_2','qty_master_1000_titles_2','qty_ATP_250_500_titles_2',
                      'qty_finals_titles_2','pct_player_2_surface_victories','pct_player_2_best_of_victories',
                      'pct_player_2_grand_slam_victories','pct_player_2_master_1000_victories','pct_player_2_finals_victories',
                      'pct_player_2_ATP_250_500_victories'],axis=1)
    
    return base

In [4]:
# Obtem a base completa de jogos para análise

base_completa = pd.read_excel(r'C:\Giba\Dados\_A_Giba\Documentos\Adm\PUC_MG\Python\Bases_ML\base_completa.xlsx') 

In [5]:
# Gera a base completa redefinida, onde os valores irão de 0,0 a 1,0 e exclui as colunas não relevantes

base_redefinida = base_completa.drop(['tourney_id','tourney_date','match_num','round','surface','tourney_level',
                                      'best_of','AvgW','AvgL','player_id_1','player_id_2'],axis=1)
base_redefinida = base_redefinida.apply(lambda x: round(x / 100,4))
base_redefinida['rank_points_1'] = base_redefinida['rank_points_1'].apply(lambda x: round(x / 1000,4))
base_redefinida['rank_points_2'] = base_redefinida['rank_points_2'].apply(lambda x: round(x / 1000,4))
base_redefinida['winner'] = base_redefinida['winner'].apply(lambda x: x * 100)
base_redefinida.to_excel(r'C:\Giba\Dados\_A_Giba\Documentos\Adm\PUC_MG\Python\Bases_ML\base_redefinida.xlsx')

In [6]:
# Gera a base completa reduzida, onde as colunas do jogador 1 e 2 foram reduzidas a apenas uma contendo a diferença
# de seus valores

base_ML = obtem_base_reduzida(base_redefinida)

## Gera o Modelo de Machine Learning utilizando o algoritmo de regressão logística

In [None]:
from sklearn.linear_model import LogisticRegression

In [8]:
# Separa as colunas de dimensões da base

X = base_ML.iloc[:,0:(base_ML.shape[1] - 1)]

In [9]:
# Separa a coluna de classe de classificação da base

le = LabelEncoder()
y = le.fit_transform(base_ML.iloc[:,(base_ML.shape[1] - 1)])

In [112]:
# Faz a separação das bases de treinamento e teste (20% da base será para treinamento)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0 )

In [113]:
# Executa o treinamento do modelo

LogisticRegr = LogisticRegression(max_iter=1000)
LogisticRegr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [114]:
# Aplica o modelo na base de teste

y_pred = LogisticRegr.predict(x_test)

In [115]:
# Verifica a acurácia do modelo

score = accuracy_score(y_test, y_pred)
print(score)

0.7799417953884038


In [116]:
# Verifica a performance de classificação do modelo

print(classification_report( y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.81      0.78      4433
           1       0.80      0.75      0.77      4501

    accuracy                           0.78      8934
   macro avg       0.78      0.78      0.78      8934
weighted avg       0.78      0.78      0.78      8934



In [117]:
# Verifica a matriz de confusão do modelo

confusion = confusion_matrix(y_test, y_pred)
cnf_table = pd.DataFrame(data=confusion, index=["Player_1", "Player_2"], columns=["Player_1 (prev)", "Player_2 (prev)"])
print(cnf_table)

          Player_1 (prev)  Player_2 (prev)
Player_1             3585              848
Player_2             1118             3383


In [118]:
# Gera a matriz de probabilidades da base completa

prb = LogisticRegression(random_state=0,max_iter=1000).fit(X, y)
prb.predict_proba(X)

array([[0.04522779, 0.95477221],
       [0.65610069, 0.34389931],
       [0.69694679, 0.30305321],
       ...,
       [0.59641974, 0.40358026],
       [0.32990118, 0.67009882],
       [0.70340496, 0.29659504]])

In [119]:
# Salva a matriz de probabilidades em uma planilha Excel

df_odds = pd.DataFrame(prb.predict_proba(X))
df_odds.to_excel("C:\\Giba\Dados\\_A_Giba\\Documentos\\Adm\\PUC_MG\\Python\\Bases_ML\\odds_predict_logistic_regression.xlsx")

## Gera o Modelo de Machine Learning utilizando o algoritmo de árvore de decisão

In [120]:
# As etapas de separação das colunas de dimensão e da coluna de classe de classificação da base e
# separação das bases de treinamento e teste já foram executadas na criação do modelo através do algoritmo de 
# Logistic Regression

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [121]:
# Executa o treinamento do modelo

# odds_tree = DecisionTreeClassifier(random_state=0, criterion='gini',min_samples_leaf=5,min_samples_split=5,max_depth=None)
odds_tree = DecisionTreeClassifier(random_state=0, criterion='entropy')
odds_tree = odds_tree.fit(x_train, y_train)

In [122]:
# Aplica o modelo na base de teste

y_pred = odds_tree.predict(x_test)

In [123]:
# Verifica a acurácia do modelo

score = accuracy_score(y_test, y_pred)
print(score)

0.9937318110588762


In [124]:
# Verifica a performance de classificação do modelo

print(classification_report(y_test, y_pred, target_names=["Player_1", "Player_2"]))

              precision    recall  f1-score   support

    Player_1       1.00      0.99      0.99      4433
    Player_2       0.99      1.00      0.99      4501

    accuracy                           0.99      8934
   macro avg       0.99      0.99      0.99      8934
weighted avg       0.99      0.99      0.99      8934



In [125]:
# Verifica a matriz de confusão do modelo

cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_table = pd.DataFrame(data=cnf_matrix, index=["Player_1", "Player_2"], columns=["Player_1 (prev)", "Player_2 (prev)"])
print(cnf_table)

          Player_1 (prev)  Player_2 (prev)
Player_1             4399               34
Player_2               22             4479


In [126]:
# Gera a matriz de probabilidades da base completa

prb = DecisionTreeClassifier(random_state=0, criterion='entropy').fit(X, y)
prb.predict_proba(X)

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

## Gera o Modelo de Machine Learning utilizando o algoritmo de Naive Bayes

In [None]:
# As etapas de separação das colunas de dimensão e da coluna de classe de classificação da base e
# separação das bases de treinamento e teste já foram executadas na criação do modelo através do algoritmo de 
# Logistic Regression

In [127]:
from sklearn.naive_bayes import GaussianNB

In [128]:
# Executa o treinamento do modelo

gnb = GaussianNB()
gnb.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [129]:
# Aplica o modelo na base de teste

y_pred = gnb.predict(x_test)

In [130]:
# Verifica a acurácia do modelo

score = accuracy_score(y_test, y_pred)
print(score)

0.9975374972017014


In [131]:
# Verifica a performance de classificação do modelo

print(classification_report(y_test, y_pred, target_names=["Player_1", "Player_2"]))

              precision    recall  f1-score   support

    Player_1       1.00      1.00      1.00      4433
    Player_2       1.00      1.00      1.00      4501

    accuracy                           1.00      8934
   macro avg       1.00      1.00      1.00      8934
weighted avg       1.00      1.00      1.00      8934



In [132]:
# Verifica a matriz de confusão do modelo

cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_table = pd.DataFrame(data=cnf_matrix, index=["Player_1", "Player_2"], columns=["Player_1 (prev)", "Player_2 (prev)"])
print(cnf_table)

          Player_1 (prev)  Player_2 (prev)
Player_1             4433                0
Player_2               22             4479


In [133]:
# Gera a matriz de probabilidades da base completa

prb = DecisionTreeClassifier(random_state=0, criterion='entropy').fit(X, y)
prb.predict_proba(X)

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])