In [1]:
# Funciones importadas

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import tree
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import requests
import lxml.html as lh
from bs4 import BeautifulSoup
import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier       
from sklearn import metrics
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE


### Lectura de csv

In [2]:
# Función de lectura de csv para añadir la funcionalidad de que ponga la columna nombre como index

def read_file(file):
    file = pd.read_csv(file)
    file.set_index(('Name'), inplace=True)
    return file

# Importación de DataFrames

all_star_college = read_file('../input/model_dummies_allstar.csv')
draft19_class = read_file('../input/model_draft19_class.csv')


### Aplicación del modelo

In [5]:
# Función para aplicar modelos de machine learning y determinar su accuracy y su confusion matrix.
# Modelos aplicados: 
    # 1. Linear regresion 
    # 2. Logistic regresion 
    # 3. k-Neighbours k=3 
    # 4. k-Neighbours k=5 
    # 5. RandomForest 
    # 6. Gaussian Method
    # 7. SVC
    # 8. GradientBoostingClassifier

def modelos(data,columnadep):
    X=data.loc[:,data.columns!= columnadep]
    y=data[columnadep]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # Definición de modelos
    
    cls = svm.SVC(gamma='auto', probability=True)
    lr = LogisticRegression(solver ='liblinear',max_iter=500)
    neigh3 = KNeighborsClassifier(n_neighbors=3)
    neigh5 = KNeighborsClassifier(n_neighbors=5)
    rf=RandomForestClassifier(n_estimators=500, criterion='gini')
    gnb = GaussianNB()
    svc = SVC(kernel='rbf', gamma='scale')
    gbc=GradientBoostingClassifier()
    
    # Entrenamiento de modelos
    
    cls.fit(X_train, y_train)
    lr.fit(X_train, y_train)
    neigh3.fit(X_train, y_train) 
    neigh5.fit(X_train, y_train) 
    rf.fit(X_train, y_train) 
    gnb.fit(X_train,y_train)
    svc.fit(X_train,y_train)
    gbc.fit(X_train, y_train)
    
    #Predicción de modelos
    
    y_predcls = cls.predict(X_test)    
    y_predlr = lr.predict(X_test)
    y_pred_neigh3= neigh3.predict(X_test)
    y_pred_neigh5= neigh5.predict(X_test)
    y_pred_rf= rf.predict(X_test)
    y_pred_gnd = gnb.predict(X_test)
    y_pred_svc = svc.predict(X_test)
    y_pred_gbc=gbc.predict(X_test)
    
    # Outputs
    
    # Accuracy
    
    print("Accuracy lineal:",metrics.accuracy_score(y_test, y_predcls))
    print("Accuracy logistics:",metrics.accuracy_score(y_test, y_predlr))
    print("Accuracy neigh3:",metrics.accuracy_score(y_test, y_pred_neigh3))
    print("Accuracy neigh5:",metrics.accuracy_score(y_test, y_pred_neigh5))
    print("Accuracy rf:",metrics.accuracy_score(y_test, y_pred_rf))
    print("Accuracy gnd:",metrics.accuracy_score(y_test, y_pred_gnd))
    print("Accuracy svc:",metrics.accuracy_score(y_test, y_pred_svc))
    print("Accuracy gbc:",metrics.accuracy_score(y_test, y_pred_gbc))
    
    # Confusion matrix

    print("confusion matrix lineal","\n",confusion_matrix(y_test, y_predcls))
    print("confusion matrix logistica","\n",confusion_matrix(y_test, y_predlr))
    print("confusion matrix neigh3","\n",confusion_matrix(y_test, y_pred_neigh3))
    print("confusion matrix neigh5","\n",confusion_matrix(y_test, y_pred_neigh5))
    print("confusion matrix rf","\n",confusion_matrix(y_test, y_pred_rf))
    print("confusion matrix gnd","\n",confusion_matrix(y_test, y_pred_gnd))
    print("confusion matrix svc","\n",confusion_matrix(y_test, y_pred_svc))
    print("confusion matrix gbc","\n", confusion_matrix(y_test, y_pred_gbc))

In [6]:
modelos(all_star_college, 'ALLSTAR_Yes')

Accuracy lineal: 0.9304347826086956
Accuracy logistics: 0.9130434782608695
Accuracy neigh3: 0.9130434782608695
Accuracy neigh5: 0.9130434782608695
Accuracy rf: 0.9130434782608695
Accuracy gnd: 0.8260869565217391
Accuracy svc: 0.9304347826086956
Accuracy gbc: 0.8956521739130435
matriz de confusion lineal 
 [[103   4]
 [  6   2]]
matriz de confusion vecino3 
 [[105   2]
 [  8   0]]
matriz de confusion vecino5 
 [[105   2]
 [  8   0]]
matriz de confusion rf 
 [[105   2]
 [  8   0]]
matriz de confusion gnd 
 [[93 14]
 [ 6  2]]
matriz de confusion svc 
 [[107   0]
 [  8   0]]
matriz de confusion gbc 
 [[102   5]
 [  7   1]]


In [None]:
# Usaremos RandomForest al ser uno de los que mejor accuracy tiene

### Aplicación de over sampling

In [7]:
# Nuestras matrices de confusión nos indican que tenemos datos descompensados, por tanto debemos generar datos sinteticos para entrenar al modelo

def over_sampling(X, y, cls):
    
    #Prueba algunos metodos para corregir el over-sampling
    
    # Devuelve un diccionario con los valores
    
    sampling = {}
    
    metodos = {'SMOTE' : SMOTE(),
               'ADASYN' : ADASYN(),
               'SVMSMOTE': SVMSMOTE()}
    
    for e in metodos:
        X_resample, y_resample = metodos[e].fit_resample(X, y)
        X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_resample,
                                                                    y_resample,
                                                                    test_size = 0.20, 
                                                                    random_state = 10)

        rf.fit(X_train_r, y_train_r)
        rf_predict = rf.predict(X_test_r)
        fpr, tpr, thresholds = metrics.roc_curve(y_test_r, rf_predict)
        area = metrics.auc(fpr, tpr)
        sampling[e] = [area, fpr, tpr]
        
    return sampling

In [8]:
# Creamos copia de top5_college y definimos X e y para el modelo
# Tras esto aplicamos RandomForest llamando a la función

all_star_college = all_star_college.copy()

X = all_star_college
y = all_star_college['ALLSTAR_Yes']

rf=RandomForestClassifier(n_estimators=500, criterion='gini')

metodos_sampling = over_sampling(X, y, rf) # Llamada a la función con X, y y randomforest
metodos_sampling

### Utilizamos RandomForest con el over sampling

In [10]:
# Usamos RandomForest con el over sampling para obtener un DataFrame con todos los valores sinteticos

X_resample, y_resample = SVMSMOTE().fit_resample(X, y)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_resample,
                                                            y_resample,
                                                            test_size = 0.20, 
                                                            random_state = 10)

rf=RandomForestClassifier(n_estimators=500, criterion='gini')
rf.fit(X, y)

# Aplicamos el modelo

y_pred_over = rf.predict(X_test_r)

over_samplingDF = pd.DataFrame(X_test_r)
over_samplingDF.head()

print('Modelo:')
display(over_samplingDF.head())
print('Shape del modelo:')
display(over_samplingDF.shape)

In [12]:
# Aplicamos las columnas que tiene el DataFrame 'top5_college' al nuevo DataFrame

colheaders = top5_college.columns   
    
dictiona = {x: y for x, y in zip(range(18), colheaders)}

over_samplingDF = over_samplingDF.rename(index = str, columns=dictiona)
over_samplingDF.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,28.2,5.8,0.488,1.1,0.338,3.8,0.688,4.7,2.8,0.9,0.4,2.1,2.1,9.5,9.51,0.0,1.0,0.0
1,32.356318,6.837908,0.503195,6.775817,0.39413,5.706137,0.86869,4.337545,3.537545,1.368954,0.218773,2.512636,2.156318,19.85777,-2.041046,1.0,0.0,1.0
2,31.0,4.9,0.611,4.1,0.354,3.6,0.788,5.4,2.4,1.7,1.0,2.5,2.4,13.2,1.39,0.0,1.0,0.0
3,28.1,4.7,0.511,3.8,0.433,1.8,0.7,8.0,0.8,0.7,1.5,1.0,2.3,10.9,10.24,0.0,1.0,0.0
4,29.208422,8.350532,0.523779,2.532624,0.331526,5.133688,0.700462,4.382092,4.809486,1.191578,0.408422,2.733688,1.691578,14.841046,5.658936,1.0,-0.08422,1.08422


### Aplicación de Gridsearch

In [18]:
# Hacemos un gridsearch para obtener los parámetros óptimos para el modelo de RandomForest

X=over_samplingDF.loc[:,over_samplingDF.columns!= 'ALLSTAR_Yes']
y=over_samplingDF['ALLSTAR_Yes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rfc=RandomForestClassifier(random_state=42)

param_grid = {
   'n_estimators': [200, 500],
   'max_features': ['auto', 'sqrt'],
   'max_depth' : [7,8],
   'criterion' :['gini', 'entropy']}

fitting = rf.fit(X_test,y_test)

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)
CV_rfc.best_params_



{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 200}

### Gridsearch aplicado a RF de over sampling

In [19]:
# Con los parámetros obtenidos en el Gridsearch entrenamos el modelo

X=over_samplingDF.loc[:,over_samplingDF.columns!= 'ALLSTAR_Yes']
y=over_samplingDF['ALLSTAR_Yes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rf=RandomForestClassifier(criterion='gini', max_depth=7, max_features='auto', n_estimators=200)
rf.fit(X_train, y_train) 

y_pred_rf= rf.predict(X_test)

print("Accuracy rf:",metrics.accuracy_score(y_test, y_pred_rf))
print("Confusion matrix rf","\n",confusion_matrix(y_test, y_pred_rf))

Accuracy rf: 0.8571428571428571
Confusion matrix rf 
 [[15  2]
 [ 4 21]]


### Aplicado al draft de 2019

In [23]:
prediction = rf.predict(draft19_class)

In [24]:
# Obtenemos la lista de los jugadores que nuestro modelo predice que serán All Star durante su carrera

draft19_class['Prediction'] = prediction
draft19_class.loc[draft19_class['Prediction']==1]

Unnamed: 0_level_0,MP,2PA,2P%,3PA,3P%,FTA,FT%,TRB,AST,STL,BLK,TOV,PF,PTS,SOS,POS_Forward,POS_Guard,Prediction
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
R.J. BARRETT,35.3,12.2,0.529,6.2,0.308,5.9,0.665,7.6,4.3,0.9,0.4,3.2,1.8,22.6,11.97,1,0,1.0
KY BOWMAN,35.5,7.8,0.466,5.8,0.388,3.9,0.762,6.4,3.9,1.3,0.3,3.0,2.6,16.9,7.68,0,1,1.0
JARRETT CULVER,29.5,7.5,0.522,4.1,0.341,4.2,0.687,5.6,2.8,1.3,0.6,2.1,2.0,14.9,9.56,0,1,1.0
DEDRIC LAWSON,32.6,11.6,0.511,2.5,0.393,5.7,0.815,10.3,1.7,1.3,1.1,2.3,2.6,19.4,7.802727,0,1,1.0
JA MORANT,35.3,8.9,0.545,3.8,0.343,6.3,0.81,6.1,8.2,1.4,0.6,3.8,1.7,18.7,-3.32,0,1,1.0
JAYLEN NOWELL,33.5,9.4,0.504,3.2,0.396,3.5,0.789,4.6,2.9,1.2,0.3,2.7,2.5,16.1,6.54,0,1,1.0
RAYJON TUCKER,36.6,6.9,0.558,5.8,0.411,7.0,0.777,6.7,1.8,1.1,0.4,2.6,2.3,20.3,7.802727,0,1,1.0
ZION WILLIAMSON,30.0,11.0,0.747,2.2,0.338,6.2,0.64,8.9,2.1,2.1,1.8,2.4,2.1,22.6,11.97,1,0,1.0
