In [1]:
# Funciones importadas

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import tree
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import requests
import lxml.html as lh
from bs4 import BeautifulSoup
import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier       
from sklearn import metrics
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE


### Lectura de csv

In [2]:
# Función de lectura de csv para añadir la funcionalidad de que ponga la columna nombre como index

def read_file(file):
    file = pd.read_csv(file)
    file.set_index(('Name'), inplace=True)
    return file

# Importación de DataFrames

top5_college = read_file('../input/model_dummies_college_nba.csv')
draft19_class = read_file('../input/model_draft19_class.csv')


### Aplicación del modelo

In [3]:
# Función para aplicar modelos de machine learning y determinar su accuracy y su confusion matrix.
# Modelos aplicados: 
    # 1. Linear regresion 
    # 2. Logistic regresion 
    # 3. k-Neighbours k=3 
    # 4. k-Neighbours k=5 
    # 5. RandomForest 
    # 6. Gaussian Method
    # 7. SVC
    # 8. GradientBoostingClassifier

def modelos(data,columnadep):
    X=data.loc[:,data.columns!= columnadep]
    y=data[columnadep]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # Definición de modelos
    
    cls = svm.SVC(gamma='auto', probability=True)
    lr = LogisticRegression(solver ='liblinear',max_iter=500)
    neigh3 = KNeighborsClassifier(n_neighbors=3)
    neigh5 = KNeighborsClassifier(n_neighbors=5)
    rf=RandomForestClassifier(n_estimators=500, criterion='gini')
    gnb = GaussianNB()
    svc = SVC(kernel='rbf', gamma='scale')
    gbc=GradientBoostingClassifier()
    
    # Entrenamiento de modelos
    
    cls.fit(X_train, y_train)
    lr.fit(X_train, y_train)
    neigh3.fit(X_train, y_train) 
    neigh5.fit(X_train, y_train) 
    rf.fit(X_train, y_train) 
    gnb.fit(X_train,y_train)
    svc.fit(X_train,y_train)
    gbc.fit(X_train, y_train)
    
    #Predicción de modelos
    
    y_predcls = cls.predict(X_test)    
    y_predlr = lr.predict(X_test)
    y_pred_neigh3= neigh3.predict(X_test)
    y_pred_neigh5= neigh5.predict(X_test)
    y_pred_rf= rf.predict(X_test)
    y_pred_gnd = gnb.predict(X_test)
    y_pred_svc = svc.predict(X_test)
    y_pred_gbc=gbc.predict(X_test)
    
    # Outputs
    
    # Accuracy
    
    print("Accuracy lineal:",metrics.accuracy_score(y_test, y_predcls))
    print("Accuracy logistics:",metrics.accuracy_score(y_test, y_predlr))
    print("Accuracy neigh3:",metrics.accuracy_score(y_test, y_pred_neigh3))
    print("Accuracy neigh5:",metrics.accuracy_score(y_test, y_pred_neigh5))
    print("Accuracy rf:",metrics.accuracy_score(y_test, y_pred_rf))
    print("Accuracy gnd:",metrics.accuracy_score(y_test, y_pred_gnd))
    print("Accuracy svc:",metrics.accuracy_score(y_test, y_pred_svc))
    print("Accuracy gbc:",metrics.accuracy_score(y_test, y_pred_gbc))
    
    # Confusion matrix

    print("confusion matrix lineal","\n",confusion_matrix(y_test, y_predcls))
    print("confusion matrix logistica","\n",confusion_matrix(y_test, y_predlr))
    print("confusion matrix neigh3","\n",confusion_matrix(y_test, y_pred_neigh3))
    print("confusion matrix neigh5","\n",confusion_matrix(y_test, y_pred_neigh5))
    print("confusion matrix rf","\n",confusion_matrix(y_test, y_pred_rf))
    print("confusion matrix gnd","\n",confusion_matrix(y_test, y_pred_gnd))
    print("confusion matrix svc","\n",confusion_matrix(y_test, y_pred_svc))
    print("confusion matrix gbc","\n", confusion_matrix(y_test, y_pred_gbc))

In [4]:
modelos(top5_college, 'TOP-5_Top5')

Accuracy lineal: 0.9043478260869565
Accuracy logistics: 0.8608695652173913
Accuracy neigh3: 0.8956521739130435
Accuracy neigh5: 0.8869565217391304
Accuracy rf: 0.9043478260869565
Accuracy gnd: 0.8434782608695652
Accuracy svc: 0.9043478260869565
Accuracy gbc: 0.8956521739130435
matriz de confusion lineal 
 [[104   0]
 [ 11   0]]
matriz de confusion logistica 
 [[99  5]
 [11  0]]
matriz de confusion vecino3 
 [[102   2]
 [ 10   1]]
matriz de confusion vecino5 
 [[102   2]
 [ 11   0]]
matriz de confusion rf 
 [[104   0]
 [ 11   0]]
matriz de confusion gnd 
 [[95  9]
 [ 9  2]]
matriz de confusion svc 
 [[104   0]
 [ 11   0]]
matriz de confusion gbc 
 [[103   1]
 [ 11   0]]


In [5]:
# Usaremos RandomForest al ser uno de los que mejor accuracy tiene

### Aplicación de over sampling

In [6]:
# Nuestras matrices de confusión nos indican que tenemos datos descompensados, por tanto debemos generar datos sinteticos para entrenar al modelo

def over_sampling(X, y, cls):
    
    #Prueba algunos metodos para corregir el over-sampling
    
    # Devuelve un diccionario con los valores
    
    sampling = {}
    
    metodos = {'SMOTE' : SMOTE(),
               'ADASYN' : ADASYN(),
               'SVMSMOTE': SVMSMOTE()}
    
    for e in metodos:
        X_resample, y_resample = metodos[e].fit_resample(X, y)
        X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_resample,
                                                                    y_resample,
                                                                    test_size = 0.20, 
                                                                    random_state = 10)

        rf.fit(X_train_r, y_train_r)
        rf_predict = rf.predict(X_test_r)
        fpr, tpr, thresholds = metrics.roc_curve(y_test_r, rf_predict)
        area = metrics.auc(fpr, tpr)
        sampling[e] = [area, fpr, tpr]
        
    return sampling

In [7]:
# Creamos copia de top5_college y definimos X e y para el modelo
# Tras esto aplicamos RandomForest llamando a la función

top5_college = top5_college.copy()

X = top5_college
y = top5_college['TOP-5_Top5']

rf=RandomForestClassifier(n_estimators=500, criterion='gini')

metodos_sampling = over_sampling(X, y, rf) # Llamada a la función con X, y y randomforest
metodos_sampling

{'SMOTE': [1.0, array([0., 0., 1.]), array([0., 1., 1.])],
 'ADASYN': [1.0, array([0., 0., 1.]), array([0., 1., 1.])],
 'SVMSMOTE': [1.0, array([0., 0., 1.]), array([0., 1., 1.])]}

### Utilizamos RandomForest con el over sampling

In [8]:
# Usamos RandomForest con el over sampling para obtener un DataFrame con todos los valores sinteticos

X_resample, y_resample = SVMSMOTE().fit_resample(X, y)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_resample,
                                                            y_resample,
                                                            test_size = 0.20, 
                                                            random_state = 10)

rf=RandomForestClassifier(n_estimators=500, criterion='gini')
rf.fit(X, y)

# Aplicamos el modelo

y_pred_over = rf.predict(X_test_r)

over_samplingDF = pd.DataFrame(X_test_r)
over_samplingDF.head()

print('Modelo:')
display(over_samplingDF.head())
print('Shape del modelo:')
display(over_samplingDF.shape)

Modelo:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,33.9,5.1,0.467,4.6,0.375,3.5,0.731,4.1,6.9,1.9,0.3,3.6,2.3,12.5,0.0,1.0,1.0
1,31.316777,14.679119,0.564374,2.77924,0.377959,8.54997,0.772251,12.454134,1.195836,1.275015,1.587508,2.904164,2.624985,26.216657,1.0,0.0,1.0
2,35.4,7.8,0.524,5.0,0.383,4.0,0.782,5.8,2.8,1.3,0.4,2.1,1.8,17.0,1.0,0.0,0.0
3,32.2,9.4,0.579,0.2,0.0,4.5,0.564,8.4,0.8,0.6,1.5,2.5,2.1,13.4,0.0,0.0,0.0
4,30.608761,14.945583,0.574004,2.572606,0.386639,8.809205,0.786368,12.836525,1.018114,1.245434,1.72732,3.02732,2.672754,26.927542,1.0,0.0,1.0


Shape del modelo:


(206, 17)

In [9]:
# Aplicamos las columnas que tiene el DataFrame 'top5_college' al nuevo DataFrame

colheaders = top5_college.columns   
    
dictiona = {x: y for x, y in zip(range(18), colheaders)}

over_samplingDF = over_samplingDF.rename(index = str, columns=dictiona)
over_samplingDF.head()

Unnamed: 0,MP,2PA,2P%,3PA,3P%,FTA,FT%,TRB,AST,STL,BLK,TOV,PF,PTS,POS_Forward,POS_Guard,TOP-5_Top5
0,33.9,5.1,0.467,4.6,0.375,3.5,0.731,4.1,6.9,1.9,0.3,3.6,2.3,12.5,0.0,1.0,1.0
1,31.316777,14.679119,0.564374,2.77924,0.377959,8.54997,0.772251,12.454134,1.195836,1.275015,1.587508,2.904164,2.624985,26.216657,1.0,0.0,1.0
2,35.4,7.8,0.524,5.0,0.383,4.0,0.782,5.8,2.8,1.3,0.4,2.1,1.8,17.0,1.0,0.0,0.0
3,32.2,9.4,0.579,0.2,0.0,4.5,0.564,8.4,0.8,0.6,1.5,2.5,2.1,13.4,0.0,0.0,0.0
4,30.608761,14.945583,0.574004,2.572606,0.386639,8.809205,0.786368,12.836525,1.018114,1.245434,1.72732,3.02732,2.672754,26.927542,1.0,0.0,1.0


### Aplicación de Gridsearch

In [10]:
# Hacemos un gridsearch para obtener los parámetros óptimos para el modelo de RandomForest

X=over_samplingDF.loc[:,over_samplingDF.columns!= 'TOP-5_Top5']
y=over_samplingDF['TOP-5_Top5']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rfc=RandomForestClassifier(random_state=42)

param_grid = {
   'n_estimators': [200, 500],
   'max_features': ['auto', 'sqrt'],
   'max_depth' : [7,8],
   'criterion' :['gini', 'entropy']}

fitting = rf.fit(X_test,y_test)

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)
CV_rfc.best_params_



{'criterion': 'entropy',
 'max_depth': 7,
 'max_features': 'auto',
 'n_estimators': 500}

### Gridsearch aplicado a RF de over sampling

In [11]:
# Con los parámetros obtenidos en el Gridsearch entrenamos el modelo

X=over_samplingDF.loc[:,over_samplingDF.columns!= 'TOP-5_Top5']
y=over_samplingDF['TOP-5_Top5']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rf=RandomForestClassifier(criterion='entropy', max_depth=7, max_features='auto', n_estimators=500)
rf.fit(X_train, y_train) 

y_pred_rf= rf.predict(X_test)

print("Accuracy rf:",metrics.accuracy_score(y_test, y_pred_rf))
print("Confusion matrix rf","\n",confusion_matrix(y_test, y_pred_rf))

Accuracy rf: 0.9285714285714286
Confusion matrix rf 
 [[23  1]
 [ 2 16]]


### Aplicación del modelo al draft de 2019

In [12]:
# Para determinar los 5 jugadores que el modelo selecciona como más probables en el draft de 2019 creamos un bucle que repita el entrenamiento anterior 1000 veces
# Ponemos un contador que expresa en valor 1 si ese jugador sería seleccionable entre los 5 primeros del draft 2019

counter=0
accuracy=0

for i in range(100):
    X=over_samplingDF.loc[:,over_samplingDF.columns!= 'TOP-5_Top5']
    y=over_samplingDF['TOP-5_Top5']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    rf=RandomForestClassifier(criterion='gini', max_depth=7, max_features='auto', n_estimators=200)
    rf.fit(X_train, y_train) 

    y_pred_rf= rf.predict(X_test)

#    accuracy += ("Accuracy rf:",metrics.accuracy_score(y_test, y_pred_rf))
#    print("Confusion matrix rf","\n",confusion_matrix(y_test, y_pred_rf))
    
    counter += rf.predict(draft19_class)
    

In [16]:
# Con los valores obtenidos en el paso anterior creamos una columna, ordenamos esta según la columna Prediction y extraemos los 5 primeros valores

draft19_class['Prediction'] = counter

prediction = draft19_class.sort_values(by=['Prediction'], ascending=False)
prediction.head()

Unnamed: 0_level_0,MP,2PA,2P%,3PA,3P%,FTA,FT%,TRB,AST,STL,BLK,TOV,PF,PTS,POS_Forward,POS_Guard,Prediction
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
DEDRIC LAWSON,32.6,11.6,0.511,2.5,0.393,5.7,0.815,10.3,1.7,1.3,1.1,2.3,2.6,19.4,0,1,91.0
ZION WILLIAMSON,30.0,11.0,0.747,2.2,0.338,6.2,0.64,8.9,2.1,2.1,1.8,2.4,2.1,22.6,1,0,90.0
R.J. BARRETT,35.3,12.2,0.529,6.2,0.308,5.9,0.665,7.6,4.3,0.9,0.4,3.2,1.8,22.6,1,0,71.0
JA MORANT,35.3,8.9,0.545,3.8,0.343,6.3,0.81,6.1,8.2,1.4,0.6,3.8,1.7,18.7,0,1,71.0
BOL BOL,29.8,11.9,0.57,2.8,0.52,4.1,0.757,9.6,1.0,0.8,2.7,2.0,1.7,21.0,0,0,70.0
