In [1]:
# Funciones importadas

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import tree
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier       
from sklearn import metrics
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE


### Lectura de csv

In [2]:
# Función de lectura de CSV que, además, establece la columna 'Name' como índice del DataFrame

def read_file(file):
    """Load a CSV into a DataFrame indexed by its ``Name`` column.

    Parameters
    ----------
    file : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with the ``Name`` column moved into the index.
    """
    frame = pd.read_csv(file)
    return frame.set_index('Name')

# Load the three input datasets; read_file indexes each one by its 'Name' column.
top5_college, draft19_class, all_star_college = (
    read_file(csv_path)
    for csv_path in (
        '../input/model_dummies_college_nba.csv',
        '../input/model_draft19_class.csv',
        '../input/model_dummies_allstar.csv',
    )
)

### Aplicación de modelos

In [3]:
# Función para aplicar modelos de machine learning y determinar su accuracy y su confusion matrix.

def modelos(data, columnadep, test_size=0.2, random_state=None):
    """Fit a set of baseline classifiers and print each one's accuracy and confusion matrix.

    The frame is split into features (every column except ``columnadep``) and
    target (``columnadep``), then divided into train and test parts with a
    plain (non-stratified) ``train_test_split``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column.
    columnadep : str
        Name of the target column.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int or None, default None
        Seed forwarded to the split and to every estimator that accepts one.
        ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    None
        Results are printed only.
    """
    X = data.loc[:, data.columns != columnadep]
    y = data[columnadep]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Models are evaluated in this order; the two SVC entries differ only in
    # their gamma setting and in whether probability estimates are enabled.
    classifiers = [
        svm.SVC(gamma='auto', probability=True,
                random_state=random_state),                    # 1. SVC (gamma='auto')
        LogisticRegression(solver='liblinear', max_iter=500,
                           random_state=random_state),         # 2. Logistic regression
        KNeighborsClassifier(n_neighbors=3),                   # 3. k-Neighbors, k=3
        KNeighborsClassifier(n_neighbors=5),                   # 4. k-Neighbors, k=5
        RandomForestClassifier(n_estimators=500, criterion='gini',
                               random_state=random_state),     # 5. Random forest
        GaussianNB(),                                          # 6. Gaussian naive Bayes
        SVC(kernel='rbf', gamma='scale'),                      # 7. SVC (gamma='scale')
        GradientBoostingClassifier(random_state=random_state), # 8. Gradient boosting
    ]

    for clf in classifiers:
        clf.fit(X_train, y_train)
        y_predict = clf.predict(X_test)
        print(type(clf).__name__)
        print('Accuracy:', "\n", metrics.accuracy_score(y_test, y_predict))
        print("confusion matrix:", "\n", confusion_matrix(y_test, y_predict))
        print('---------------------')

# Baseline run: evaluate every classifier on the college dataset with 'TOP-5_Top5' as the target column.
modelos(top5_college, 'TOP-5_Top5')

SVC
Accuracy: 
 0.8782608695652174
confusion matrix: 
 [[101   0]
 [ 14   0]]
---------------------
LogisticRegression
Accuracy: 
 0.8869565217391304
confusion matrix: 
 [[100   1]
 [ 12   2]]
---------------------
KNeighborsClassifier
Accuracy: 
 0.8608695652173913
confusion matrix: 
 [[98  3]
 [13  1]]
---------------------
KNeighborsClassifier
Accuracy: 
 0.8608695652173913
confusion matrix: 
 [[99  2]
 [14  0]]
---------------------
RandomForestClassifier
Accuracy: 
 0.8782608695652174
confusion matrix: 
 [[100   1]
 [ 13   1]]
---------------------
GaussianNB
Accuracy: 
 0.7304347826086957
confusion matrix: 
 [[82 19]
 [12  2]]
---------------------
SVC
Accuracy: 
 0.8782608695652174
confusion matrix: 
 [[101   0]
 [ 14   0]]
---------------------
GradientBoostingClassifier
Accuracy: 
 0.8869565217391304
confusion matrix: 
 [[101   0]
 [ 13   1]]
---------------------


### Aplicación de oversampling - creación de valores sintéticos

In [4]:
# Para balancear las clases del modelo creamos valores sintéticos por medio de oversampling

def oversampling(data, columnadep, random_state=None):
    """Oversample ``data`` with SVMSMOTE and return a hold-out slice of the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column.
    columnadep : str
        Name of the target column used as the class label for resampling.
    random_state : int or None, default None
        Seed forwarded to ``SVMSMOTE``. ``None`` keeps the original unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        The 20% test split (fixed ``random_state=10``) of the resampled data,
        carrying the same column names as ``data`` (target column included)
        and an index converted to strings.
    """
    # The target column is left inside X on purpose: that way it travels
    # through the resampling and is still present in the returned frame.
    X = data
    y = data[columnadep]

    X_resample, y_resample = SVMSMOTE(random_state=random_state).fit_resample(X, y)

    # NOTE(review): only the 20% test part of the resampled data is kept and
    # returned; the remaining 80% is discarded. Behaviour preserved from the
    # original code — confirm this is intended.
    _, X_test_r, _, _ = train_test_split(X_resample,
                                         y_resample,
                                         test_size=0.20,
                                         random_state=10)

    resampled_df = pd.DataFrame(X_test_r)

    # fit_resample may hand back a plain array (presumably depending on the
    # imbalanced-learn version — TODO confirm), in which case the columns are
    # numbered 0..n-1. Map every position back to the original column name
    # instead of assuming a fixed number of columns.
    position_to_name = dict(enumerate(data.columns))
    resampled_df = resampled_df.rename(index=str, columns=position_to_name)

    return resampled_df


### Aplicación de Gridsearch

In [5]:
# Hacemos un gridsearch para obtener los parámetros óptimos para el modelo de RandomForest

def gridsearch(data, columnadep):
    """Grid-search RandomForest hyper-parameters with 5-fold cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column.
    columnadep : str
        Name of the target column.

    Returns
    -------
    dict
        ``best_params_`` of the fitted ``GridSearchCV`` with the keys
        ``'n_estimators'``, ``'max_features'``, ``'max_depth'`` and
        ``'criterion'``.
    """
    # Build the feature mask from the frame that was passed in rather than
    # from a notebook global, so the function works on any DataFrame.
    X = data.loc[:, data.columns != columnadep]
    y = data[columnadep]

    # Only the training part is searched; the held-out 20% is left untouched.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)

    rfc = RandomForestClassifier(random_state=42)

    param_grid = {
        'n_estimators': [200, 500],
        # 'auto' was an alias of 'sqrt' for classifiers and is rejected by
        # recent scikit-learn releases, so only 'sqrt' is searched.
        'max_features': ['sqrt'],
        'max_depth': [7, 8],
        'criterion': ['gini', 'entropy']}

    # GridSearchCV clones the estimator for every candidate, so no separate
    # fit of ``rfc`` is needed beforehand.
    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    CV_rfc.fit(X_train, y_train)
    return CV_rfc.best_params_
    

### Gridsearch aplicado a RF de over sampling

In [6]:
# Con los parámetros obtenidos en el Gridsearch entrenamos el modelo

def gridsearch_model(data, columnadep, params=None):
    """Train a RandomForest with tuned hyper-parameters and predict on a random 20% hold-out.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column.
    columnadep : str
        Name of the target column.
    params : dict or None, default None
        Mapping with the keys ``'criterion'``, ``'max_depth'``,
        ``'max_features'`` and ``'n_estimators'``. When ``None``, the
        module-level name ``gridsearch`` is used, which the notebook rebinds
        to the dictionary of tuned parameters.

    Returns
    -------
    numpy.ndarray
        Predictions for the held-out 20% of ``data``.
    """
    if params is None:
        # Backward-compatible fallback to the notebook global.
        params = gridsearch

    # Mask built from the frame passed in, not from a notebook global.
    X = data.loc[:, data.columns != columnadep]
    y = data[columnadep]

    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.2)

    rf = RandomForestClassifier(criterion=params['criterion'],
                                max_depth=params['max_depth'],
                                max_features=params['max_features'],
                                n_estimators=params['n_estimators'])
    rf.fit(X_train, y_train)

    return rf.predict(X_test)

### Aplicación del modelo al draft de 2019 y All-Star

In [7]:
# Para determinar los 5 jugadores que el modelo selecciona como más probables en el draft de 2019 repetimos el entrenamiento anterior tantas veces como indique el parámetro `bucles`
# El contador acumula, para cada jugador, el número de repeticiones en las que el modelo lo predice como seleccionable entre los 5 primeros del draft 2019

def final_predict(data, columnadep, datadraft, bucles, params=None):
    """Repeatedly train a RandomForest and accumulate its predictions for ``datadraft``.

    On every repetition a fresh random 80/20 split of ``data`` is drawn, a
    forest is fitted on the 80% part and its predictions for ``datadraft``
    are added to a running total.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame: feature columns plus the target column.
    columnadep : str
        Name of the target column.
    datadraft : pandas.DataFrame
        Rows to score; passed as-is to ``predict``, so its columns must be
        the feature columns of ``data``.
    bucles : int
        Number of train/predict repetitions.
    params : dict or None, default None
        Mapping with the keys ``'criterion'``, ``'max_depth'``,
        ``'max_features'`` and ``'n_estimators'``. When ``None``, the
        module-level name ``gridsearch`` is used, which the notebook rebinds
        to the dictionary of tuned parameters.

    Returns
    -------
    numpy.ndarray or int
        Element-wise sum of the predictions over all repetitions, one entry
        per row of ``datadraft``. The integer ``0`` when ``bucles`` is 0.
    """
    if params is None:
        # Backward-compatible fallback to the notebook global.
        params = gridsearch

    # Features and target do not change between repetitions; build them once.
    X = data.loc[:, data.columns != columnadep]
    y = data[columnadep]

    counter = 0

    for _ in range(bucles):
        # Only the training part of each random split is used.
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)

        rf = RandomForestClassifier(criterion=params['criterion'],
                                    max_depth=params['max_depth'],
                                    max_features=params['max_features'],
                                    n_estimators=params['n_estimators'])
        rf.fit(X_train, y_train)

        counter += rf.predict(datadraft)

    return counter

# Con los valores obtenidos en el paso anterior creamos la columna Prediction, nos quedamos con los jugadores con al menos una predicción positiva y los ordenamos de mayor a menor (los 5 primeros se consultan después con .head())

def final(counter, datadraft):
    """Attach the accumulated predictions and return the selected rows, best first.

    Parameters
    ----------
    counter : array-like or scalar
        One value per row of ``datadraft`` (a scalar is applied to every row).
    datadraft : pandas.DataFrame
        Frame the predictions belong to. It is not modified, so it can be
        scored again afterwards.

    Returns
    -------
    pandas.DataFrame
        Rows of ``datadraft`` whose ``Prediction`` is at least 1, with the
        ``Prediction`` column added, sorted by it in descending order.
    """
    # assign() works on a copy, leaving the caller's frame without the extra column.
    scored = datadraft.assign(Prediction=counter)

    selected = scored.loc[scored['Prediction'] >= 1]
    return selected.sort_values(by=['Prediction'], ascending=False)

### Aplicación del modelo para determinar si un jugador será All-Star

### Procesamos el DataSet para la visualización final

In [8]:
def final_data(datadraft):
    """Collapse the position dummy columns into a single ``Position`` label.

    Rows with ``POS_Guard == 1`` become ``'Guard'``; otherwise rows with
    ``POS_Forward == 1`` become ``'Forward'``; every remaining row becomes
    ``'Center'``.

    Parameters
    ----------
    datadraft : pandas.DataFrame
        Frame containing the ``POS_Guard`` and ``POS_Forward`` columns. It is
        modified in place: ``Position`` is added and both dummy columns are
        dropped.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Work on the argument instead of a notebook global, and label all rows
    # at once rather than looking values up by integer position on a
    # name-indexed Series.
    is_guard = datadraft['POS_Guard'] == 1
    is_forward = datadraft['POS_Forward'] == 1

    datadraft['Position'] = np.where(is_guard, 'Guard',
                                     np.where(is_forward, 'Forward', 'Center'))
    datadraft.drop(['POS_Guard', 'POS_Forward'], axis=1, inplace=True)

    return datadraft

# Final code

In [9]:
# Reload the inputs first so the pipeline below always starts from freshly read frames.
top5_college = read_file('../input/model_dummies_college_nba.csv')
draft19_class = read_file('../input/model_draft19_class.csv')
all_star_college = read_file('../input/model_dummies_allstar.csv')

seleccion = input('Selecciona lo que quieres consultar: \n 1: Top 5 Draft \n 2: All Star \n \n').strip()

# Each option selects the training frame, its target column and the number of
# train/predict repetitions.
if seleccion == "1":
    file = top5_college
    column = 'TOP-5_Top5'
    bucles = 100
elif seleccion == "2":
    file = all_star_college
    column = 'ALLSTAR_Yes'
    bucles = 1
else:
    # Stop here with a clear message instead of failing later with a
    # NameError on the undefined `file` / `column` / `bucles`.
    raise ValueError('Selección no válida: {!r}. Introduce 1 o 2.'.format(seleccion))

over_samplingDF = oversampling(file, column)
# NOTE: the next line rebinds `gridsearch` from the function to the dictionary
# of best parameters; gridsearch_model and final_predict read that global, so
# the name is kept. Re-running this cell requires re-running the cell that
# defines the function first. The same applies to `final` below.
gridsearch = gridsearch(over_samplingDF, column)
gridsearch2 = gridsearch_model(over_samplingDF, column)
counter = final_predict(over_samplingDF, column, draft19_class, bucles)
final_prediction = final(counter, draft19_class)
final = final_data(final_prediction)

Selecciona lo que quieres consultar: 
 1: Top 5 Draft 
 2: All Star 
 
1




In [14]:
# Show the first rows of the processed prediction table.
final.head()

Unnamed: 0_level_0,MP,2PA,2P%,3PA,3P%,FTA,FT%,TRB,AST,STL,BLK,TOV,PF,PTS,Prediction,Position
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
JA MORANT,35.3,8.9,0.545,3.8,0.343,6.3,0.81,6.1,8.2,1.4,0.6,3.8,1.7,18.7,100.0,Guard
ZION WILLIAMSON,30.0,11.0,0.747,2.2,0.338,6.2,0.64,8.9,2.1,2.1,1.8,2.4,2.1,22.6,100.0,Forward
R.J. BARRETT,35.3,12.2,0.529,6.2,0.308,5.9,0.665,7.6,4.3,0.9,0.4,3.2,1.8,22.6,99.0,Forward
BOL BOL,29.8,11.9,0.57,2.8,0.52,4.1,0.757,9.6,1.0,0.8,2.7,2.0,1.7,21.0,97.0,Center
DEDRIC LAWSON,32.6,11.6,0.511,2.5,0.393,5.7,0.815,10.3,1.7,1.3,1.1,2.3,2.6,19.4,95.0,Guard


In [11]:
# Count how many of the selected players fall into each position.
final['Position'].value_counts()

Guard      9
Forward    7
Center     3
Name: Position, dtype: int64