In [1]:
# Funciones importadas

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import tree
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier       
from sklearn import metrics
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE


### Lectura de csv

In [2]:
# Función de lectura de csv para añadir la funcionalidad de que ponga la columna nombre como index

def read_file(file):
    """Load a CSV and index the resulting table by its ``Name`` column.

    Parameters
    ----------
    file : str or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``Name`` moved from the columns to the index.

    Raises
    ------
    KeyError
        If the CSV has no ``Name`` column.
    """
    table = pd.read_csv(file)
    return table.set_index('Name')

# Load both input tables (training data first, then the 2019 draft class),
# each indexed by player name via read_file().
top5_college, draft19_class = (
    read_file('../input/model_dummies_college_nba.csv'),
    read_file('../input/model_draft19_class.csv'),
)

### Aplicación de modelos

In [3]:
# Función para aplicar modelos de machine learning y determinar su accuracy y su confusion matrix.

def modelos(data,columnadep):
    """Fit a fixed set of classifiers and print each one's accuracy and confusion matrix.

    The table is split once into 80 % train / 20 % test (no fixed seed, so the
    numbers change between runs) and every classifier is trained and evaluated
    on that same split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the predictors and the target column.
    columnadep : str
        Name of the target (dependent) column; every other column is a predictor.

    Returns
    -------
    None
        Results are only printed.
    """
    features = data.drop(columns=[columnadep])
    target = data[columnadep]
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

    # Classifiers are evaluated in exactly this order.
    candidates = [
        svm.SVC(gamma='auto', probability=True),                     # 1. SVC, gamma='auto', probability estimates enabled
        LogisticRegression(solver ='liblinear',max_iter=500),        # 2. Logistic regression
        KNeighborsClassifier(n_neighbors=3),                         # 3. k-nearest neighbours, k=3
        KNeighborsClassifier(n_neighbors=5),                         # 4. k-nearest neighbours, k=5
        RandomForestClassifier(n_estimators=500, criterion='gini'),  # 5. Random forest
        GaussianNB(),                                                # 6. Gaussian naive Bayes
        SVC(kernel='rbf', gamma='scale'),                            # 7. SVC, RBF kernel, gamma='scale'
        GradientBoostingClassifier(),                                # 8. Gradient boosting
    ]

    for clf in candidates:
        predicted = clf.fit(X_train, y_train).predict(X_test)
        print(type(clf).__name__)
        print('Accuracy:',"\n", accuracy_score(y_test, predicted))
        print("confusion matrix:","\n",confusion_matrix(y_test, predicted))
        print('---------------------')

# Baseline comparison of all classifiers on the original (unbalanced) table.
modelos(data=top5_college, columnadep='TOP-5_Top5')

SVC
Accuracy: 
 0.8608695652173913
confusion matrix: 
 [[99  0]
 [16  0]]
---------------------
LogisticRegression
Accuracy: 
 0.8608695652173913
confusion matrix: 
 [[99  0]
 [16  0]]
---------------------
KNeighborsClassifier
Accuracy: 
 0.8347826086956521
confusion matrix: 
 [[96  3]
 [16  0]]
---------------------
KNeighborsClassifier
Accuracy: 
 0.8521739130434782
confusion matrix: 
 [[98  1]
 [16  0]]
---------------------
RandomForestClassifier
Accuracy: 
 0.8608695652173913
confusion matrix: 
 [[99  0]
 [16  0]]
---------------------
GaussianNB
Accuracy: 
 0.7739130434782608
confusion matrix: 
 [[85 14]
 [12  4]]
---------------------
SVC
Accuracy: 
 0.8608695652173913
confusion matrix: 
 [[99  0]
 [16  0]]
---------------------
GradientBoostingClassifier
Accuracy: 
 0.8521739130434782
confusion matrix: 
 [[97  2]
 [15  1]]
---------------------


### Aplicación de over sampling - creación de valores sintéticos

In [None]:
# Para balancear el modelo creamos valores sintéticos por medio de over sampling

def oversampling(data,columnadep):
    """Balance ``data`` with SVMSMOTE and return a held-out slice of the result.

    Only the predictor columns (everything except ``columnadep``) are passed to
    ``SVMSMOTE`` as features, so the target no longer takes part in building
    the synthetic rows. The resampled target is then attached again under its
    original name, and the columns are put back in the order they have in
    ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the predictors and the target column.
    columnadep : str
        Name of the target (dependent) column.

    Returns
    -------
    pandas.DataFrame
        The 20 % partition produced by ``train_test_split(..., test_size=0.20,
        random_state=10)`` on the resampled table. It has the same columns as
        ``data`` (target included) and string row labels.

    Raises
    ------
    KeyError
        If ``columnadep`` is not a column of ``data``.

    Notes
    -----
    NOTE(review): only the 20 % partition is returned and the remaining 80 %
    of the resampled rows is discarded, exactly as before. That looks
    unintentional for a function that is meant to produce the balanced
    dataset -- confirm before relying on it.
    ``SVMSMOTE`` is created without a ``random_state``, so the synthetic rows
    differ from run to run.
    """
    features = data.drop(columns=[columnadep])
    target = data[columnadep]

    # Resample on the predictors only; the target is supplied separately as y.
    X_resample, y_resample = SVMSMOTE().fit_resample(features, target)

    # Works whether fit_resample returned an array or a DataFrame, and for any
    # number of columns (no hard-coded column count).
    resampled = pd.DataFrame(X_resample, columns=features.columns)
    # to_numpy() sidesteps index alignment when attaching the target.
    resampled[columnadep] = pd.Series(y_resample).to_numpy()
    resampled = resampled[list(data.columns)]

    # Same seeded 80/20 split as before; only the 20 % side is kept.
    _, holdout = train_test_split(resampled, test_size = 0.20, random_state = 10)

    # Row labels are converted to strings, matching the previous output.
    return holdout.rename(index = str)

# Build the balanced table; later cells reuse the name `over_samplingDF`.
over_samplingDF = oversampling(top5_college, 'TOP-5_Top5')
# Show a preview as the cell's last expression instead of printing the whole frame.
over_samplingDF.head()

### Aplicación de Gridsearch

In [None]:
# Hacemos un gridsearch para obtener los parámetros óptimos para el modelo de RandomForest

def gridsearch(data, columnadep):
    """Grid-search RandomForest hyper-parameters with 5-fold cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the predictors and the target column.
    columnadep : str
        Name of the target (dependent) column; every other column is a predictor.

    Returns
    -------
    dict
        ``GridSearchCV.best_params_`` with the keys ``'n_estimators'``,
        ``'max_features'``, ``'max_depth'`` and ``'criterion'``.

    Notes
    -----
    The 80/20 split is not seeded, so the selected parameters can change
    between runs.
    """
    # Build the predictor mask from `data` itself, not from a notebook global.
    X = data.loc[:, data.columns != columnadep]
    y = data[columnadep]

    # 20 % of the rows is set aside and not used by the search (unchanged
    # from the previous behaviour); only the training side is searched.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)

    rfc = RandomForestClassifier(random_state=42)                 # Estimator being tuned

    # 'auto' was dropped from max_features: for classifiers it meant the same
    # as 'sqrt' (so the grid evaluated that setting twice) and recent
    # scikit-learn releases no longer accept it.
    param_grid = {
        'n_estimators': [200, 500],
        'max_features': ['sqrt'],
        'max_depth': [7, 8],
        'criterion': ['gini', 'entropy'],
    }

    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    CV_rfc.fit(X_train, y_train)
    return CV_rfc.best_params_
    

### Gridsearch aplicado a RF de over sampling

In [None]:
# Con los parámetros obtenidos en el Gridsearch entrenamos el modelo

def gridsearch_model(data, columnadep, params=None):
    """Train one RandomForest with the tuned hyper-parameters and predict the test split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the predictors and the target column.
    columnadep : str
        Name of the target (dependent) column; every other column is a predictor.
    params : dict, optional
        Mapping with the keys ``'criterion'``, ``'max_depth'``,
        ``'max_features'`` and ``'n_estimators'``. When omitted, the global
        name ``gridsearch`` is used, which the notebook's final cell rebinds
        to the dict returned by the grid search.

    Returns
    -------
    numpy.ndarray
        Predictions for the 20 % test split (the split is not seeded).

    Raises
    ------
    TypeError
        If no parameter mapping is available because ``gridsearch`` still
        refers to the function.
    """
    if params is None:
        params = gridsearch
    if callable(params):
        raise TypeError(
            "hyper-parameters are missing: pass params=<dict> or run the grid "
            "search first so that `gridsearch` holds its result"
        )

    # Build the predictor mask from `data` itself, not from a notebook global.
    X = data.loc[:, data.columns != columnadep]
    y = data[columnadep]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    rf = RandomForestClassifier(
        criterion=params['criterion'],
        max_depth=params['max_depth'],
        max_features=params['max_features'],
        n_estimators=params['n_estimators'],
    )
    rf.fit(X_train, y_train)

    return rf.predict(X_test)

### Aplicación del modelo al draft de 2019

In [None]:
# Para determinar los 5 jugadores que el modelo selecciona como más probables en el draft de 2019 creamos un bucle que repita el entrenamiento anterior 100 veces
# Ponemos un contador que expresa en valor 1 si ese jugador sería seleccionable entre los 5 primeros del draft 2019

def aplicacion_final(data, columnadep, datadraft, n_iter=100, params=None):
    """Repeatedly retrain the tuned RandomForest and accumulate its draft predictions.

    On every iteration a new unseeded 80/20 split of ``data`` is drawn, a
    forest is fitted on the 80 % side and its predictions for ``datadraft``
    are added to a running total.

    Parameters
    ----------
    data : pandas.DataFrame
        Training table holding the predictors and the target column.
    columnadep : str
        Name of the target (dependent) column; every other column is a predictor.
    datadraft : pandas.DataFrame
        Rows to score. When it contains every training predictor column, it is
        narrowed to those columns in training order, so extra columns (for
        example a previously added ``Prediction``) do not reach the model.
    n_iter : int, default 100
        Number of train/predict repetitions.
    params : dict, optional
        Mapping with the keys ``'criterion'``, ``'max_depth'``,
        ``'max_features'`` and ``'n_estimators'``. When omitted, the global
        name ``gridsearch`` is used, which the notebook's final cell rebinds
        to the dict returned by the grid search.

    Returns
    -------
    numpy.ndarray or int
        Element-wise sum of the predicted labels over all iterations, one
        entry per row of ``datadraft`` (the integer ``0`` when ``n_iter`` is 0).
        Assumes the target is coded 0/1, in which case each entry counts the
        runs that predicted the positive class -- TODO confirm the coding.

    Raises
    ------
    TypeError
        If no parameter mapping is available because ``gridsearch`` still
        refers to the function.
    """
    if params is None:
        params = gridsearch
    if callable(params):
        raise TypeError(
            "hyper-parameters are missing: pass params=<dict> or run the grid "
            "search first so that `gridsearch` holds its result"
        )

    # Loop-invariant: the predictor/target views do not change between iterations.
    X = data.loc[:, data.columns != columnadep]
    y = data[columnadep]

    # Score the frame that was passed in (not a notebook global), aligned to
    # the training predictors whenever all of them are present.
    draft_features = datadraft
    if isinstance(datadraft, pd.DataFrame) and set(X.columns).issubset(datadraft.columns):
        draft_features = datadraft.loc[:, X.columns]

    counter = 0

    for _iteration in range(n_iter):
        X_train, _X_test, y_train, _y_test = train_test_split(X, y, test_size=0.2)

        rf = RandomForestClassifier(
            criterion=params['criterion'],
            max_depth=params['max_depth'],
            max_features=params['max_features'],
            n_estimators=params['n_estimators'],
        )
        rf.fit(X_train, y_train)

        counter += rf.predict(draft_features)

    return counter


In [None]:
# Con los valores obtenidos en el paso anterior creamos una columna, ordenamos esta según la columna Prediction y extraemos los 10 primeros valores

def final(counter):
    """Attach the accumulated predictions to the draft table and return the top rows.

    Parameters
    ----------
    counter : array-like
        One value per row of the global ``draft19_class``.

    Returns
    -------
    pandas.DataFrame
        The 10 rows of ``draft19_class`` with the highest ``Prediction``,
        in descending order.

    Notes
    -----
    Side effect: the global ``draft19_class`` gains (or has overwritten) a
    ``Prediction`` column.
    """
    draft19_class['Prediction'] = counter
    ranked = draft19_class.sort_values('Prediction', ascending=False)
    return ranked.iloc[:10]

In [None]:
# End-to-end run: reload both tables, balance, tune, then score the 2019 draft class.
top5_college = read_file('../input/model_dummies_college_nba.csv')
draft19_class = read_file('../input/model_draft19_class.csv')
over_samplingDF = oversampling(top5_college, 'TOP-5_Top5')
# NOTE: this rebinds the name `gridsearch` from the function to the dict it
# returns. gridsearch_model() and aplicacion_final() read the global
# `gridsearch`, so the rebinding is load-bearing. Running this cell a second
# time without first re-running the cell that defines gridsearch() fails,
# because the name is no longer callable.
gridsearch = gridsearch(over_samplingDF, 'TOP-5_Top5')
# gridsearch2 holds the test-split predictions of one fitted forest; it is not used below.
gridsearch2 = gridsearch_model(over_samplingDF, 'TOP-5_Top5')
counter = aplicacion_final(over_samplingDF, 'TOP-5_Top5', draft19_class)
# Last expression of the cell, so the returned table is displayed.
final(counter)