In [0]:
# Load the dataset: semicolon-separated CSV with a header row, stored on DBFS.
csv_reader = spark.read.options(header=True, sep=";")
nba_players_df = csv_reader.csv("dbfs:/FileStore/all_seasons.csv")
nba_players_df.show()

In [0]:
# Number of rows per draft round, followed by the list of available columns.
draft_round_counts = nba_players_df.groupBy('draft_round').count()
draft_round_counts.show()
nba_players_df.columns

In [0]:
# Keep only the columns used for the prediction.
selected_columns = [
    'draft_round', 'team_abbreviation', 'age', 'player_height', 'player_weight', 'country',
    'draft_year', 'draft_number', 'gp', 'pts', 'reb', 'ast', 'net_rating',
    'oreb_pct', 'dreb_pct', 'usg_pct', 'ts_pct', 'ast_pct', 'season',
]
nba_players_df = nba_players_df.select(*selected_columns)

In [0]:
# Drop every row that contains at least one null value, then report how many rows are left.
nba_players_df = nba_players_df.dropna()
remaining_rows = nba_players_df.count()
print(f'{remaining_rows} lignes restantes')

On n'a que des variables qualitatives ; on fera donc un modèle de classification.

In [0]:
# Print the schema: every column is currently a string, so they must be
# converted to int or float before they can be used for the prediction.
nba_players_df.printSchema()

In [0]:
from pyspark.sql.functions import col

# Columns to convert from string, grouped by target Spark type.
to_float = ['player_height','player_weight','pts','reb','ast','net_rating','oreb_pct','dreb_pct','usg_pct','ts_pct','ast_pct' , 'draft_year', 'draft_number']
to_int = ['age', 'draft_round', 'gp']

# The loop variables are deliberately NOT named `col`: doing so would rebind the
# imported `pyspark.sql.functions.col` to a plain string for the rest of the notebook.
# NOTE(review): strings that cannot be parsed as numbers become null after the cast,
# and this happens after the null-dropping step - confirm that is intended.
for spark_type, column_names in (("float", to_float), ("int", to_int)):
    for column_name in column_names:
        nba_players_df = nba_players_df.withColumn(column_name, col(column_name).cast(spark_type))

# Check that the columns now carry the expected numeric types.
nba_players_df.printSchema()


In [0]:
# The remaining string columns must be turned into integer codes (classification).
from pyspark.sql import functions as F


def _distinct_sorted(df, column):
    """Return the distinct non-null values of `column` in `df`, sorted ascending.

    `collect_set` gives no ordering guarantee, so the values are sorted on the
    driver to make the resulting integer codes identical from one run to the next.
    """
    collected = df.select(F.collect_set(column).alias(column)).first()[column]
    return sorted(collected)


def _index_mapping(values):
    """Map each value to its position in `values` (0, 1, 2, ...)."""
    return {value: index for index, value in enumerate(values)}


# SEASON
distinct_season = _distinct_sorted(nba_players_df, 'season')
dict_season = _index_mapping(distinct_season)

# TEAM ABBREVIATION
distinct_ABV = _distinct_sorted(nba_players_df, 'team_abbreviation')
dict_ABV = _index_mapping(distinct_ABV)

# COUNTRY
distinct_country = _distinct_sorted(nba_players_df, 'country')
dict_country = _index_mapping(distinct_country)
    


In [0]:
# Replace the categorical (string) columns with their integer codes.
encoded_columns = [
    'draft_round', 'team_abbreviation', 'age', 'player_height', 'player_weight', 'country',
    'draft_year', 'draft_number', 'gp', 'pts', 'reb', 'ast', 'net_rating',
    'oreb_pct', 'dreb_pct', 'usg_pct', 'ts_pct', 'ast_pct', 'season',
]


def _encode_row(row):
    """Return `row` as a tuple ordered like `encoded_columns`.

    team_abbreviation, country and season are looked up in dict_ABV,
    dict_country and dict_season; every other field is passed through unchanged.
    """
    return (
        row.draft_round,
        dict_ABV[row.team_abbreviation],
        row.age,
        row.player_height,
        row.player_weight,
        dict_country[row.country],
        row.draft_year,
        row.draft_number,
        row.gp,
        row.pts,
        row.reb,
        row.ast,
        row.net_rating,
        row.oreb_pct,
        row.dreb_pct,
        row.usg_pct,
        row.ts_pct,
        row.ast_pct,
        dict_season[row.season],
    )


nba_players_df = nba_players_df.rdd.map(_encode_row).toDF(encoded_columns)

nba_players_df

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

In [0]:
# Assemble every predictor into a single "features" vector column.
# draft_round is left out on purpose: it is the label.
feature_columns = [
    'team_abbreviation', 'age', 'player_height', 'player_weight', 'country',
    'draft_year', 'draft_number', 'gp', 'pts', 'reb', 'ast', 'net_rating',
    'oreb_pct', 'dreb_pct', 'usg_pct', 'ts_pct', 'ast_pct', 'season',
]
# handleInvalid="skip" drops rows that hold an invalid value in any input column.
df_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features", handleInvalid="skip")
df_final = df_assembler.transform(nba_players_df)

In [0]:
# 80/20 train/test split. The seed is fixed so that re-running the notebook
# does not silently change the split and, with it, every downstream metric.
train_df, test_df = df_final.randomSplit([0.80, 0.20], seed=42)
print("Training Dataset count : " + str(train_df.count()))
print("Test Dataset count : " + str(test_df.count()))

In [0]:

# Build the model: logistic regression on the assembled features, with draft_round as the label.
lr_estimator = LogisticRegression(featuresCol='features', labelCol='draft_round')
# `lr` holds the fitted model (not the estimator); later cells reuse it under this name.
lr = lr_estimator.fit(train_df)

# Predict on the held-out rows.
predictions = lr.transform(test_df)

# Sample of the results, displayed as a pandas frame (collects the rows to the driver).
prediction_overview = predictions.select("draft_round", "features", "prediction", "probability")
prediction_overview.toPandas()


Unnamed: 0,draft_round,features,prediction,probability
0,1,"[0.0, 20.0, 213.36000061035156, 112.0372238159...",1.0,"[7.990189994873599e-47, 0.9999992692513762, 7...."
1,1,"[0.0, 21.0, 213.36000061035156, 112.0372238159...",1.0,"[1.1602331573703006e-47, 0.9999995290868536, 4..."
2,1,"[0.0, 22.0, 193.0399932861328, 87.996849060058...",1.0,"[4.98774664184655e-34, 0.9999999999953904, 4.6..."
3,1,"[0.0, 23.0, 187.9600067138672, 79.378601074218...",1.0,"[5.5006654659473055e-40, 1.0, 1.10105990170458..."
4,1,"[0.0, 23.0, 208.27999877929688, 106.5941162109...",1.0,"[3.28596281616338e-39, 1.0, 3.4544431729237577..."
...,...,...,...,...
1980,4,"[16.0, 32.0, 210.82000732421875, 108.862083435...",3.0,"[7.296008519719282e-158, 5.481127576023919e-78..."
1981,4,"[16.0, 38.0, 210.82000732421875, 117.933921813...",8.0,"[4.1338937848025423e-125, 1.7366561296684287e-..."
1982,4,"[26.0, 35.0, 210.82000732421875, 117.933921813...",4.0,"[5.595817469063735e-146, 7.949888821720569e-73..."
1983,7,"[1.0, 36.0, 195.5800018310547, 102.05819702148...",7.0,"[0.0, 7.476719163100785e-234, 8.58832730786981..."


In [0]:
# Model evaluation and metrics

from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Score the held-out split only. Scoring df_final would also include the rows
# the model was trained on and would inflate every metric below.
scored_test = lr.transform(test_df)

# draft_round has more than two classes, so the binary figures (ROC AUC, TP/TN/FP/FN)
# are computed one-vs-rest for the question "first round vs. any other round".
scored_test = scored_test.withColumn("is_first_round", (scored_test["draft_round"] == 1).cast("double"))

# Create both evaluators
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="draft_round", predictionCol="prediction")
# For a vector score column the binary evaluator reads element 1, which in the
# "probability" vector is the predicted probability of draft_round == 1.
evaluator = BinaryClassificationEvaluator(labelCol="is_first_round", rawPredictionCol="probability", metricName='areaUnderROC')

# Label / prediction pairs used by the multiclass metrics and the confusion counts
predictionAndTarget = scored_test.select("draft_round", "prediction")

# Get metrics
acc = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "accuracy"})
f1 = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "f1"})
weightedPrecision = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedPrecision"})
weightedRecall = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedRecall"})
auc = evaluator.evaluate(scored_test)

# Confusion counts for "first round" (positive) vs. every other round (negative).
actual_first = predictionAndTarget.draft_round == 1
predicted_first = predictionAndTarget.prediction == 1
tp = predictionAndTarget.filter(actual_first & predicted_first).count()
tn = predictionAndTarget.filter(~actual_first & ~predicted_first).count()
fp = predictionAndTarget.filter(~actual_first & predicted_first).count()
fn = predictionAndTarget.filter(actual_first & ~predicted_first).count()
print ("Test set, first round vs. other rounds")
print ("True Positives :", tp)
print ("True Negatives :", tn)
print ("False Positives :", fp)
print ("False Negatives :", fn)


# Precision and recall below are the class-weighted averages over all draft rounds.
print(f'The area under the ROC (round 1 vs rest) : {round(auc*100,2)}%')
print(f'Accuracy : {round(acc*100,2)}%')
print(f'F1 Score : {round(f1*100,2)}%')
print(f'Precision : {round(weightedPrecision*100,2)}%')
print(f'Recall : {round(weightedRecall*100,2)}%')
