In [3]:
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [4]:
#import data from file
sqlContext = SQLContext(sc)
df = sqlContext.read.load('file:///home/cloudera/Downloads/nba_logreg.csv',
                         format='com.databricks.spark.csv',
                         header='true', inferSchema='true')

In [5]:
df=df.drop('3P Made').drop('3PA').drop('3P%').drop('FTM').drop('FTA').drop('FT%')
df.columns
# GP-games played, MIN-minutes played, PTS-PointsPerGame, FGM-FieldGoalsMade, FGA-FieldGoalAttemps
# FG%-FieldGoalPercent, REB-Rebounds, AST-Assists, STL-Steals, BLK-Blocks, TOV-Turnovers, 
# TARGET_5Yrs-Outcome (1-carrer length>=5 / 0-else)

['Name',
 'GP',
 'MIN',
 'PTS',
 'FGM',
 'FGA',
 'FG%',
 'OREB',
 'DREB',
 'REB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'TARGET_5Yrs']

In [6]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- GP: integer (nullable = true)
 |-- MIN: double (nullable = true)
 |-- PTS: double (nullable = true)
 |-- FGM: double (nullable = true)
 |-- FGA: double (nullable = true)
 |-- FG%: double (nullable = true)
 |-- OREB: double (nullable = true)
 |-- DREB: double (nullable = true)
 |-- REB: double (nullable = true)
 |-- AST: double (nullable = true)
 |-- STL: double (nullable = true)
 |-- BLK: double (nullable = true)
 |-- TOV: double (nullable = true)
 |-- TARGET_5Yrs: double (nullable = true)



In [7]:
# change the type of GP from integer to double
from pyspark.sql.types import DoubleType
df = df.withColumn("GP", df["GP"].cast(DoubleType()))

In [8]:
#define the columns in the players data used for the decision tree classifier
featureColumns = ['GP','MIN','PTS','FGM','FGA','FG%','OREB','DREB','REB','AST','STL','BLK','TOV']

In [9]:
#drop missing data
df=df.na.drop()

In [10]:
# see description of data 
df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
GP,1340,60.41417910447761,17.43399224457866,11.0,82.0
MIN,1340,17.624626865671644,8.307963702156513,3.1,40.9
PTS,1340,6.801492537313432,4.357544941826061,0.7,28.2
FGM,1340,2.6291044776119397,1.6835550700573203,0.3,10.2
FGA,1340,5.885298507462688,3.593488466403574,0.8,19.8
FG%,1340,44.16940298507466,6.1376788738469825,23.8,73.7
OREB,1340,1.0094029850746271,0.7771193635112831,0.0,5.3
DREB,1340,2.025746268656715,1.360007510448923,0.2,9.6
REB,1340,3.0344776119403005,2.057774041584701,0.3,13.9


In [11]:
# count rows and columns 
# 13 columns are going to be used to classify the outcome, 
# we won't use Name and the last column is the value we want to predict
df.count(),len(df.columns)

(1340, 15)

In [12]:
# analyse which variable are most likely to have the most influence on the outcome
df.toPandas().corr()
# the highest correlation with the outcome have values of GP,MIN,PTS,FGM - this is logical

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
GP,1.0,0.59024,0.538471,0.542724,0.516625,0.296289,0.401136,0.46684,0.460406,0.372749,0.451137,0.276498,0.518167,0.396833
MIN,0.59024,1.0,0.911822,0.90306,0.910247,0.203901,0.573062,0.745513,0.709707,0.629015,0.757034,0.399088,0.8265,0.317805
PTS,0.538471,0.911822,1.0,0.990834,0.979733,0.255333,0.575106,0.693934,0.676849,0.552338,0.675341,0.387043,0.850366,0.315981
FGM,0.542724,0.90306,0.990834,1.0,0.98005,0.291693,0.596687,0.703278,0.691186,0.532534,0.66264,0.398125,0.834352,0.317594
FGA,0.516625,0.910247,0.979733,0.98005,1.0,0.129798,0.504212,0.640123,0.614328,0.589818,0.690168,0.322184,0.845989,0.29266
FG%,0.296289,0.203901,0.255333,0.291693,0.129798,1.0,0.511367,0.410555,0.465423,-0.108797,0.056658,0.391626,0.121806,0.227134
OREB,0.401136,0.573062,0.575106,0.596687,0.504212,0.511367,1.0,0.83858,0.932694,-0.012109,0.286545,0.648346,0.421695,0.293307
DREB,0.46684,0.745513,0.693934,0.703278,0.640123,0.410555,0.83858,1.0,0.978177,0.186679,0.411894,0.688135,0.570187,0.284677
REB,0.460406,0.709707,0.676849,0.691186,0.614328,0.465423,0.932694,0.978177,1.0,0.119081,0.381154,0.699672,0.536716,0.299406
AST,0.372749,0.629015,0.552338,0.532534,0.589818,-0.108797,-0.012109,0.186679,0.119081,1.0,0.751289,-0.08647,0.747286,0.175353


In [13]:
binarizer = Binarizer(threshold=0.5, inputCol="TARGET_5Yrs", outputCol="label")
binarizedDF = binarizer.transform(df)

In [14]:
#aggregate the features used to make predictions into a single column
assembler = VectorAssembler(inputCols=featureColumns, outputCol="features")
assembled = assembler.transform(binarizedDF)

In [15]:
# lets check that previous step created the vector as last column and 
# create new DataFrame that we are going to use in DecisionTreeClassifier
WorkingDF=assembled.select('features','label')
WorkingDF.show(3)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[36.0,27.4,7.4,2....|  0.0|
|[35.0,26.9,7.2,2....|  0.0|
|[74.0,15.3,5.2,2....|  0.0|
+--------------------+-----+
only showing top 3 rows



In [16]:
# split our data into training data and test data
(trainingData, testData) = WorkingDF.randomSplit([0.8,0.2],seed=13234)

In [17]:
# count the number of rows in training data and test data
trainingData.count(),testData.count()

(1072, 268)

In [18]:
# Creating initial Model
dt=DecisionTreeClassifier(labelCol="label",featuresCol="features",maxDepth=3)

In [20]:
# creating grid for parameters for cross validation
dtparamGrid=(ParamGridBuilder()
             .addGrid(dt.maxDepth,[3,5,7,10,15,20,25])
             .addGrid(dt.minInstancesPerNode,[10,15,20,25,30,50,100])
             .build())

In [21]:
# evaluating Model
dtevaluator=BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

In [22]:
# create CrossValidator
dtcross=CrossValidator(estimator=dt,estimatorParamMaps=dtparamGrid, evaluator=dtevaluator,numFolds=7)

In [23]:
# run cross valudations
dtcvModel=dtcross.fit(trainingData)

In [24]:
print(dtcvModel)

CrossValidatorModel_45a384b2db4de267e077


In [25]:
# using test data to measure the accuracy of the model
dtPredictions=dtcvModel.transform(testData)

In [26]:
# AUC - characteristic of the accuracy of distinguishing between classes
print('Accuracy',dtevaluator.evaluate(dtPredictions))
print('AUC:',BinaryClassificationMetrics(dtPredictions['label','prediction'].rdd).areaUnderROC)


Accuracy 0.6485458872699832
AUC: 0.6451688555347092


In [27]:
# lets see the first 10 rows  
# we can see the prediction matches the input 'label'
predict=dtPredictions.select("prediction","label")
predict.show(10)

+----------+-----+
|prediction|label|
+----------+-----+
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
|       0.0|  0.0|
+----------+-----+
only showing top 10 rows



In [28]:
#compute accuracy of predictions
#the first two arguments specify the names of the label and prediction columns
#the third argument specifies that we want the overall precision
evaluator = MulticlassClassificationEvaluator(labelCol="label",
predictionCol="prediction", metricName="precision")

In [29]:
#The MulticlassMetrics class can be used to generate a confusion matrix of our classifier model
#because MulticlassMetrics works with RDDs of numbers and not DataFrames, we need to convert our predictions DataFrame into an RDD
metrics = MulticlassMetrics(predict.rdd.map(tuple))

In [30]:
metrics.confusionMatrix().toArray().transpose()

array([[  60.,   44.],
       [  47.,  117.]])