### Export CSV to LibSVM format (via an RDD)

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

# Start (or reuse) the Spark session used for the CSV -> LibSVM export.
session_builder = SparkSession.builder.appName("CSVToLibSVM")
spark = session_builder.getOrCreate()

# Treat the first CSV line as the header and let Spark infer column types.
shots_csv_path = "Data/Datas/events_shot.csv"
csv_reader_options = {"header": True, "inferSchema": True}
df = spark.read.csv(shots_csv_path, **csv_reader_options)

In [4]:
# Columns exported as model inputs. ORDER MATTERS: the position in this list
# becomes the feature index in the LibSVM file (1-based there), so reordering
# or inserting entries silently changes what every "feature N" refers to.
features = [
    # `other_pp` and the `from_*` columns
    'other_pp', 'from_fk', 'from_ti', 'from_corner',
    'from_counter', 'from_gk', 'from_keeper', 'from_ko',
    # `header` and the `*_type` columns
    'header', 'corner_type', 'fk_type', 'pk_type',
    # `*_technique` columns
    'half_volley_technique', 'volley_technique', 'lob_technique',
    'overhead_technique', 'backheel_technique', 'diving_h_technique',
    # distance / angle / foot / pressure columns
    'distance_to_goal', 'shot_angle', 'preferred_foot_shot', 'under_pressure',
    # `shot_*` columns and `players_inside_area`
    'shot_aerial_won', 'shot_first_time', 'shot_one_on_one', 'shot_open_goal',
    'shot_follows_dribble', 'players_inside_area',
]

# Label column, kept as a one-element list.
target = ['goal']

In [5]:
# Convert each row into LibSVM format
def to_libsvm(row, feature_names=None, label_col='goal'):
    """Render one record as a line of LibSVM text: ``<label> <index>:<value> ...``.

    Args:
        row: Record supporting ``row[column_name]`` lookups (e.g. a Spark ``Row``
            or a plain ``dict``).
        feature_names: Ordered column names to export. The column at position
            ``i`` (0-based) is written with index ``i + 1``. When omitted, the
            notebook-level ``features`` list is used.
        label_col: Name of the column that holds the label.

    Returns:
        The formatted line as a ``str``.

    Raises:
        ValueError: If the label value is ``None``; a line without a numeric
            label cannot be parsed back.
    """
    def _as_token(value):
        # A column typed as boolean would otherwise be written as "True"/"False",
        # which is not a numeric token; emit 1/0 instead.
        if isinstance(value, bool):
            return str(int(value))
        return str(value)

    if feature_names is None:
        feature_names = features

    label = row[label_col]
    if label is None:
        raise ValueError(f"Missing label in column {label_col!r}: {row!r}")

    entries = []
    for index, name in enumerate(feature_names, start=1):
        value = row[name]
        if value is None:
            # A null cell would be written as the literal "None". Leave the
            # index out instead: the format is sparse, so an absent index is
            # read back as 0.
            continue
        entries.append(f"{index}:{_as_token(value)}")

    return f"{_as_token(label)} " + " ".join(entries)
import shutil

# Turn every DataFrame row into one LibSVM-formatted line.
rdd = df.rdd.map(to_libsvm)

# saveAsTextFile writes a *directory* of part files and refuses to write into a
# path that already exists, so a second run of this cell would fail. Remove the
# output of a previous run first. This only touches the local filesystem and
# does nothing if the path is absent.
libsvm_output_path = "Data/Datas/libsvm_format.txt"
shutil.rmtree(libsvm_output_path, ignore_errors=True)
rdd.saveAsTextFile(libsvm_output_path)

# Shut the session down; a later cell sets up its own SparkContext.
spark.stop()

                                                                                

### Ensembles (RDD-based API)

In [6]:
from pyspark import SparkConf, SparkContext
from pyspark.mllib.util import MLUtils

# getOrCreate hands back the running context when one exists, so re-running this
# cell no longer fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setAppName("EnsemblesRDDbasedAPI"))

In [8]:
# Read the exported LibSVM text back as an RDD of LabeledPoint and preview it.
libsvm_path = "Data/Datas/libsvm_format.txt"
data = MLUtils.loadLibSVMFile(sc, libsvm_path)
data.take(5)

                                                                                

[LabeledPoint(0.0, (28,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0231,33.69006,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0])),
 LabeledPoint(0.0, (28,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27],[0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.6031,29.76365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0])),
 LabeledPoint(0.0, (28,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.4904,35.318363,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0])),
 LabeledPoint(0.0, (28,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.8516,17.43129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0])),
 LabeledPoint(0.0, (28,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27],

In [None]:
# 70/30 train/test split. The seed is fixed so the split, and every metric
# computed from it, is the same on each run of the notebook.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

#### GBT

In [10]:
from pyspark.mllib.tree import GradientBoostedTrees

# Empty categoricalFeaturesInfo: every feature is treated as continuous.
gbt_settings = dict(categoricalFeaturesInfo={}, numIterations=100)
model = GradientBoostedTrees.trainClassifier(trainingData, **gbt_settings)

                                                                                

In [11]:

# Score the held-out feature vectors, then line each prediction up with its
# true label.
testFeatures = testData.map(lambda point: point.features)
predictions = model.predict(testFeatures)
testLabels = testData.map(lambda point: point.label)
labelsAndPredictions = testLabels.zip(predictions)

# Test error = share of held-out points whose prediction differs from the label.
misclassified = labelsAndPredictions.filter(lambda pair: pair[0] != pair[1])
testErr = misclassified.count() / float(testData.count())

                                                                                

In [13]:
# Report the hold-out error, its complement, and the full text of the ensemble.
print(f'Test Error = {testErr}')
accuracy = 1 - testErr
print(f'Accuracy = {accuracy}')
print('Learned classification GBT model:', model.toDebugString(), sep='\n')

Test Error = 0.09467232944465351
Accuracy = 0.9053276705553465
Learned classification GBT model:
TreeEnsembleModel classifier with 100 trees

  Tree 0:
    If (feature 19 <= 36.6488835)
     If (feature 18 <= 18.2753)
      If (feature 27 <= 1.5)
       Predict: -0.6858198496623774
      Else (feature 27 > 1.5)
       Predict: -0.8592620768352985
     Else (feature 18 > 18.2753)
      If (feature 19 <= 16.0231095)
       Predict: -0.9549022882912979
      Else (feature 19 > 16.0231095)
       Predict: -0.8917933130699088
    Else (feature 19 > 36.6488835)
     If (feature 8 <= 0.5)
      If (feature 27 <= 1.5)
       Predict: 0.031717098067808966
      Else (feature 27 > 1.5)
       Predict: -0.6070726915520629
     Else (feature 8 > 0.5)
      If (feature 18 <= 5.874499999999999)
       Predict: -0.35135135135135137
      Else (feature 18 > 5.874499999999999)
       Predict: -0.7629785002621919
  Tree 1:
    If (feature 11 <= 0.5)
     If (feature 25 <= 0.5)
      If (feature 19 <= 18

#### Random Forest

In [14]:
from pyspark.mllib.tree import RandomForest

# Binary random-forest classifier. Empty categoricalFeaturesInfo means every
# feature is treated as continuous. The seed is fixed because the forest's
# random sampling would otherwise produce a different model, and a different
# reported error, on every run.
model = RandomForest.trainClassifier(trainingData,
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     numTrees=100,
                                     featureSubsetStrategy="auto",
                                     impurity='gini',
                                     maxDepth=10,
                                     maxBins=32,
                                     seed=42)

25/01/25 21:49:18 WARN BlockManager: Task 3779 already completed, not releasing lock for rdd_6_0
25/01/25 21:49:22 WARN DAGScheduler: Broadcasting large task binary with size 1219.0 KiB
25/01/25 21:49:22 WARN DAGScheduler: Broadcasting large task binary with size 2021.4 KiB
25/01/25 21:49:23 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/01/25 21:49:25 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
                                                                                

In [15]:
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testData.count())

25/01/25 21:49:38 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
                                                                                

In [16]:
# Report the hold-out error, its complement, and the full text of the forest.
print(f'Test Error = {testErr}')
accuracy = 1 - testErr
print(f'Accuracy = {accuracy}')
print('Learned classification forest model:', model.toDebugString(), sep='\n')

Test Error = 0.09445728724262137
Accuracy = 0.9055427127573786
Learned classification forest model:
TreeEnsembleModel classifier with 100 trees

  Tree 0:
    If (feature 17 <= 0.5)
     If (feature 19 <= 26.5257175)
      If (feature 13 <= 0.5)
       If (feature 18 <= 19.01775)
        If (feature 24 <= 0.5)
         If (feature 22 <= 0.5)
          If (feature 20 <= 0.5)
           If (feature 14 <= 0.5)
            Predict: 0.0
           Else (feature 14 > 0.5)
            If (feature 18 <= 14.839649999999999)
             Predict: 0.0
            Else (feature 18 > 14.839649999999999)
             If (feature 18 <= 18.17335)
              Predict: 1.0
             Else (feature 18 > 18.17335)
              Predict: 0.0
          Else (feature 20 > 0.5)
           Predict: 0.0
         Else (feature 22 > 0.5)
          Predict: 0.0
        Else (feature 24 > 0.5)
         If (feature 8 <= 0.5)
          If (feature 19 <= 18.6145535)
           If (feature 19 <= 14.3020865)
       