# Ejemplo de transformación, limpieza y procesado de conjuntos de datos

In [3]:
games_df = spark.read.csv('../data/games.csv', header=True, sep=',', inferSchema=True)
games_red_df = games_df.selectExpr('winner',
                                   'gameDuration as duration',
                                   'firstBlood',
                                   'firstTower',
                                   'firstInhibitor',
                                   'firstBaron',
                                   'firstDragon',
                                   'firstRiftHerald',
                                   't1_champ1id',
                                   't1_champ2id',
                                   't1_champ3id',
                                   't1_champ4id',
                                   't1_champ5id',
                                   't1_towerKills',
                                   't1_inhibitorKills',
                                   't1_baronKills',
                                   't1_dragonKills',
                                   't1_riftHeraldKills',
                                   't2_champ1id',
                                   't2_champ2id',
                                   't2_champ3id',
                                   't2_champ4id',
                                   't2_champ5id',
                                   't2_towerKills',
                                   't2_inhibitorKills',
                                   't2_baronKills',
                                   't2_dragonKills',
                                   't2_riftHeraldKills')

In [4]:
from pyspark.sql.functions import expr
from pyspark.ml.feature import CountVectorizer
new_column_expression = expr('split( concat_ws( "," ,t1_champ1id, t1_champ2id, t1_champ3id, t1_champ4id, t1_champ5id), "," )' )
games_red_cv_df = games_red_df.withColumn('t1_members_str',new_column_expression).drop('t1_champ1id', 't1_champ2id', 't1_champ3id', 't1_champ4id', 't1_champ5id')
cv = CountVectorizer(inputCol='t1_members_str', outputCol='t1_members')
model = cv.fit(games_red_cv_df)
games_red_cv_df=model.transform(games_red_cv_df).drop('t1_members_str')

In [5]:
from pyspark.sql.functions import expr
new_column_expression = expr('split( concat_ws( "," ,t2_champ1id, t2_champ2id, t2_champ3id, t2_champ4id, t2_champ5id), "," )' )
games_red_cv_df = games_red_cv_df.withColumn('t2_members_str',new_column_expression).drop('t2_champ1id', 't2_champ2id', 't2_champ3id', 't2_champ4id', 't2_champ5id')
cv = CountVectorizer(inputCol='t2_members_str', outputCol='t2_members')
model = cv.fit(games_red_cv_df)
games_red_cv_df=model.transform(games_red_cv_df).drop('t2_members_str')

In [6]:
from pyspark.ml.feature import Binarizer
from pyspark.sql.types import DoubleType
games_red_cv_df = games_red_cv_df.withColumn('winner', games_red_cv_df.winner.cast(DoubleType()))
transformer=Binarizer(inputCol='winner', outputCol='winner_b', threshold=1)
games_red_cv_df=transformer.transform(games_red_cv_df).drop('winner')

In [7]:
from pyspark.ml.feature import OneHotEncoderEstimator
columns = ['firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron', 'firstDragon', 'firstRiftHerald']
new_columns = [ 'b_firstBlood', 'b_firstTower', 'b_firstInhibitor', 'b_firstBaron', 'b_firstDragon', 'b_firstRiftHerald' ]
model = OneHotEncoderEstimator(inputCols=columns, outputCols=new_columns,dropLast=False)
transformer=model.fit(games_red_cv_df)
games_red_cv_df=transformer.transform(games_red_cv_df)
for column in columns :
    games_red_cv_df = games_red_cv_df.drop(column)

In [8]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
columns = ["duration", "t1_towerKills", "t1_inhibitorKills", "t1_baronKills", "t1_dragonKills", "t1_riftHeraldKills",
          "t2_towerKills", "t2_inhibitorKills", "t2_baronKills", "t2_dragonKills", "t2_riftHeraldKills"]
assembler = VectorAssembler(inputCols=columns, outputCol="assembledColumns")
games_red_cv_df=assembler.transform(games_red_cv_df)
model = StandardScaler(inputCol="assembledColumns", outputCol="standardColumns", withStd=True, withMean=True)
transformer = model.fit(games_red_cv_df)
games_red_cv_df=transformer.transform(games_red_cv_df)

In [9]:
columns = ['t1_members', 't2_members', 'b_firstBlood', 'b_firstBaron', 'b_firstDragon', 'b_firstInhibitor', 'b_firstRiftHerald', 'b_firstTower', 'standardColumns']
assembler = VectorAssembler(inputCols=columns, outputCol='features')
games_red_cv_df = assembler.transform(games_red_cv_df)
dataset = games_red_cv_df.selectExpr('winner_b as label', 'features')

In [10]:
from pyspark.ml.feature import PCA
model = PCA(inputCol='features', outputCol='red_features',k=50)
transformer = model.fit(dataset)
red_dataset=transformer.transform(dataset).drop('features')

# Ejemplo de entrenamiento con Pipelines

In [11]:
from pyspark.ml.classification import LogisticRegression
[train_df, test_df]=games_red_df.randomSplit([0.7, 0.3])
train_df = train_df.withColumn('winner', train_df.winner.cast(DoubleType()))
test_df = test_df.withColumn('winner', test_df.winner.cast(DoubleType()))
new_column_expression = expr('split( concat_ws( "," ,t1_champ1id, t1_champ2id, t1_champ3id, t1_champ4id, t1_champ5id), "," )' )
train_df = train_df.withColumn('t1_members_str',new_column_expression)
test_df = test_df.withColumn('t1_members_str', new_column_expression)
new_column_expression = expr('split( concat_ws( "," ,t2_champ1id, t2_champ2id, t2_champ3id, t2_champ4id, t2_champ5id), "," )' )
train_df = train_df.withColumn('t2_members_str',new_column_expression)
test_df = test_df.withColumn('t2_members_str', new_column_expression)
cv1 = CountVectorizer(inputCol='t1_members_str', outputCol='t1_members')
cv2 = CountVectorizer(inputCol='t2_members_str', outputCol='t2_members')
binarizer=Binarizer(inputCol='winner', outputCol='winner_b', threshold=1)
columns = ['firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron', 'firstDragon', 'firstRiftHerald']
new_columns = [ 'b_firstBlood', 'b_firstTower', 'b_firstInhibitor', 'b_firstBaron', 'b_firstDragon', 'b_firstRiftHerald' ]
ohe = OneHotEncoderEstimator(inputCols=columns, outputCols=new_columns,dropLast=False)
columns = ["duration", "t1_towerKills", "t1_inhibitorKills", "t1_baronKills", "t1_dragonKills", "t1_riftHeraldKills",
          "t2_towerKills", "t2_inhibitorKills", "t2_baronKills", "t2_dragonKills", "t2_riftHeraldKills"]
assembler1 = VectorAssembler(inputCols=columns, outputCol="assembledColumns")
scaler = StandardScaler(inputCol="assembledColumns", outputCol="standardColumns", withStd=True, withMean=True)
columns = ['t1_members', 't2_members', 'b_firstBlood', 'b_firstBaron', 'b_firstDragon', 'b_firstInhibitor', 'b_firstRiftHerald', 'b_firstTower', 'standardColumns']
assembler2 = VectorAssembler(inputCols=columns, outputCol='features')
pca = PCA(inputCol='features', outputCol='red_features')
logistic = LogisticRegression(labelCol='winner_b',featuresCol='red_features')

In [12]:
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder().addGrid(logistic.elasticNetParam, [0, 0.33]).\
    addGrid(pca.k, [50, 100]).addGrid(logistic.regParam, [0.1, 0.33]).addGrid(logistic.maxIter,[50]).build()

In [13]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from datetime import date, time, datetime
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, logistic])
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).setNumFolds(3)
model = cv.fit(train_df)

Se importa el metodo now de la libreria datetime y se añade al final de cada una de las iteraciones de la validacion cruzada.

In [18]:
inicio=datetime.now()
for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[pca.k], config[logistic.regParam], config[logistic.elasticNetParam],datetime.now())
fin=datetime.now()
tiempo=fin-inicio
print("tiempo medio transcurrido:")
print(tiempo/7)
print("La precision del modelo es:")
evaluator.evaluate(model.transform(test_df))

0.9610477779428821 50 0.1 0.0 2023-02-27 18:22:53.948474
0.9598881584128409 50 0.33 0.0 2023-02-27 18:22:53.953825
0.8112438715150688 100 0.1 0.0 2023-02-27 18:22:53.955276
0.959392372568648 100 0.33 0.0 2023-02-27 18:22:53.955636
0.8022294736513452 50 0.1 0.33 2023-02-27 18:22:53.956042
0.7890024484969359 50 0.33 0.33 2023-02-27 18:22:53.956854
0.8022294736513452 100 0.1 0.33 2023-02-27 18:22:53.959195
0.7890024484969359 100 0.33 0.33 2023-02-27 18:22:53.959393
tiempo medio transcurrido:
0:00:00.001639
La precision del modelo es:


0.9599658725470893

Ejercicio 2. Se modifica elasticNetParam probando dos valores mas entre 0 y 1, regParam probando 3 valores mas entre 0 y 10 y maxlter probando 2 valores mas entre 1 y 100.

elasticNetParam= 0.60, regParam=5 y maxIter=90

In [41]:
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder().addGrid(logistic.elasticNetParam, [0, 0.60]).\
    addGrid(pca.k, [50, 100]).addGrid(logistic.regParam, [0.1, 5]).addGrid(logistic.maxIter,[90]).build()
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, logistic])
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).setNumFolds(3)
model = cv.fit(train_df)
evaluator.evaluate(model.transform(test_df))
from datetime import date, time, datetime
inicio=datetime.now()
for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[pca.k], config[logistic.regParam], config[logistic.elasticNetParam],datetime.now())
fin=datetime.now()
tiempo=fin-inicio
print("tiempo medio transcurrido:")
print(tiempo/7)
print("La precision del modelo es:")
evaluator.evaluate(model.transform(test_df))

0.8103726793407865 50 0.1 0.0 2023-02-27 18:13:19.745998
0.7765723489379416 50 5.0 0.0 2023-02-27 18:13:19.746160
0.8101500165539968 100 0.1 0.0 2023-02-27 18:13:19.746571
0.7781720812788453 100 5.0 0.0 2023-02-27 18:13:19.746646
0.934033426935555 50 0.1 0.6 2023-02-27 18:13:19.746722
0.5092822923265609 50 5.0 0.6 2023-02-27 18:13:19.746818
0.7954121644770833 100 0.1 0.6 2023-02-27 18:13:19.762214
0.5092822923265609 100 5.0 0.6 2023-02-27 18:13:19.762856
tiempo medio transcurrido:
0:00:00.002470
La precision del modelo es:


0.9373325491733647

elasticNetParam= 0.20, regParam=2 y maxIter=25

In [20]:
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder().addGrid(logistic.elasticNetParam, [0, 0.20]).\
    addGrid(pca.k, [50, 100]).addGrid(logistic.regParam, [0.1, 2]).addGrid(logistic.maxIter,[25]).build()
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, logistic])
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).setNumFolds(3)
model = cv.fit(train_df)
evaluator.evaluate(model.transform(test_df))
from datetime import date, time, datetime
inicio=datetime.now()
for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[pca.k], config[logistic.regParam], config[logistic.elasticNetParam],datetime.now())
fin=datetime.now()
tiempo=fin-inicio
print("tiempo medio transcurrido:")
print(tiempo/7)
print("La precision del modelo es:")
evaluator.evaluate(model.transform(test_df))

0.9610477779428821 50 0.1 0.0 2023-02-27 18:26:20.752849
0.8049951398950478 50 2.0 0.0 2023-02-27 18:26:20.753074
0.8112438715150688 100 0.1 0.0 2023-02-27 18:26:20.771439
0.8054914565537539 100 2.0 0.0 2023-02-27 18:26:20.771963
0.8058662820554174 50 0.1 0.2 2023-02-27 18:26:20.772292
0.5083270246274084 50 2.0 0.2 2023-02-27 18:26:20.772348
0.952629150413907 100 0.1 0.2 2023-02-27 18:26:20.772702
0.5083270246274084 100 2.0 0.2 2023-02-27 18:26:20.773336
tiempo medio transcurrido:
0:00:00.003005
La precision del modelo es:


0.9599658725470893

elasticNetParam= 0.60, regParam=1 y maxIter=10

In [21]:
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder().addGrid(logistic.elasticNetParam, [0, 0.60]).\
    addGrid(pca.k, [50, 100]).addGrid(logistic.regParam, [0.1, 1]).addGrid(logistic.maxIter,[10]).build()
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, logistic])
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).setNumFolds(3)
model = cv.fit(train_df)
evaluator.evaluate(model.transform(test_df))
from datetime import date, time, datetime
inicio=datetime.now()
for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[pca.k], config[logistic.regParam], config[logistic.elasticNetParam],datetime.now())
fin=datetime.now()
tiempo=fin-inicio
print("tiempo medio transcurrido:")
print(tiempo/7)
print("La precision del modelo es:")
evaluator.evaluate(model.transform(test_df))

0.8111887631808984 50 0.1 0.0 2023-02-27 18:28:24.306478
0.8090410755785312 50 1.0 0.0 2023-02-27 18:28:24.306626
0.9609093725423957 100 0.1 0.0 2023-02-27 18:28:24.306716
0.8091780776988183 100 1.0 0.0 2023-02-27 18:28:24.306805
0.7965797590325698 50 0.1 0.6 2023-02-27 18:28:24.306893
0.5083270246274084 50 1.0 0.6 2023-02-27 18:28:24.316011
0.7965797590325698 100 0.1 0.6 2023-02-27 18:28:24.317384
0.5083270246274084 100 1.0 0.6 2023-02-27 18:28:24.317446
tiempo medio transcurrido:
0:00:00.001719
La precision del modelo es:


0.9608846885869922

Ejercicio3. Uso de Random Forest. Para ello se crea un pipeline cambiando el logistic por el random forest.Para ver con mayor claridad los cambios, se elige en el primer Random Forest un valor bajo, el el segundo uno medio y en el ultimo valores altos.
Se ha probado con 2 arboles y con una profundidad de 2 en el primer caso, en el siguiente se prueba con 50 arboles y 5 de profundidad y en el ultimo con 100 arboles y 10 de profundidad.

In [22]:
from pyspark.ml.classification import RandomForestClassifier
[train_df, test_df]=games_red_df.randomSplit([0.7, 0.3])
train_df = train_df.withColumn('winner', train_df.winner.cast(DoubleType()))
test_df = test_df.withColumn('winner', test_df.winner.cast(DoubleType()))
new_column_expression = expr('split( concat_ws( "," ,t1_champ1id, t1_champ2id, t1_champ3id, t1_champ4id, t1_champ5id), "," )' )
train_df = train_df.withColumn('t1_members_str',new_column_expression)
test_df = test_df.withColumn('t1_members_str', new_column_expression)
new_column_expression = expr('split( concat_ws( "," ,t2_champ1id, t2_champ2id, t2_champ3id, t2_champ4id, t2_champ5id), "," )' )
train_df = train_df.withColumn('t2_members_str',new_column_expression)
test_df = test_df.withColumn('t2_members_str', new_column_expression)
cv1 = CountVectorizer(inputCol='t1_members_str', outputCol='t1_members')
cv2 = CountVectorizer(inputCol='t2_members_str', outputCol='t2_members')
binarizer=Binarizer(inputCol='winner', outputCol='winner_b', threshold=1)
columns = ['firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron', 'firstDragon', 'firstRiftHerald']
new_columns = [ 'b_firstBlood', 'b_firstTower', 'b_firstInhibitor', 'b_firstBaron', 'b_firstDragon', 'b_firstRiftHerald' ]
ohe = OneHotEncoderEstimator(inputCols=columns, outputCols=new_columns,dropLast=False)
columns = ["duration", "t1_towerKills", "t1_inhibitorKills", "t1_baronKills", "t1_dragonKills", "t1_riftHeraldKills",
          "t2_towerKills", "t2_inhibitorKills", "t2_baronKills", "t2_dragonKills", "t2_riftHeraldKills"]
assembler1 = VectorAssembler(inputCols=columns, outputCol="assembledColumns")
scaler = StandardScaler(inputCol="assembledColumns", outputCol="standardColumns", withStd=True, withMean=True)
columns = ['t1_members', 't2_members', 'b_firstBlood', 'b_firstBaron', 'b_firstDragon', 'b_firstInhibitor', 'b_firstRiftHerald', 'b_firstTower', 'standardColumns']
assembler2 = VectorAssembler(inputCols=columns, outputCol='features')
pca = PCA(inputCol='features', outputCol='red_features')
randomF= RandomForestClassifier(labelCol='winner_b',featuresCol='red_features')

In [23]:
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder().addGrid(pca.k, [50, 100]).addGrid(randomF.numTrees, [2]).addGrid(randomF.maxDepth,[2]).build()


In [24]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, randomF])
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).setNumFolds(3)
model = cv.fit(train_df)
evaluator.evaluate(model.transform(test_df))
from datetime import date, time, datetime
inicio=datetime.now()
for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[pca.k], config[randomF.numTrees], config[randomF.maxDepth],datetime.now())
fin=datetime.now()
tiempo=fin-inicio
print("tiempo medio transcurrido:")
print(tiempo)
print("La precision del modelo es:")
evaluator.evaluate(model.transform(test_df))

0.8770691092968255 50 2 2 2023-02-27 18:29:39.344131
0.6450244335667823 100 2 2 2023-02-27 18:29:39.344275
tiempo medio transcurrido:
0:00:00.000402
La precision del modelo es:


0.9253867745936419

In [25]:
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder().addGrid(pca.k, [50, 100]).addGrid(randomF.numTrees, [50]).addGrid(randomF.maxDepth,[5]).build()
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, randomF])
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).setNumFolds(3)
model = cv.fit(train_df)
evaluator.evaluate(model.transform(test_df))
from datetime import date, time, datetime
inicio=datetime.now()
for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[pca.k], config[randomF.numTrees], config[randomF.maxDepth],datetime.now())
fin=datetime.now()
tiempo=fin-inicio
print("tiempo medio transcurrido:")
print(tiempo)
print("La precision del modelo es:")
evaluator.evaluate(model.transform(test_df))

0.9376017400762685 50 50 5 2023-02-27 18:31:22.017891
0.9352532285144657 100 50 5 2023-02-27 18:31:22.018193
tiempo medio transcurrido:
0:00:00.000751
La precision del modelo es:


0.9355702069325674

In [26]:
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder().addGrid(pca.k, [50, 100]).addGrid(randomF.numTrees, [100]).addGrid(randomF.maxDepth,[10]).build()
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, randomF])
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).setNumFolds(3)
model = cv.fit(train_df)
evaluator.evaluate(model.transform(test_df))
from datetime import date, time, datetime
inicio=datetime.now()
for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[pca.k], config[randomF.numTrees], config[randomF.maxDepth],datetime.now())
fin=datetime.now()
tiempo=fin-inicio
print("tiempo medio transcurrido:")
print(tiempo)
print("La precision del modelo es:")
evaluator.evaluate(model.transform(test_df))

0.9534432133787154 50 100 10 2023-02-27 18:35:32.622993
0.9477241081613559 100 100 10 2023-02-27 18:35:32.632938
tiempo medio transcurrido:
0:00:00.010861
La precision del modelo es:


0.9552190090736993

Ejercicio4. Uso de Gradient Boosted Tree.En el primer caso se prueba con 10 maxIer y 6 maxDepth, sin reduccion de variables PCA.
Luego se repite el proceso, reducciendo las variables PCA  se eliminan 'b_firstTower'y 'standardColumns".En el segundo caso, se prueba con 20 maxIter y 12 de maxDepth.(probe con 90 y 10 y no terminaba..)

In [28]:
from pyspark.ml.classification import GBTClassifier
[train_df, test_df]=games_red_df.randomSplit([0.7, 0.3])
train_df = train_df.withColumn('winner', train_df.winner.cast(DoubleType()))
test_df = test_df.withColumn('winner', test_df.winner.cast(DoubleType()))
new_column_expression = expr('split( concat_ws( "," ,t1_champ1id, t1_champ2id, t1_champ3id, t1_champ4id, t1_champ5id), "," )' )
train_df = train_df.withColumn('t1_members_str',new_column_expression)
test_df = test_df.withColumn('t1_members_str', new_column_expression)
new_column_expression = expr('split( concat_ws( "," ,t2_champ1id, t2_champ2id, t2_champ3id, t2_champ4id, t2_champ5id), "," )' )
train_df = train_df.withColumn('t2_members_str',new_column_expression)
test_df = test_df.withColumn('t2_members_str', new_column_expression)
cv1 = CountVectorizer(inputCol='t1_members_str', outputCol='t1_members')
cv2 = CountVectorizer(inputCol='t2_members_str', outputCol='t2_members')
binarizer=Binarizer(inputCol='winner', outputCol='winner_b', threshold=1)
columns = ['firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron', 'firstDragon', 'firstRiftHerald']
new_columns = [ 'b_firstBlood', 'b_firstTower', 'b_firstInhibitor', 'b_firstBaron', 'b_firstDragon', 'b_firstRiftHerald' ]
ohe = OneHotEncoderEstimator(inputCols=columns, outputCols=new_columns,dropLast=False)
columns = ["duration", "t1_towerKills", "t1_inhibitorKills", "t1_baronKills", "t1_dragonKills", "t1_riftHeraldKills",
          "t2_towerKills", "t2_inhibitorKills", "t2_baronKills", "t2_dragonKills", "t2_riftHeraldKills"]
assembler1 = VectorAssembler(inputCols=columns, outputCol="assembledColumns")
scaler = StandardScaler(inputCol="assembledColumns", outputCol="standardColumns", withStd=True, withMean=True)
columns = ['t1_members', 't2_members', 'b_firstBlood', 'b_firstBaron', 'b_firstDragon', 'b_firstInhibitor', 'b_firstRiftHerald', 'b_firstTower', 'standardColumns']
assembler2 = VectorAssembler(inputCols=columns, outputCol='features')
pca = PCA(inputCol='features', outputCol='red_features')
GBT= GBTClassifier(labelCol='winner_b',featuresCol='red_features')

In [29]:
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder().addGrid(pca.k, [50, 100]).addGrid(GBT.maxIter, [10]).addGrid(GBT.maxDepth,[6]).build()


In [31]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, GBT])
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).setNumFolds(3)
model = cv.fit(train_df)
evaluator.evaluate(model.transform(test_df))
from datetime import date, time, datetime
inicio=datetime.now()
for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[pca.k], config[GBT.maxIter],config[GBT.maxDepth],datetime.now())
fin=datetime.now()
tiempo=fin-inicio
print("tiempo medio transcurrido:")
print(tiempo)
print("La precision del modelo es:")
evaluator.evaluate(model.transform(test_df))

0.9447292616955199 50 10 6 2023-02-27 18:46:46.000559
0.9542787305589964 100 10 6 2023-02-27 18:46:46.000716
tiempo medio transcurrido:
0:00:00.000422
La precision del modelo es:


0.9542219657626466

Se eliminan de PCA las variables 'b_firstTower', 'standardColumns'

In [32]:
from pyspark.ml.classification import GBTClassifier
[train_df, test_df]=games_red_df.randomSplit([0.7, 0.3])
train_df = train_df.withColumn('winner', train_df.winner.cast(DoubleType()))
test_df = test_df.withColumn('winner', test_df.winner.cast(DoubleType()))
new_column_expression = expr('split( concat_ws( "," ,t1_champ1id, t1_champ2id, t1_champ3id, t1_champ4id, t1_champ5id), "," )' )
train_df = train_df.withColumn('t1_members_str',new_column_expression)
test_df = test_df.withColumn('t1_members_str', new_column_expression)
new_column_expression = expr('split( concat_ws( "," ,t2_champ1id, t2_champ2id, t2_champ3id, t2_champ4id, t2_champ5id), "," )' )
train_df = train_df.withColumn('t2_members_str',new_column_expression)
test_df = test_df.withColumn('t2_members_str', new_column_expression)
cv1 = CountVectorizer(inputCol='t1_members_str', outputCol='t1_members')
cv2 = CountVectorizer(inputCol='t2_members_str', outputCol='t2_members')
binarizer=Binarizer(inputCol='winner', outputCol='winner_b', threshold=1)
columns = ['firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron', 'firstDragon', 'firstRiftHerald']
new_columns = [ 'b_firstBlood', 'b_firstTower', 'b_firstInhibitor', 'b_firstBaron', 'b_firstDragon', 'b_firstRiftHerald' ]
ohe = OneHotEncoderEstimator(inputCols=columns, outputCols=new_columns,dropLast=False)
columns = ["duration", "t1_towerKills", "t1_inhibitorKills", "t1_baronKills", "t1_dragonKills", "t1_riftHeraldKills",
          "t2_towerKills", "t2_inhibitorKills", "t2_baronKills", "t2_dragonKills", "t2_riftHeraldKills"]
assembler1 = VectorAssembler(inputCols=columns, outputCol="assembledColumns")
scaler = StandardScaler(inputCol="assembledColumns", outputCol="standardColumns", withStd=True, withMean=True)
columns = ['t1_members', 't2_members', 'b_firstBlood', 'b_firstBaron', 'b_firstDragon', 'b_firstInhibitor', 'b_firstRiftHerald']
assembler2 = VectorAssembler(inputCols=columns, outputCol='features')
pca = PCA(inputCol='features', outputCol='red_features')
GBT= GBTClassifier(labelCol='winner_b',featuresCol='red_features')
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder().addGrid(pca.k, [50, 100]).addGrid(GBT.maxIter, [10]).addGrid(GBT.maxDepth,[6]).build()
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, GBT])
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).setNumFolds(3)
model = cv.fit(train_df)
evaluator.evaluate(model.transform(test_df))
from datetime import date, time, datetime
inicio=datetime.now()
for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[pca.k], config[GBT.maxIter],config[GBT.maxDepth],datetime.now())
fin=datetime.now()
tiempo=fin-inicio
print("tiempo medio transcurrido:")
print(tiempo)
print("La precision del modelo es:")
evaluator.evaluate(model.transform(test_df))

0.8910948645136687 50 10 6 2023-02-27 18:51:46.010620
0.8913742299618225 100 10 6 2023-02-27 18:51:46.010734
tiempo medio transcurrido:
0:00:00.000345
La precision del modelo es:


0.8897647888226623

20 maxIter y 12 de maxDepth y con reduccion de PCA

In [34]:
from pyspark.ml.classification import GBTClassifier
[train_df, test_df]=games_red_df.randomSplit([0.7, 0.3])
train_df = train_df.withColumn('winner', train_df.winner.cast(DoubleType()))
test_df = test_df.withColumn('winner', test_df.winner.cast(DoubleType()))
new_column_expression = expr('split( concat_ws( "," ,t1_champ1id, t1_champ2id, t1_champ3id, t1_champ4id, t1_champ5id), "," )' )
train_df = train_df.withColumn('t1_members_str',new_column_expression)
test_df = test_df.withColumn('t1_members_str', new_column_expression)
new_column_expression = expr('split( concat_ws( "," ,t2_champ1id, t2_champ2id, t2_champ3id, t2_champ4id, t2_champ5id), "," )' )
train_df = train_df.withColumn('t2_members_str',new_column_expression)
test_df = test_df.withColumn('t2_members_str', new_column_expression)
cv1 = CountVectorizer(inputCol='t1_members_str', outputCol='t1_members')
cv2 = CountVectorizer(inputCol='t2_members_str', outputCol='t2_members')
binarizer=Binarizer(inputCol='winner', outputCol='winner_b', threshold=1)
columns = ['firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron', 'firstDragon', 'firstRiftHerald']
new_columns = [ 'b_firstBlood', 'b_firstTower', 'b_firstInhibitor', 'b_firstBaron', 'b_firstDragon', 'b_firstRiftHerald' ]
ohe = OneHotEncoderEstimator(inputCols=columns, outputCols=new_columns,dropLast=False)
columns = ["duration", "t1_towerKills", "t1_inhibitorKills", "t1_baronKills", "t1_dragonKills", "t1_riftHeraldKills",
          "t2_towerKills", "t2_inhibitorKills", "t2_baronKills", "t2_dragonKills", "t2_riftHeraldKills"]
assembler1 = VectorAssembler(inputCols=columns, outputCol="assembledColumns")
scaler = StandardScaler(inputCol="assembledColumns", outputCol="standardColumns", withStd=True, withMean=True)
columns = ['t1_members', 't2_members', 'b_firstBlood', 'b_firstBaron', 'b_firstDragon', 'b_firstInhibitor', 'b_firstRiftHerald']
assembler2 = VectorAssembler(inputCols=columns, outputCol='features')
pca = PCA(inputCol='features', outputCol='red_features')
GBT= GBTClassifier(labelCol='winner_b',featuresCol='red_features')
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder().addGrid(pca.k, [50, 100]).addGrid(GBT.maxIter, [20]).addGrid(GBT.maxDepth,[12]).build()
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, GBT])
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).setNumFolds(3)
model = cv.fit(train_df)
evaluator.evaluate(model.transform(test_df))
from datetime import date, time, datetime
inicio=datetime.now()
for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[pca.k], config[GBT.maxIter],config[GBT.maxDepth],datetime.now())
fin=datetime.now()
tiempo=fin-inicio
print("tiempo medio transcurrido:")
print(tiempo)
print("La precision del modelo es:")
evaluator.evaluate(model.transform(test_df))

0.8722374558296935 50 20 12 2023-02-27 19:49:58.107114
0.8681433060151047 100 20 12 2023-02-27 19:49:58.107247
tiempo medio transcurrido:
0:00:00.001093
La precision del modelo es:


0.8740731104462078

Para el calculo del tiempo se usa datetime.now al principio de bucle for y cuando termina dicho bucle. Esto añade algo mas de tiempo que el tiempo real pero como se calcula en todos los procesos por igual, el valor añadido es el mismo.

Logistica
elasticNetParam= 0.60, regParam=5 y maxIter=90
tiempo medio transcurrido:
0:00:00.002470
La precision del modelo es:
0.9373325491733647

elasticNetParam= 0.20, regParam=2 y maxIter=25
tiempo medio transcurrido:
0:00:00.003005
La precision del modelo es:
0.9599658725470893

elasticNetParam= 0.60, regParam=1 y maxIter=10--
tiempo medio transcurrido:
0:00:00.001719
La precision del modelo es:
0.9608846885869922

Random Forest
numTress=2 y maxDepth=2
tiempo medio transcurrido:
0:00:00.000402
La precision del modelo es:
0.9253867745936419

numTress=50y maxDepth=5
tiempo medio transcurrido:
0:00:00.000751
La precision del modelo es:
0.9355702069325674

numTress=100 maxDepth=10
tiempo medio transcurrido:
0:00:00.010861
La precision del modelo es:
0.9552190090736993


Gradient Boosted Tree
10 maxIer y 2 maxDepth sin reduccion--
tiempo medio transcurrido:
0:00:00.000422
La precision del modelo es:
0.9542219657626466

10 maxIer y 2 maxDepth con reduccion--
tiempo medio transcurrido:
0:00:00.000345
La precision del modelo es:
0.8897647888226623

20 maxIer y 12maxDepth con reduccion--
tiempo medio transcurrido:
0:00:00.001093
La precision del modelo es:
0.8740731104462078


Conclusiones:
La cinfiguracion mas existosa ha sido la logistica con los siguientes parametros elasticNetParam= 0.60, regParam=1 y maxIter=10, alcanzando una precision de 96%.
La que mas ha tardado ha sido el Random Forest con numTress=100 maxDepth=10 dandonos una precision de 95,5%.
Si se eliminan variables en la PCA puede ocurrir dos casos:
1- que esas variables no tengan mucha correlacion con nuestra variable objetivo y apenas influya en la precision.
2- que tengan mas peso y que nos influya
En el Gradient Boosted Tree se ejecuto la misma configuracion, 10 maxIter y 2 maxDepth, uno con todas las variables y en el otro eliminando 2 variables.
En este caso nos ha bajado de un 95,4% a 88,97% la precision si bien el tiempo se ha reducido,no merece la pena ya que el porcentaje ha bajado demasiado, quizas con una reduccion de tiempo mayor cabria la oportunidad de pensarlo.
Cabe destacar que una cosa es el tiempo de entrenamiento entre rondas, y otro es el tiempo de entrenar el modelo desde que se inicia, siendo los tiempos de entrenamiento cuando la profundidad y numero de arboles es muy alto muy superior al resto.