In [1]:
from pyspark.sql.functions import expr
from pyspark.ml.feature import *
from pyspark.sql.types import *
from pyspark.ml.tuning import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from datetime import datetime

Leemos el csv y nos quedamos con las columnas que nos interesan. Además, hacemos las transformaciones de tipos necesarias

In [2]:
games_df = spark.read.csv('../data/games.csv', header=True, sep=',')

games_red_df = games_df.selectExpr('winner', 'gameDuration as duration', 'firstBlood',
                                   'firstTower', 'firstInhibitor', 'firstBaron',
                                   'firstDragon', 'firstRiftHerald', 't1_champ1id',
                                   't1_champ2id', 't1_champ3id', 't1_champ4id',
                                   't1_champ5id','t1_towerKills', 't1_inhibitorKills',
                                   't1_baronKills', 't1_dragonKills',
                                   't1_riftHeraldKills',
                                   't2_champ1id', 't2_champ2id', 't2_champ3id',
                                   't2_champ4id', 't2_champ5id', 't2_towerKills',
                                   't2_inhibitorKills', 't2_baronKills',
                                   't2_dragonKills', 't2_riftHeraldKills')

games_red_df = games_red_df.withColumn('firstBlood', games_red_df.firstBlood.cast(IntegerType()))
games_red_df = games_red_df.withColumn('firstTower', games_red_df.firstTower.cast(IntegerType()))
games_red_df = games_red_df.withColumn('firstInhibitor', games_red_df.firstInhibitor.cast(IntegerType()))
games_red_df = games_red_df.withColumn('firstBaron', games_red_df.firstBaron.cast(IntegerType()))
games_red_df = games_red_df.withColumn('firstDragon', games_red_df.firstDragon.cast(IntegerType()))
games_red_df = games_red_df.withColumn('firstRiftHerald', games_red_df.firstRiftHerald.cast(IntegerType()))
games_red_df = games_red_df.withColumn('t1_towerKills', games_red_df.t1_towerKills.cast(IntegerType()))
games_red_df = games_red_df.withColumn('t1_inhibitorKills', games_red_df.t1_inhibitorKills.cast(IntegerType()))
games_red_df = games_red_df.withColumn('t1_baronKills', games_red_df.t1_baronKills.cast(IntegerType()))
games_red_df = games_red_df.withColumn('t1_dragonKills', games_red_df.t1_dragonKills.cast(IntegerType()))
games_red_df = games_red_df.withColumn('t1_riftHeraldKills', games_red_df.t1_riftHeraldKills.cast(IntegerType()))
games_red_df = games_red_df.withColumn('t2_towerKills', games_red_df.t2_towerKills.cast(IntegerType()))
games_red_df = games_red_df.withColumn('t2_inhibitorKills', games_red_df.t2_inhibitorKills.cast(IntegerType()))
games_red_df = games_red_df.withColumn('t2_baronKills', games_red_df.t2_baronKills.cast(IntegerType()))
games_red_df = games_red_df.withColumn('t2_dragonKills', games_red_df.t2_dragonKills.cast(IntegerType()))
games_red_df = games_red_df.withColumn('t2_riftHeraldKills', games_red_df.t2_riftHeraldKills.cast(IntegerType()))
games_red_df = games_red_df.withColumn('duration', games_red_df.duration.cast(IntegerType()))
games_red_df = games_red_df.withColumn('winner', games_red_df.winner.cast(DoubleType()))

Separamos el dataframe en dos: uno de entrenamiento y uno de test

In [3]:
[train_df, test_df] = games_red_df.randomSplit([0.7, 0.3])

Agrupamos los personajes de cada equipo en un array

In [4]:
new_column_expression = expr('split( concat_ws( "," ,t1_champ1id, t1_champ2id, t1_champ3id, t1_champ4id, t1_champ5id), "," )' )
train_df = train_df.withColumn('t1_members_str',new_column_expression)
test_df = test_df.withColumn('t1_members_str', new_column_expression)
new_column_expression = expr('split( concat_ws( "," ,t2_champ1id, t2_champ2id, t2_champ3id, t2_champ4id, t2_champ5id), "," )' )
train_df = train_df.withColumn('t2_members_str',new_column_expression)
test_df = test_df.withColumn('t2_members_str', new_column_expression)

Contamos el número de veces que aparecen los personajes usados en cada equipo

In [5]:
cv1 = CountVectorizer(inputCol='t1_members_str', outputCol='t1_members')
cv2 = CountVectorizer(inputCol='t2_members_str', outputCol='t2_members')

Hacemos que la variable winner sea binaria: 0 si gana el equipo 1, y 1 si gana el equipo 2. Esto lo hacemos a través de un Binarizer

In [6]:
binarizer = Binarizer(inputCol='winner', outputCol='winner_b', threshold=1)

Agrupamos varias columnas y las transformamos en variables binarias para no tratarlas como variables categóricas

In [7]:
columns = ['firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron', 'firstDragon', 'firstRiftHerald']
new_columns = ['b_firstBlood', 'b_firstTower', 'b_firstInhibitor', 'b_firstBaron', 'b_firstDragon', 'b_firstRiftHerald' ]
ohe = OneHotEncoderEstimator(inputCols=columns, outputCols=new_columns,dropLast=False)

Unimos varias columnas con VectorAssembler y las escalamos con StandardScaler restándole la media a los valores y dividiéndolos por su desviación típica para trabajar con valores en la misma magnitud

In [8]:
columns = ["duration", "t1_towerKills", "t1_inhibitorKills", "t1_baronKills", "t1_dragonKills", "t1_riftHeraldKills",
          "t2_towerKills", "t2_inhibitorKills", "t2_baronKills", "t2_dragonKills", "t2_riftHeraldKills"]
assembler1 = VectorAssembler(inputCols=columns, outputCol="assembledColumns")
scaler = StandardScaler(inputCol="assembledColumns", outputCol="standardColumns", withStd=True, withMean=True)

Unimos varias columnas con vector assembler para utilizarlas para el PCA

In [9]:
columns = ['t1_members', 't2_members', 'b_firstBlood', 'b_firstBaron', 'b_firstDragon', 'b_firstInhibitor', 'b_firstRiftHerald', 'b_firstTower', 'standardColumns']
assembler2 = VectorAssembler(inputCols=columns, outputCol='features')
pca = PCA(inputCol='features', outputCol='red_features')

Ejercicio 1

In [10]:
logistic = LogisticRegression(labelCol='winner_b',featuresCol='red_features')
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, logistic])

params = ParamGridBuilder().addGrid(logistic.elasticNetParam, [0, 0.33]).\
    addGrid(pca.k, [50, 100]).addGrid(logistic.regParam, [0.1, 0.33]).addGrid(logistic.maxIter,[50]).build()

fromDate = datetime.now()

model = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).\
    setNumFolds(5).fit(train_df)

print((datetime.now() - fromDate).total_seconds())

print("------------------------------------------------------------------------------------------------")

for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[logistic.elasticNetParam], config[pca.k], config[logistic.regParam], config[logistic.maxIter])

print("------------------------------------------------------------------------------------------------")
    
print(evaluator.evaluate(model.transform(test_df)))

105.021157
------------------------------------------------------------------------------------------------
0.9600710814343387 0.0 50 0.1 50
0.9598457208408941 0.0 50 0.33 50
0.960315063044797 0.0 100 0.1 50
0.9595941274250777 0.0 100 0.33 50
0.9478012318997993 0.33 50 0.1 50
0.9274605728346121 0.33 50 0.33 50
0.9478012318997993 0.33 100 0.1 50
0.9274605728346121 0.33 100 0.33 50
------------------------------------------------------------------------------------------------
0.9610051428943428


Ejercicio 2

In [11]:
logistic = LogisticRegression(labelCol='winner_b',featuresCol='red_features')
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, logistic])

params = ParamGridBuilder().addGrid(logistic.elasticNetParam, [0, 0.33, 0.5, 0.66]).\
    addGrid(pca.k, [50, 100]).addGrid(logistic.regParam, [0.1, 0.33, 1, 5, 7]).addGrid(logistic.maxIter,[50, 10, 90]).build()

fromDate = datetime.now()

model = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).\
    setNumFolds(5).fit(train_df)

print((datetime.now() - fromDate).total_seconds())

print("------------------------------------------------------------------------------------------------")

for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[logistic.elasticNetParam], config[pca.k], config[logistic.regParam], config[logistic.maxIter])
    
print("------------------------------------------------------------------------------------------------")

print(evaluator.evaluate(model.transform(test_df)))

1133.995956
------------------------------------------------------------------------------------------------
0.9600710814343387 0.0 50 0.1 50
0.9600710814343387 0.0 50 0.1 10
0.9600710814343387 0.0 50 0.1 90
0.9598457208408941 0.0 50 0.33 50
0.9598457208408941 0.0 50 0.33 10
0.9598457208408941 0.0 50 0.33 90
0.9585443486810321 0.0 50 1.0 50
0.9585443486810321 0.0 50 1.0 10
0.9585443486810321 0.0 50 1.0 90
0.9327472088085138 0.0 50 5.0 50
0.9327472088085138 0.0 50 5.0 10
0.9327472088085138 0.0 50 5.0 90
0.9081000232112197 0.0 50 7.0 50
0.9081000232112197 0.0 50 7.0 10
0.9081000232112197 0.0 50 7.0 90
0.960315063044797 0.0 100 0.1 50
0.960288147510727 0.0 100 0.1 10
0.960315063044797 0.0 100 0.1 90
0.9595941274250777 0.0 100 0.33 50
0.9595941274250777 0.0 100 0.33 10
0.9595941274250777 0.0 100 0.33 90
0.958205713181232 0.0 100 1.0 50
0.958205713181232 0.0 100 1.0 10
0.958205713181232 0.0 100 1.0 90
0.9347965509684086 0.0 100 5.0 50
0.9347965509684086 0.0 100 5.0 10
0.9347965509684086 0.0

Ejercicio 3

In [12]:
rf = RandomForestClassifier(labelCol='winner_b', featuresCol='red_features')
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, rf])

params = ParamGridBuilder().addGrid(pca.k, [50, 100]).addGrid(rf.numTrees, [25, 50, 75]).\
    addGrid(rf.maxDepth, [2, 5, 7]).build()

fromDate = datetime.now()

model = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).\
    setNumFolds(5).fit(train_df)

print((datetime.now() - fromDate).total_seconds())

print("------------------------------------------------------------------------------------------------")

for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[pca.k], config[rf.numTrees], config[rf.maxDepth])
    
print("------------------------------------------------------------------------------------------------")

print(evaluator.evaluate(model.transform(test_df)))

397.445755
------------------------------------------------------------------------------------------------
0.930166502837336 50 25 2
0.9379770833152193 50 25 5
0.943977836487595 50 25 7
0.9304408465326807 50 50 2
0.9379218780408931 50 50 5
0.9438189825703925 50 50 7
0.9311901454763646 50 75 2
0.9363632139734741 50 75 5
0.9437593630817676 50 75 7
0.931577214062606 100 25 2
0.9367884567970133 100 25 5
0.9414636073240044 100 25 7
0.933077010244555 100 50 2
0.936096557046909 100 50 5
0.9405178125824019 100 50 7
0.9329109503872683 100 75 2
0.9362279919034004 100 75 5
0.9416572581722863 100 75 7
------------------------------------------------------------------------------------------------
0.9440791615129224


Ejercicio 4

In [14]:
gbt = GBTClassifier(labelCol='winner_b',featuresCol='features')
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, gbt])

params = ParamGridBuilder().addGrid(gbt.maxIter,[20, 80]).addGrid(gbt.maxDepth,[3, 7]).build()

fromDate = datetime.now()

model = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).\
    setNumFolds(5).fit(train_df)

print((datetime.now() - fromDate).total_seconds())

print("------------------------------------------------------------------------------------------------")

for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[gbt.maxIter], config[gbt.maxDepth])
    
print("------------------------------------------------------------------------------------------------")

print(evaluator.evaluate(model.transform(test_df)))

931.855396
------------------------------------------------------------------------------------------------
0.963057771519281 20 3
0.9686979292718068 20 7
0.9689484576003737 80 3
0.9690834125559534 80 7
------------------------------------------------------------------------------------------------
0.9686869344443721


Ejercicio 5

Observamos que, por lo general, los resultados del modelo Gradient-boosted tree son muy buenos. No obstante, con otros modelos también se han obtenido resultados muy buenos. Tenemos que, de media, el entrenamiento por modelo dura aproximadamente:
- Regresión logística: 9.7 segundos
- Random Forest: 22.08 segundos
- Gradient-boosted tree: 3.88 minutos

Vemos que utilizar un modelo Gradient-boosted tree es muy costoso, aunque dentro de lo que cabe es asumible. Si queremos una opción menos costosa en cuanto a tiempo, quizás no tan precisa pero mucho más rápida, podríamos usar una regresión logística. con una configuración en la que se utilice elasticNetParam = 0, PCA = 100, regParam = 0.1, y unas iteraciones que oscilen entre 50 y 90. Teniendo en cuenta esta información, intentamos obtener el modelo con más precisión:

In [24]:
logistic = LogisticRegression(labelCol='winner_b',featuresCol='red_features')
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, logistic])

params = ParamGridBuilder().addGrid(logistic.elasticNetParam,[0]).\
    addGrid(pca.k,[75]).addGrid(logistic.regParam,[0.006]).addGrid(logistic.maxIter,[10]).build()

fromDate = datetime.now()

model = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(params).\
    setNumFolds(5).fit(train_df)

print((datetime.now() - fromDate).total_seconds())

print("------------------------------------------------------------------------------------------------")

for (result, config) in zip(model.avgMetrics, params) :
    print(result, config[logistic.elasticNetParam], config[pca.k], config[logistic.regParam], config[logistic.maxIter])
    
print("------------------------------------------------------------------------------------------------")

print(evaluator.evaluate(model.transform(test_df)))

21.042631
------------------------------------------------------------------------------------------------
0.9626194609952959 0.0 75 0.006 10
------------------------------------------------------------------------------------------------
0.9618514419634139


In [None]:
0.9624818573602166 0.0 75 0.008 25
0.9624818573602166 0.0 75 0.008 40
0.9624818573602166 0.0 75 0.008 55

0.9620467417485841