# Ejemplo de transformación, limpieza y procesado de conjuntos de datos

In [2]:
# Load the raw CSV; the header row supplies column names and Spark infers the column types.
games_df = spark.read.csv('../data/games.csv', sep=',', header=True, inferSchema=True)

# Keep only the columns used by the rest of the notebook, renaming
# gameDuration to duration. The column order is the same as the listing order.
games_red_df = games_df.selectExpr([
    # label and match length
    'winner', 'gameDuration as duration',
    # first* columns
    'firstBlood', 'firstTower', 'firstInhibitor',
    'firstBaron', 'firstDragon', 'firstRiftHerald',
    # t1_* champion ids
    't1_champ1id', 't1_champ2id', 't1_champ3id', 't1_champ4id', 't1_champ5id',
    # t1_* kill columns
    't1_towerKills', 't1_inhibitorKills', 't1_baronKills',
    't1_dragonKills', 't1_riftHeraldKills',
    # t2_* champion ids
    't2_champ1id', 't2_champ2id', 't2_champ3id', 't2_champ4id', 't2_champ5id',
    # t2_* kill columns
    't2_towerKills', 't2_inhibitorKills', 't2_baronKills',
    't2_dragonKills', 't2_riftHeraldKills',
])

In [3]:
from pyspark.sql.functions import expr
from pyspark.ml.feature import CountVectorizer

# Join the five t1 champion ids with commas and split them back, which yields
# a single array column ('t1_members_str') for CountVectorizer to consume.
new_column_expression = expr('split( concat_ws( "," ,t1_champ1id, t1_champ2id, t1_champ3id, t1_champ4id, t1_champ5id), "," )' )
games_red_cv_df = (
    games_red_df
    .withColumn('t1_members_str', new_column_expression)
    # the individual id columns are no longer needed once the array exists
    .drop(*['t1_champ%did' % slot for slot in range(1, 6)])
)

# Fit the vocabulary on the array column, add the 't1_members' count vector,
# then discard the temporary array column.
cv = CountVectorizer().setInputCol('t1_members_str').setOutputCol('t1_members')
model = cv.fit(games_red_cv_df)
games_red_cv_df = model.transform(games_red_cv_df).drop('t1_members_str')

In [4]:
from pyspark.sql.functions import expr

# Same treatment for the t2 champion ids: comma-join, split into an array
# column ('t2_members_str'), and drop the five source columns.
new_column_expression = expr('split( concat_ws( "," ,t2_champ1id, t2_champ2id, t2_champ3id, t2_champ4id, t2_champ5id), "," )' )
games_red_cv_df = (
    games_red_cv_df
    .withColumn('t2_members_str', new_column_expression)
    .drop(*['t2_champ%did' % slot for slot in range(1, 6)])
)

# Fit a separate vocabulary for this column, add the 't2_members' count
# vector, then discard the temporary array column.
cv = CountVectorizer().setInputCol('t2_members_str').setOutputCol('t2_members')
model = cv.fit(games_red_cv_df)
games_red_cv_df = model.transform(games_red_cv_df).drop('t2_members_str')

In [5]:
from pyspark.ml.feature import Binarizer
from pyspark.sql.types import DoubleType

# Cast 'winner' to double before handing it to the Binarizer.
games_red_cv_df = games_red_cv_df.withColumn(
    'winner',
    games_red_cv_df['winner'].cast(DoubleType()),
)

# Values strictly above the threshold (1) become 1.0 and the rest 0.0; the
# result goes to 'winner_b' and the original 'winner' column is dropped.
transformer = Binarizer().setInputCol('winner').setOutputCol('winner_b').setThreshold(1)
games_red_cv_df = transformer.transform(games_red_cv_df).drop('winner')

In [6]:
# Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder. Import whichever
# exists under the old name so this cell, and any later cell that reuses the
# name, runs on both Spark 2.3/2.4 and Spark 3.x.
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

# Columns to one-hot encode; each encoded output gets a 'b_' prefix.
columns = ['firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron', 'firstDragon', 'firstRiftHerald']
new_columns = ['b_' + name for name in columns]

# dropLast=False keeps one vector slot for every category.
model = OneHotEncoderEstimator(inputCols=columns, outputCols=new_columns, dropLast=False)
transformer = model.fit(games_red_cv_df)

# Add the encoded columns and remove all the source columns in a single call.
games_red_cv_df = transformer.transform(games_red_cv_df).drop(*columns)

In [7]:
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Numeric columns to standardise: 'duration' followed by the five kill
# columns of t1 and then the same five of t2.
columns = ["duration"] + [
    team + "_" + stat
    for team in ("t1", "t2")
    for stat in ("towerKills", "inhibitorKills", "baronKills", "dragonKills", "riftHeraldKills")
]

# StandardScaler works on a single vector column, so pack the numeric columns first.
assembler = VectorAssembler().setInputCols(columns).setOutputCol("assembledColumns")
games_red_cv_df = assembler.transform(games_red_cv_df)

# Centre (withMean) and scale (withStd) the assembled vector into 'standardColumns'.
model = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="assembledColumns",
    outputCol="standardColumns",
)
transformer = model.fit(games_red_cv_df)
games_red_cv_df = transformer.transform(games_red_cv_df)

In [8]:
# Concatenate the count vectors, the one-hot vectors and the standardised
# numeric vector into a single 'features' column.
columns = [
    't1_members', 't2_members',
    'b_firstBlood', 'b_firstBaron', 'b_firstDragon',
    'b_firstInhibitor', 'b_firstRiftHerald', 'b_firstTower',
    'standardColumns',
]
assembler = VectorAssembler().setInputCols(columns).setOutputCol('features')
games_red_cv_df = assembler.transform(games_red_cv_df)

# Final two-column dataset: the binarised winner renamed to 'label', plus 'features'.
dataset = games_red_cv_df.selectExpr(['winner_b as label', 'features'])

In [9]:
from pyspark.ml.feature import PCA

# Project 'features' onto 50 principal components ('red_features') and drop
# the original vector column from the result.
model = PCA().setK(50).setInputCol('features').setOutputCol('red_features')
transformer = model.fit(dataset)
red_dataset = transformer.transform(dataset).drop('features')

# Ejemplo de entrenamiento con Pipelines

In [10]:
from pyspark.ml.classification import LogisticRegression

# Fixed seed so the 70/30 train/test partition, and every metric computed from
# it, is the same on each run instead of changing with every execution.
SPLIT_SEED = 42
train_df, test_df = games_red_df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Column preparation that is not a pipeline stage is applied to both splits by hand.
# Cast the label column to double before it reaches the Binarizer stage.
train_df = train_df.withColumn('winner', train_df.winner.cast(DoubleType()))
test_df = test_df.withColumn('winner', test_df.winner.cast(DoubleType()))

# Array column with the five t1 champion ids (input of cv1).
new_column_expression = expr('split( concat_ws( "," ,t1_champ1id, t1_champ2id, t1_champ3id, t1_champ4id, t1_champ5id), "," )' )
train_df = train_df.withColumn('t1_members_str',new_column_expression)
test_df = test_df.withColumn('t1_members_str', new_column_expression)

# Array column with the five t2 champion ids (input of cv2).
new_column_expression = expr('split( concat_ws( "," ,t2_champ1id, t2_champ2id, t2_champ3id, t2_champ4id, t2_champ5id), "," )' )
train_df = train_df.withColumn('t2_members_str',new_column_expression)
test_df = test_df.withColumn('t2_members_str', new_column_expression)

# Stage definitions. Nothing is fitted in this cell.
cv1 = CountVectorizer(inputCol='t1_members_str', outputCol='t1_members')
cv2 = CountVectorizer(inputCol='t2_members_str', outputCol='t2_members')

# Binarised label: values strictly above 1 become 1.0, the rest 0.0.
binarizer=Binarizer(inputCol='winner', outputCol='winner_b', threshold=1)

# One-hot encode the first* columns into 'b_'-prefixed vectors.
columns = ['firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron', 'firstDragon', 'firstRiftHerald']
new_columns = [ 'b_firstBlood', 'b_firstTower', 'b_firstInhibitor', 'b_firstBaron', 'b_firstDragon', 'b_firstRiftHerald' ]
ohe = OneHotEncoderEstimator(inputCols=columns, outputCols=new_columns,dropLast=False)

# Pack the numeric columns into one vector and standardise it.
columns = ["duration", "t1_towerKills", "t1_inhibitorKills", "t1_baronKills", "t1_dragonKills", "t1_riftHeraldKills",
          "t2_towerKills", "t2_inhibitorKills", "t2_baronKills", "t2_dragonKills", "t2_riftHeraldKills"]
assembler1 = VectorAssembler(inputCols=columns, outputCol="assembledColumns")
scaler = StandardScaler(inputCol="assembledColumns", outputCol="standardColumns", withStd=True, withMean=True)

# Concatenate every intermediate vector into the final 'features' column.
columns = ['t1_members', 't2_members', 'b_firstBlood', 'b_firstBaron', 'b_firstDragon', 'b_firstInhibitor', 'b_firstRiftHerald', 'b_firstTower', 'standardColumns']
assembler2 = VectorAssembler(inputCols=columns, outputCol='features')

# No k is set on the PCA stage here: it has to be supplied before fitting
# (for example through a parameter grid).
pca = PCA(inputCol='features', outputCol='red_features')
logistic = LogisticRegression(labelCol='winner_b',featuresCol='red_features')

In [11]:
from pyspark.ml.tuning import ParamGridBuilder

# Hyper-parameter grid: 2 elastic-net mixes x 2 PCA sizes x 2 regularisation
# strengths x 1 iteration cap = 8 candidate configurations.
params = (
    ParamGridBuilder()
    .addGrid(logistic.elasticNetParam, [0, 0.33])
    .addGrid(pca.k, [50, 100])
    .addGrid(logistic.regParam, [0.1, 0.33])
    .addGrid(logistic.maxIter, [50])
    .build()
)

In [12]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# Fixed seed so the fold assignment, and therefore avgMetrics, is the same on every run.
CV_SEED = 42

# Accuracy of 'prediction' against the binarised label 'winner_b'.
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', predictionCol='prediction', labelCol='winner_b')

# Every stage lives inside the pipeline, so each fold fits its own copy of
# them on that fold's training part.
pipeline = Pipeline().setStages([cv1, cv2, binarizer, ohe, assembler1, scaler, assembler2, pca, logistic])

# 3-fold cross-validation over every configuration in params.
cv = (
    CrossValidator()
    .setEstimator(pipeline)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(params)
    .setNumFolds(3)
    .setSeed(CV_SEED)
)
model = cv.fit(train_df)

In [13]:
# One line per grid configuration: average cross-validation metric, then the
# PCA k, regParam and elasticNetParam values that produced it.
for avg_metric, param_map in zip(model.avgMetrics, params):
    pca_k = param_map[pca.k]
    reg_param = param_map[logistic.regParam]
    elastic_net = param_map[logistic.elasticNetParam]
    print(avg_metric, pca_k, reg_param, elastic_net)

0.9609312382568909 50 0.1 0.0
0.9600947005642335 50 0.33 0.0
0.9615464461090166 100 0.1 0.0
0.959758767703472 100 0.33 0.0
0.9471420499615943 50 0.1 0.33
0.9278745831433264 50 0.33 0.33
0.9471420499615943 100 0.1 0.33
0.9278745831433264 100 0.33 0.33


In [15]:
# Apply the cross-validated model to the held-out split and score the
# predictions with the same evaluator.
test_predictions = model.transform(test_df)
evaluator.evaluate(test_predictions)

0.960321510589436