In [3]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.sql.functions import udf, col

from itertools import combinations

In [47]:
class WorldCupFirstRound:
    def __init__(self, spark_session, model_classifier):
        self.spark = spark_session
        self.classification_model = model_classifier
    
    def __str__(self):
        pass
    
    def run(self):
        self.load_data_teams()
        self.define_df_matches_by_group()
    
    def load_data_groups(self):
        schema = StructType([
            StructField("group", StringType(), True),
            StructField("country_1", StringType(), True),
            StructField("country_2", StringType(), True),
            StructField("country_3", StringType(), True),
            StructField("country_4", StringType(), True)])
        return self.spark.read.csv("../data/groups.csv", sep=",", schema=schema, header=False)

    def define_list_matches_by_group(self):
        udf_define_matches = udf(lambda x,y,z,t: list(combinations([x, y, z, t], 2)), ArrayType(ArrayType(StringType())))
        udf_matches_group = udf(lambda group, matches: [[group] + match for match in matches], ArrayType(ArrayType(StringType())))
        all_matches = self.load_data_groups()\
        .withColumn("matches", udf_define_matches(col("country_1"), col("country_2"), col("country_3"), col("country_4")))\
        .withColumn("group_matches", udf_matches_group(col("group"), col("matches")))\
        .select("group_matches").rdd.map(lambda x: x["group_matches"])\
        .collect()
        return [y for x in all_matches for y in x]
    
    def define_df_matches_by_group(self):
        udf_trim = udf(lambda x: x.strip(), StringType())
        list_matches_by_group = self.define_list_matches_by_group()
        schema = schema = StructType([StructField("group", StringType(), True), 
                                      StructField("country_1", StringType(), True),
                                      StructField("country_2", StringType(), True)])
        print(self.teams.rdd.first())
        return (self.spark.createDataFrame(list_matches_by_group, schema=schema)
                .select(col("group"), udf_trim(col("country_1")).alias("country_1"), 
                                      udf_trim(col("country_2")).alias("country_2"))
                .join(self.teams, col("country_1") == col("country"))
                .drop("country").drop("country_1").withColumnRenamed("team", "team_1")
                .join(self.teams, col("country_2") == col("country"))
                .drop("country").drop("country_2").withColumnRenamed("team", "team_2"))
    
    def load_data_teams(self):
        schema = StructType([
            StructField("team", StringType(), True),
            StructField("country", StringType(), True)])
        self.teams = spark.read.csv("../data/common/en.teams.tsv", sep="\t", header=False, schema=schema)
    

In [48]:
world_cup_1st_round = WorldCupFirstRound(spark, None)
world_cup_1st_round.run()

Row(team=u'AN', country=u'Aden')
+-----+------+------+
|group|team_1|team_2|
+-----+------+------+
|    A|    BR|    HR|
|    A|    BR|    MX|
|    A|    BR|    CM|
|    A|    HR|    MX|
|    A|    HR|    CM|
+-----+------+------+
only showing top 5 rows

None


In [21]:
world_cup_1st_round.define_list_matches_by_group()

[[u'A', u' Brazil', u' Croatia'],
 [u'A', u' Brazil', u' Mexico'],
 [u'A', u' Brazil', u' Cameroon'],
 [u'A', u' Croatia', u' Mexico'],
 [u'A', u' Croatia', u' Cameroon'],
 [u'A', u' Mexico', u' Cameroon'],
 [u'B', u' Spain', u' Holland'],
 [u'B', u' Spain', u' Chile'],
 [u'B', u' Spain', u' Australia'],
 [u'B', u' Holland', u' Chile'],
 [u'B', u' Holland', u' Australia'],
 [u'B', u' Chile', u' Australia'],
 [u'C', u' Colombia', u' Greece'],
 [u'C', u' Colombia', u' Ivory Coast'],
 [u'C', u' Colombia', u' Japan'],
 [u'C', u' Greece', u' Ivory Coast'],
 [u'C', u' Greece', u' Japan'],
 [u'C', u' Ivory Coast', u' Japan'],
 [u'D', u' Uruguay', u' Costa Rica'],
 [u'D', u' Uruguay', u' England'],
 [u'D', u' Uruguay', u' Italy'],
 [u'D', u' Costa Rica', u' England'],
 [u'D', u' Costa Rica', u' Italy'],
 [u'D', u' England', u' Italy'],
 [u'E', u' Switzerland', u' Ecuador'],
 [u'E', u' Switzerland', u' France'],
 [u'E', u' Switzerland', u' Honduras'],
 [u'E', u' Ecuador', u' France'],
 [u'E', u

In [22]:
udf_define_matches = udf(lambda x,y,z,t: list(combinations([x, y, z, t], 2)), ArrayType(ArrayType(StringType())))

In [4]:
data = data.withColumn("matches", udf_define_matches(col("team_1"), col("team_2"), col("team_3"), col("team_4")))

In [5]:
udf_matches_group = udf(lambda group, matches: [[group] + match for match in matches], ArrayType(ArrayType(StringType())))


all_matches = data\
.withColumn("group_matches", udf_matches_group(col("group"), col("matches")))\
.select("group_matches").rdd.map(lambda x: x["group_matches"]).collect()

matches_flattened_list = [y for x in all_matches for y in x]

In [6]:
schema = schema = StructType([
    StructField("group", StringType(), True), 
    StructField("team_1", StringType(), True),
    StructField("team_2", StringType(), True),
])

matches = spark.createDataFrame(matches_flattened_list, schema=schema)

matches.show(5)

+-----+--------+---------+
|group|  team_1|   team_2|
+-----+--------+---------+
|    A|  Brazil|  Croatia|
|    A|  Brazil|   Mexico|
|    A|  Brazil| Cameroon|
|    A| Croatia|   Mexico|
|    A| Croatia| Cameroon|
+-----+--------+---------+
only showing top 5 rows



In [7]:
#matches.coalesce(1).write.csv("../data/first_round_matches")

## First round define all matches

In [13]:
schema = StructType([
    StructField("group", StringType(), True),
    StructField("team_1", StringType(), True),
    StructField("team_2", StringType(), True)])
all_matches = spark.read.csv("../data/first_round_matches/", sep=",", header=False, schema=schema)
all_matches.filter(col("group") == "A").show()
all_matches.filter(col("team_1") == "Brazil").show()
all_matches.filter(col("team_2") == "Brazil").show()

+-----+-------+--------+
|group| team_1|  team_2|
+-----+-------+--------+
|    A| Brazil| Croatia|
|    A| Brazil|  Mexico|
|    A| Brazil|Cameroon|
|    A|Croatia|  Mexico|
|    A|Croatia|Cameroon|
|    A| Mexico|Cameroon|
+-----+-------+--------+

+-----+------+--------+
|group|team_1|  team_2|
+-----+------+--------+
|    A|Brazil| Croatia|
|    A|Brazil|  Mexico|
|    A|Brazil|Cameroon|
+-----+------+--------+

+-----+------+------+
|group|team_1|team_2|
+-----+------+------+
+-----+------+------+



In [14]:
groups = all_matches.select("group").distinct().rdd.map(lambda x: x["group"]).collect()
print("Groups: {0}".format(groups))

Groups: [u'F', u'E', u'B', u'D', u'C', u'A', u'G', u'H']


In [18]:
dic_matches = {group:{} for group in groups}
rdd_matches = all_matches.rdd.map(lambda x: (x["group"], x["team_1"], x["team_2"])).collect()
for group in dic_matches.keys():
    group_matches = filter(lambda x: x[0] == group, rdd_matches)
    for matches in group_matches:
        team_1 = matches[1]
        team_2 = matches[2]
        dic_matches[group][team_1+'/'+team_2] = None

In [20]:
dic_matches["A"]

{u'Brazil/Cameroon': None,
 u'Brazil/Croatia': None,
 u'Brazil/Mexico': None,
 u'Croatia/Cameroon': None,
 u'Croatia/Mexico': None,
 u'Mexico/Cameroon': None}