In [1]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.sql.functions import udf, col

from itertools import combinations

In [2]:
schema = StructType([
    StructField("group", StringType(), True),
    StructField("team_1", StringType(), True),
    StructField("team_2", StringType(), True),
    StructField("team_3", StringType(), True),
    StructField("team_4", StringType(), True),
])

data = spark.read.csv("../data/groups.csv", sep=",", schema=schema, header=False)
data.show()

+-----+------------+-------------------+------------+------------+
|group|      team_1|             team_2|      team_3|      team_4|
+-----+------------+-------------------+------------+------------+
|    A|      Brazil|            Croatia|      Mexico|    Cameroon|
|    B|       Spain|            Holland|       Chile|   Australia|
|    C|    Colombia|             Greece| Ivory Coast|       Japan|
|    D|     Uruguay|         Costa Rica|     England|       Italy|
|    E| Switzerland|            Ecuador|      France|    Honduras|
|    F|   Argentina| Bosnia-Herzegovina|        Iran|     Nigeria|
|    G|     Germany|           Portugal|       Ghana|         USA|
|    H|     Belgium|            Algeria|      Russia| South Korea|
+-----+------------+-------------------+------------+------------+



In [3]:
udf_define_matches = udf(lambda x,y,z,t: list(combinations([x, y, z, t], 2)), ArrayType(ArrayType(StringType())))

In [4]:
data = data.withColumn("matches", udf_define_matches(col("team_1"), col("team_2"), col("team_3"), col("team_4")))

In [5]:
udf_matches_group = udf(lambda group, matches: [[group] + match for match in matches], ArrayType(ArrayType(StringType())))


all_matches = data\
.withColumn("group_matches", udf_matches_group(col("group"), col("matches")))\
.select("group_matches").rdd.map(lambda x: x["group_matches"]).collect()

matches_flattened_list = [y for x in all_matches for y in x]

In [6]:
schema = schema = StructType([
    StructField("group", StringType(), True), 
    StructField("team_1", StringType(), True),
    StructField("team_2", StringType(), True),
])

matches = spark.createDataFrame(matches_flattened_list, schema=schema)

matches.show(5)

+-----+--------+---------+
|group|  team_1|   team_2|
+-----+--------+---------+
|    A|  Brazil|  Croatia|
|    A|  Brazil|   Mexico|
|    A|  Brazil| Cameroon|
|    A| Croatia|   Mexico|
|    A| Croatia| Cameroon|
+-----+--------+---------+
only showing top 5 rows



In [7]:
#matches.coalesce(1).write.csv("../data/first_round_matches")

## First round define all matches

In [13]:
schema = StructType([
    StructField("group", StringType(), True),
    StructField("team_1", StringType(), True),
    StructField("team_2", StringType(), True)])
all_matches = spark.read.csv("../data/first_round_matches/", sep=",", header=False, schema=schema)
all_matches.filter(col("group") == "A").show()
all_matches.filter(col("team_1") == "Brazil").show()
all_matches.filter(col("team_2") == "Brazil").show()

+-----+-------+--------+
|group| team_1|  team_2|
+-----+-------+--------+
|    A| Brazil| Croatia|
|    A| Brazil|  Mexico|
|    A| Brazil|Cameroon|
|    A|Croatia|  Mexico|
|    A|Croatia|Cameroon|
|    A| Mexico|Cameroon|
+-----+-------+--------+

+-----+------+--------+
|group|team_1|  team_2|
+-----+------+--------+
|    A|Brazil| Croatia|
|    A|Brazil|  Mexico|
|    A|Brazil|Cameroon|
+-----+------+--------+

+-----+------+------+
|group|team_1|team_2|
+-----+------+------+
+-----+------+------+



In [14]:
groups = all_matches.select("group").distinct().rdd.map(lambda x: x["group"]).collect()
print("Groups: {0}".format(groups))

Groups: [u'F', u'E', u'B', u'D', u'C', u'A', u'G', u'H']


In [18]:
dic_matches = {group:{} for group in groups}
rdd_matches = all_matches.rdd.map(lambda x: (x["group"], x["team_1"], x["team_2"])).collect()
for group in dic_matches.keys():
    group_matches = filter(lambda x: x[0] == group, rdd_matches)
    for matches in group_matches:
        team_1 = matches[1]
        team_2 = matches[2]
        dic_matches[group][team_1+'/'+team_2] = None

In [20]:
dic_matches["A"]

{u'Brazil/Cameroon': None,
 u'Brazil/Croatia': None,
 u'Brazil/Mexico': None,
 u'Croatia/Cameroon': None,
 u'Croatia/Mexico': None,
 u'Mexico/Cameroon': None}