In [88]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, FloatType
from pyspark.sql.functions import col, udf, when

import numpy as np

In [89]:
def convert_string_to_float(x):
    x_replace_minus = x.replace(u'\u2212', '-')
    if x_replace_minus == '-':
        return np.nan
    else:
        return float(x_replace_minus)

udf_convert_string_to_float = udf(lambda x: convert_string_to_float(x), FloatType())

## Dataset: Teams

In [84]:
schema = StructType([
    StructField("id", StringType(), True),
    StructField("country", StringType(), True)])
teams = spark.read.csv("../data/AFC/en.teams.tsv", sep="\t", header=False)

In [85]:
print("Dataset teams format: {0}".format(teams.dtypes))
print("Number of country: {0}".format(teams.count()))
# Check if some countries are identified with the same id
# print("Country with similar id string:")
# teams.groupBy("id").count().filter(col("count") != 1).show()
print("Preview of the dataset:")
teams.rdd.first()


Dataset teams format: [('_c0', 'string'), ('_c1', 'string')]
Number of country: 313
Preview of the dataset:


Row(_c0=u'AN', _c1=u'Aden')

In [95]:
schema = StructType([
    StructField("rankGroup_local", StringType(), True),
    StructField("rankGroup_global", StringType(), True),
    StructField("teamGroup_team", StringType(), True),
    StructField("ratingGroup_rating", StringType(), True),
    StructField("highestGroup_rank_max", StringType(), True),
    StructField("highestGroup_rating_max", StringType(), True),
    StructField("averageGroup_rank_avg", StringType(), True),
    StructField("averageGroup_rating_avg", StringType(), True),
    StructField("lowestGroup_rank_min", StringType(), True),
    StructField("lowestGroup_rating_min", StringType(), True),
    StructField("change3mGroup_rank_three_month_change", StringType(), True),
    StructField("change3mGroup_rating_three_month_change", StringType(), True),
    StructField("change6mGroup_rank_six_month_change", StringType(), True),
    StructField("change6mGroup_rating_six_month_change", StringType(), True),
    StructField("change1yGroup_rank_one_year_change", StringType(), True),
    StructField("change1yGroup_rating_one_year_change", StringType(), True),
    StructField("change2yGroup_rank_two_year_change", StringType(), True),
    StructField("change2yGroup_rating_two_year_change", StringType(), True),
    StructField("change5yGroup_rank_five_year_change", StringType(), True),
    StructField("change5yGroup_rating_five_year_change", StringType(), True),
    StructField("change10yGroup_rank_ten_year_change", StringType(), True),
    StructField("change10yGroup_rating_ten_year_change", StringType(), True),
    StructField("matchesGroup_total", StringType(), True),
    StructField("matchesGroup_home", StringType(), True),
    StructField("matchesGroup_away", StringType(), True),
    StructField("matchesGroup_neutral", StringType(), True),
    StructField("matchesGroup_wins", StringType(), True),
    StructField("matchesGroup_losses", StringType(), True),
    StructField("matchesGroup_draws", StringType(), True),
    StructField("goalsGroup_for", StringType(), True),
    StructField("goalsGroup_against", StringType(), True)
])

names_to_convert = schema.names
names_to_convert.remove("teamGroup_team")


AFC_qualifying_start = spark.read.csv("../data/AFC/2014_World_Cup_AFC_qualifying_start.tsv", sep="\t", 
                                      schema=schema, header=False)\
                                 .select([udf_convert_string_to_float(col(name)).alias(name) for name in names_to_convert] + ["teamGroup_team"])

In [99]:
AFC_qualifying_start.count()

43