In [33]:
# Imports
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, concat, col, lit, array, udf, dense_rank, concat_ws, first
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, DateType, TimestampType
from pyspark.sql import Window as W

In [34]:
# Initialize and configure Spark
sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("FixturePipeline")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")
spark = SparkSession.builder \
        .config(conf=sparkConf).getOrCreate()

In [35]:
# Setup abstract Google storage FileSystem
bucket_id = "data_de2022_ng"
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set('temporaryGcsBucket', bucket_id)
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

In [36]:
# # Load data into DataFrame
# google_cloud_storage_path = 'gs://data_de2022_ng/'

# df = spark.read.format("json") \
#                          .option("inferSchema", "true") \
#                          .option("multiLine", "true") \
#                          .load(f'{google_cloud_storage_path}statistics.json')

In [43]:
df = spark.read.format("json") \
                         .option("inferSchema", "true") \
                         .load('/home/jovyan/data/statistics.json')

df.printSchema()

root
 |-- fixture_id: string (nullable = true)
 |-- statistics: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- team: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- logo: string (nullable = true)
 |    |-- name: string (nullable = true)



In [44]:
statistics_table = df.select('fixture_id',
                                   col('team.id').alias('team_id'),
                                   explode('statistics').alias('statistics')) \
                                   .withColumn('fixture_team_id', concat_ws('','fixture_id','team_id')) \
                                   .withColumn('type', col('statistics').getItem('type')) \
                                   .withColumn('value', col('statistics').getItem('value')).groupby('fixture_team_id').pivot('type').agg(first('value')) \
                                   .withColumn('fixture_id', split('fixture_team_id', '').getItem(0)) \
                                   .withColumn('team_id', split('fixture_team_id', '_').getItem(1))


statistics_table.show()

+---------------+---------------+-------------+------------+-----+----------------+--------+--------+---------------+---------+---------------+--------------+-------------+----------------+-----------+------------+------------+----------+-------+
|fixture_team_id|Ball Possession|Blocked Shots|Corner Kicks|Fouls|Goalkeeper Saves|Offsides|Passes %|Passes accurate|Red Cards|Shots insidebox|Shots off Goal|Shots on Goal|Shots outsidebox|Total Shots|Total passes|Yellow Cards|fixture_id|team_id|
+---------------+---------------+-------------+------------+-----+----------------+--------+--------+---------------+---------+---------------+--------------+-------------+----------------+-----------+------------+------------+----------+-------+
|     8557341118|            53%|            3|           7|   13|               4|       1|     81%|            355|     null|              6|             4|            3|               4|         10|         436|           1|         8|   null|
|       8557

In [None]:
# Write statistics table to BigQuery
season_table.show()

season_table.write.format('bigquery') \
    .option('table', '{0}.worldcup.statistics'.format(project_id)) \
    .option("temporaryGcsBucket","data_de2022_ng") \
    .mode('overwrite').save()