In [1]:
# Imports
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import col, dense_rank, coalesce
from pyspark.sql import Row, Window

In [2]:
# Assemble the Spark configuration as one chained expression
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")  # Do not change
    .setAppName("pipeline_1_app")  # Change to app name
    .set("spark.driver.memory", "2g")  # Do not change
    .set("spark.executor.cores", "1")  # Do not change
    .set("spark.driver.cores", "1")  # Do not change
)

# The Spark session is the entry point to the Spark SQL engine
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Tell Hadoop which file system classes handle the gs:// scheme
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

### What are the best-performing attacking players of each team?
List the top 3 players for each team based on goal contributions (goals + assists).

### Load data from Google Cloud Bucket

In [3]:
# Load the input data from Google Cloud Storage
# Root path of the bucket that holds the CSV files
gsc_root_path = 'gs://data_a2/'  # Change to your own bucket name. Upload the data files first


def _nullable_schema(fields):
    """Build a StructType of nullable fields from (column name, data type) pairs."""
    return StructType([StructField(name, dtype, True) for name, dtype in fields])


def _read_csv_with_header(schema, file_name):
    """Read a CSV file with a header row from the bucket, applying the given schema."""
    reader = spark.read.schema(schema).format("csv").option("header", "true")
    return reader.load(gsc_root_path + file_name)


# Player assists: shared attributes carry an "_a" suffix so they stay
# distinguishable from the goals columns after the two frames are joined
assists_schema = _nullable_schema([
    ("rank_assists", IntegerType()),
    ("player_name_a", StringType()),
    ("team_name_a", StringType()),
    ("exp_assists", FloatType()),
    ("assists", FloatType()),
    ("minutes_a", IntegerType()),
    ("matches_a", IntegerType()),
    ("country_a", StringType()),
])
df_player_assists = _read_csv_with_header(assists_schema, 'player_expected_assists.csv')

# Player goals: same layout as the assists file, with a "_g" suffix
goals_schema = _nullable_schema([
    ("rank_goals", IntegerType()),
    ("player_name_g", StringType()),
    ("team_name_g", StringType()),
    ("exp_goals", FloatType()),
    ("goals", FloatType()),
    ("minutes_g", IntegerType()),
    ("matches_g", IntegerType()),
    ("country_g", StringType()),
])
df_player_goals = _read_csv_with_header(goals_schema, 'player_expected_goals.csv')

In [4]:
# Preview the first five rows of each raw dataframe (assists first, then goals)
for raw_frame in (df_player_assists, df_player_goals):
    raw_frame.show(5)

+------------+------------------+--------------+-----------+-------+---------+---------+---------+
|rank_assists|     player_name_a|   team_name_a|exp_assists|assists|minutes_a|matches_a|country_a|
+------------+------------------+--------------+-----------+-------+---------+---------+---------+
|           1|       Rafael Leao|         Milan|        8.1|    9.0|     2523|       34|      POR|
|           2|  Federico Dimarco|Internazionale|        7.5|    6.0|     2105|       30|      ITA|
|           3|Albert Gudmundsson|         Genoa|        6.5|    4.0|     3022|       35|      ISL|
|           4|   Matteo Politano|        Napoli|        6.0|    7.0|     2384|       37|      ITA|
|           5|  Hakan Calhanoglu|Internazionale|        5.8|    3.0|     2576|       32|      TUR|
+------------+------------------+--------------+-----------+-------+---------+---------+---------+
only showing top 5 rows

+----------+----------------+--------------+---------+-----+---------+---------+----

### Join dataframes

In [5]:
# Full outer join of the assists and goals dataframes on the player name,
# so players that appear in only one of the two files are still included
same_player = df_player_assists["player_name_a"] == df_player_goals['player_name_g']
df_player = df_player_assists.join(df_player_goals, same_player, "outer")

# These attributes exist once per source ("_a" from assists, "_g" from goals).
# Merge each pair into a single unsuffixed column, taking the assists value
# when present and the goals value otherwise, then drop the suffixed originals.
# The tuple order determines the order of the merged columns in the result.
for merged_name in ("player_name", "minutes", "matches", "country", "team_name"):
    from_assists = merged_name + "_a"
    from_goals = merged_name + "_g"
    df_player = df_player.withColumn(
        merged_name, coalesce(df_player[from_assists], df_player[from_goals])
    ).drop(from_assists, from_goals)

# Replace remaining NULL values with 0
# NOTE(review): per the PySpark docs a numeric fill value only touches numeric
# columns; that includes rank_assists / rank_goals for players missing from one file
df_player = df_player.na.fill(0)

df_player.show(5)

+------------+-----------+-------+----------+---------+-----+-------------+-------+-------+-------+---------+
|rank_assists|exp_assists|assists|rank_goals|exp_goals|goals|  player_name|minutes|matches|country|team_name|
+------------+-----------+-------+----------+---------+-----+-------------+-------+-------+-------+---------+
|         116|        1.7|    1.0|       395|      0.2|  0.0|Aaron Caricol|   1380|     22|    ESP|    Genoa|
|         149|        1.4|    3.0|       102|      3.1|  3.0|Abdou Harroui|   1025|     18|    NED|Frosinone|
|         197|        1.1|    0.0|       347|      0.4|  1.0| Adam Marusic|   3106|     37|    MNE|    Lazio|
|         319|        0.4|    0.0|       356|      0.4|  0.0|  Adam Masina|   1083|     20|    MAR|   Torino|
|         268|        0.7|    0.0|       422|      0.1|  0.0|   Adam Obert|    940|     17|    SVK| Cagliari|
+------------+-----------+-------+----------+---------+-----+-------------+-------+-------+-------+---------+
only showi

### Create new columns

In [6]:
# Derive the contribution metrics on top of the joined player dataframe.
#
# withColumn is used instead of selectExpr("*", "... as name") because this cell
# reassigns df_player: withColumn replaces a column that already exists, so the
# cell can be re-run without producing duplicate column names (which would make
# the later reference to goal_contributions ambiguous and fail).
# On a first run the three columns are appended in the same order as before.
df_player = (
    df_player
    # Actual output: goals + assists
    .withColumn("goal_contributions", col("assists") + col("goals"))
    # Expected output: expected goals + expected assists
    .withColumn("exp_goal_contributions", col("exp_assists") + col("exp_goals"))
    # Actual minus expected; a positive value means the player outperformed expectation
    .withColumn(
        "performance_difference",
        col("goal_contributions") - col("exp_goal_contributions"),
    )
)

# NOTE: the inputs are single-precision floats, so sums can display as e.g. 1.9000001
df_player.show(5)

+------------+-----------+-------+----------+---------+-----+-------------+-------+-------+-------+---------+------------------+----------------------+----------------------+
|rank_assists|exp_assists|assists|rank_goals|exp_goals|goals|  player_name|minutes|matches|country|team_name|goal_contributions|exp_goal_contributions|performance_difference|
+------------+-----------+-------+----------+---------+-----+-------------+-------+-------+-------+---------+------------------+----------------------+----------------------+
|         116|        1.7|    1.0|       395|      0.2|  0.0|Aaron Caricol|   1380|     22|    ESP|    Genoa|               1.0|             1.9000001|            -0.9000001|
|         149|        1.4|    3.0|       102|      3.1|  3.0|Abdou Harroui|   1025|     18|    NED|Frosinone|               6.0|                   4.5|                   1.5|
|         197|        1.1|    0.0|       347|      0.4|  1.0| Adam Marusic|   3106|     37|    MNE|    Lazio|               1

### Create dataframe with top 3 of each team

In [7]:
# Window: one partition per team, players ordered by goal contributions (highest first)
team_ranking = Window.partitionBy("team_name").orderBy(col("goal_contributions").desc())

# Apply the window and add each player's rank within their team
df_player_windowed = df_player.withColumn("rank_in_team", dense_rank().over(team_ranking))

# Keep ranks 1 to 3 of each team. dense_rank gives tied players the same rank,
# so a team can contribute more than three rows when contributions are equal.
df_player_top3 = df_player_windowed.filter(col("rank_in_team") <= 3)

df_player_top3.show(30)

+------------+-----------+-------+----------+---------+-----+--------------------+-------+-------+-------+--------------+------------------+----------------------+----------------------+------------+
|rank_assists|exp_assists|assists|rank_goals|exp_goals|goals|         player_name|minutes|matches|country|     team_name|goal_contributions|exp_goal_contributions|performance_difference|rank_in_team|
+------------+-----------+-------+----------+---------+-----+--------------------+-------+-------+-------+--------------+------------------+----------------------+----------------------+------------+
|          13|        4.8|    7.0|        18|      9.3| 11.0|     Ademola Lookman|   1899|     31|    NGA|      Atalanta|              18.0|                  14.1|             3.8999996|           1|
|          25|        4.1|    8.0|        26|      7.3| 10.0|Charles De Ketelaere|   2042|     35|    BEL|      Atalanta|              18.0|                  11.4|             6.6000004|           1|


In [8]:
# Final dataframe for visualization: keep only the reporting columns and
# shorten team_name / player_name to team / player in the same projection
df_final = df_player_top3.select(
    col("team_name").alias("team"),
    col("player_name").alias("player"),
    "goal_contributions",
    "exp_goal_contributions",
    "performance_difference",
    "rank_in_team",
)

df_final.show()

+----------+--------------------+------------------+----------------------+----------------------+------------+
|      team|              player|goal_contributions|exp_goal_contributions|performance_difference|rank_in_team|
+----------+--------------------+------------------+----------------------+----------------------+------------+
|  Atalanta|     Ademola Lookman|              18.0|                  14.1|             3.8999996|           1|
|  Atalanta|Charles De Ketelaere|              18.0|                  11.4|             6.6000004|           1|
|  Atalanta|   Gianluca Scamacca|              18.0|                   8.0|                  10.0|           1|
|  Atalanta|    Teun Koopmeiners|              17.0|                  12.0|                   5.0|           2|
|  Atalanta|       Mario Pasalic|              12.0|                  10.6|             1.3999996|           3|
|   Bologna|      Joshua Zirkzee|              15.0|                  11.5|                   3.5|      

### Saving data to BigQuery (Data Sink)

In [9]:
# The table data is first staged in Google Cloud Storage and then copied into
# BigQuery, so the connector needs a temporary bucket for the export files.
bucket = "temp_a2"  # use your bucket
spark.conf.set('temporaryGcsBucket', bucket)  # Do not change

# Write the final dataframe to BigQuery, overwriting the table's current contents.
# Per the original note, the table should be created if it does not exist yet
# (not verified here; try it once against a fresh dataset).
(
    df_final.write
    .format('bigquery')
    .option('table', 'data-engineering-435408.a2_dataset.player_data_offense')
    .mode("overwrite")
    .save()
)

In [10]:
# Shut down the Spark session created in the configuration cell now that the pipeline is done
spark.stop()