In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import col, sum, row_number
from pyspark.sql.window import Window

# Spark Configuration
sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Top_Attacking_Teams_Pipeline")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# Create the Spark session
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Setup Hadoop configuration for GCS
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Load tables into DataFrames
playersDF = spark.read.format("csv").option("header", "true") \
    .load("gs://data_assignment2/players_table.csv")
teamsDF = spark.read.format("csv").option("header", "true") \
    .load("gs://data_assignment2/teams_table.csv")
performanceDF = spark.read.format("csv").option("header", "true") \
    .load("gs://data_assignment2/player_performance_table.csv")

# Display initial data
playersDF.show(5)
teamsDF.show(5)
performanceDF.show(5)

+-----------------+-------+----+-----+--------------+
|           Player| Nation| Age|  Pos|         Squad|
+-----------------+-------+----+-----+--------------+
|       Max Aarons|eng ENG|23.0|   DF|   Bournemouth|
| Brenden Aaronson| us USA|22.0|MF,FW|  Union Berlin|
|  Paxten Aaronson| us USA|19.0|   MF|Eint Frankfurt|
|Keyliane Abdallah| fr FRA|17.0|   FW|     Marseille|
| Yunis Abdelhamid| ma MAR|35.0|   DF|         Reims|
+-----------------+-------+----+-----+--------------+
only showing top 5 rows

+--------------+------------------+
|         Squad|              Comp|
+--------------+------------------+
|   Bournemouth|eng Premier League|
|  Union Berlin|     de Bundesliga|
|Eint Frankfurt|     de Bundesliga|
|     Marseille|        fr Ligue 1|
|         Reims|        fr Ligue 1|
+--------------+------------------+
only showing top 5 rows

+-----------------+--------------+---+------+----+---+---+---+---+---+
|           Player|         Squad| MP|Starts| Min|Gls|Ast|G+A| xG|xAG

In [2]:
# Filter and cast relevant columns for analysis
performanceDF = performanceDF.select("Player", "Squad", "Gls").filter(col("Gls").isNotNull())
performanceDF = performanceDF.withColumn("Gls", col("Gls").cast("int"))

In [8]:
# Join performanceDF with teamsDF to include competition (Comp)
teamsDF = teamsDF.select("Squad", "Comp")  # Keep only relevant columns
performanceWithCompDF = performanceDF.join(teamsDF, on="Squad", how="inner")

In [13]:
# Aggregate total goals by Squad and Comp
teamStatsDF = performanceWithCompDF.groupBy("Squad", "Comp").agg(
    sum("Gls").alias("Total_Gls")
)

In [14]:
teamStatsDF.head(10)

[Row(Squad='Reims', Comp='fr Ligue 1', Total_Gls=40),
 Row(Squad='Milan', Comp='it Serie A', Total_Gls=76),
 Row(Squad='Marseille', Comp='fr Ligue 1', Total_Gls=50),
 Row(Squad='Sevilla', Comp='es La Liga', Total_Gls=46),
 Row(Squad='Real Sociedad', Comp='es La Liga', Total_Gls=48),
 Row(Squad='Atalanta', Comp='it Serie A', Total_Gls=72),
 Row(Squad='Köln', Comp='de Bundesliga', Total_Gls=28),
 Row(Squad='Strasbourg', Comp='fr Ligue 1', Total_Gls=36),
 Row(Squad='Udinese', Comp='it Serie A', Total_Gls=36),
 Row(Squad='Cagliari', Comp='it Serie A', Total_Gls=41)]

In [19]:
# Rank teams globally by Total_Gls
teamRankingWindow = Window.orderBy(col("Total_Gls").desc())
rankedTeamsDF = teamStatsDF.withColumn("Rank", row_number().over(teamRankingWindow))

In [25]:
rankedTeamsDF.show(20)


+---------------+------------------+---------+----+
|          Squad|              Comp|Total_Gls|Rank|
+---------------+------------------+---------+----+
|Manchester City|eng Premier League|       94|   1|
|  Bayern Munich|     de Bundesliga|       93|   2|
|          Inter|        it Serie A|       87|   3|
|     Leverkusen|     de Bundesliga|       87|   4|
|        Arsenal|eng Premier League|       86|   5|
|    Real Madrid|        es La Liga|       85|   6|
|         Girona|        es La Liga|       84|   7|
|  Newcastle Utd|eng Premier League|       83|   8|
|      Liverpool|eng Premier League|       80|   9|
|      Stuttgart|     de Bundesliga|       78|  10|
|      Paris S-G|        fr Ligue 1|       78|  11|
|          Milan|        it Serie A|       76|  12|
|        Chelsea|eng Premier League|       76|  13|
|      Barcelona|        es La Liga|       76|  14|
|     RB Leipzig|     de Bundesliga|       74|  15|
|       Atalanta|        it Serie A|       72|  16|
|    Aston V

In [27]:
bucket = "temp_assignment2"  # Replace with your GCS bucket name
spark.conf.set('temporaryGcsBucket', bucket)

rankedTeamsDF.write.csv("gs://temp_assignment2/best_20_teams.csv", header=True)
# Save to BigQuery
rankedTeamsDF.write.format('bigquery') \
  .option('table', 'de-assignment2-group10.Pipeline_team_stats.top_attacking_teams') \
  .mode("overwrite") \
  .save()

In [28]:
# Stop the spark context
spark.stop()