# Overview of the Code

This code performs a comprehensive data analysis process using PySpark. It combines multiple datasets to compute player performance metrics and generate insights. Below is a general description of the steps involved:


## Data loading

- Creates and configures the Spark environment.
- Configures Hadoop to support Google Cloud Storage (GCS) as a filesystem.
- Enables seamless reading of files stored in GCS.

- Reads datasets from a GCS bucket into Spark DataFrames.




In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Build the Spark configuration: cluster master, application name and the
# driver/executor resource settings.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Goals_pipeline")
    .setAll([
        ("spark.driver.memory", "2g"),   # memory for the driver
        ("spark.executor.cores", "1"),   # cores per executor
        ("spark.driver.cores", "1"),     # cores for the driver
    ])
)

# The session is the entry point for every DataFrame operation below.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the Google Cloud Storage connector classes on the Hadoop
# configuration so that gs:// paths can be used.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for key, impl in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(key, impl)

# Root of the bucket that holds the input CSV files.
base_gcs_path = "gs://data_bucket/"


def _read_gcs_csv(file_name):
    """Read one CSV from the bucket, treating row one as the header and inferring column types."""
    return spark.read.csv(
        f"{base_gcs_path}{file_name}",
        header=True,
        inferSchema=True,
    )


# Input datasets, one DataFrame per CSV file.
appearances = _read_gcs_csv("appearances.csv")
game_event = _read_gcs_csv("game_events.csv")
players = _read_gcs_csv("players.csv")
competitions = _read_gcs_csv("competitions.csv")


### Calculate all the correct values

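- The features that need to be computed are calculated: goals, total minutes played, and appearances per player.
- The columns needed from the player and competition datasets are selected and renamed.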



In [None]:
from pyspark.sql.functions import col, count, sum, lit, concat, round

# Goals per player: keep only the events whose 'type' is 'Goals', then count
# the matching rows for each player_id.
goals = (
    game_event
    .filter(col('type') == 'Goals')
    .groupBy("player_id")
    .agg(count("type").alias("total_goals"))
)

# Total playing time per player: sum of 'minutes_played' over all of the
# player's appearance rows.
total_minutes = (
    appearances
    .groupBy("player_id")
    .agg(sum("minutes_played").alias("total_minutes"))
)

# Appearances per player: number of appearance rows for each player_id.
# groupBy().count() names the resulting column "count", and the join cell
# further down reads it under that name, so the column is left as "count".
# NOTE: this used to end with .alias('total_appearances'), but DataFrame.alias
# only aliases the DataFrame itself -- it never renamed the column -- so the
# misleading no-op has been removed.
app = appearances.groupBy("player_id").count()

# Player attributes needed downstream, with two columns renamed for clarity:
#   name                                 -> player_name
#   current_club_domestic_competition_id -> competition_id
players_name = (
    players
    .select(
        "name",
        "player_id",
        "current_club_domestic_competition_id",
        "current_club_name",
        "position",
    )
    .withColumnRenamed("name", "player_name")
    .withColumnRenamed("current_club_domestic_competition_id", "competition_id")
)

# Add a display label per player in the form "player_name (player_id)".
player_name_info = players_name.withColumn(
    "player_info",
    concat(col("player_name"), lit(" ("), col("player_id"), lit(")")),
)

# Competition lookup: competition_id plus its name (renamed to competition_name).
competitions_name = (
    competitions
    .select("name", "competition_id")
    .withColumnRenamed("name", "competition_name")
)

# Preview the appearance counts (columns: player_id, count).
app.show(5)

# Preview the player details including the new player_info label.
player_name_info.show(5)


### Combine the dataframes and select the correct columns

- This section combines the individual dataframes created in the previous steps into a single comprehensive dataframe.

- A series of joins based on relevant columns such as `player_id` and `competition_id` are performed.

- New columns are calculated based on the combined data, such as goals per 90 minutes played and average minutes per game.

- The dataframe is filtered and ordered to focus on players with sufficient appearances and to highlight top performers.

In [None]:
# Attach the competition name to every player via competition_id.
player_with_competition = player_name_info.join(
    competitions_name, on="competition_id", how="inner"
)

# Bring the per-player aggregates together, one inner join on player_id at a
# time: goal totals, then appearance counts, then total minutes played.
goals_with_name = goals.join(player_with_competition, on="player_id", how="inner")
join_goals_app = goals_with_name.join(app, on="player_id", how="inner")
player_totals = join_goals_app.join(total_minutes, on="player_id", how="inner")

# Derived metrics, both rounded to 4 decimal places:
#   goals per 90 minutes     = (total_goals / total_minutes) * 90
#   average minutes per game = total_minutes / count   (count = appearance rows)
goals_per_90_expr = round((col("total_goals") / col("total_minutes")) * 90, 4)
avg_minutes_expr = round(col("total_minutes") / col("count"), 4)

# Final dataset: player details, performance metrics and competition name,
# restricted to players with at least 10 appearances and sorted so that the
# highest goals-per-90 values come first.
goals_per_90 = (
    player_totals
    .select(
        col("player_id"),
        col("player_name"),
        col("position"),
        col("total_goals"),
        col("total_minutes"),
        goals_per_90_expr.alias("goals_per_90"),
        col("current_club_name"),
        col("count").alias("appearances"),  # appearance count under a clearer name
        avg_minutes_expr.alias("average_minutes_per_game"),
        col("competition_name"),
        col("player_info"),
    )
    .filter(col("appearances") >= 10)
    .orderBy(col("goals_per_90").desc())
)

# Show the first 100 rows of the ranked result.
goals_per_90.show(100)


### Exporting to BigQuery

This cell exports the processed data to a BigQuery table.

In [None]:
# Cloud Storage bucket the BigQuery connector uses for its temporary export data.
bucket = "temp_de2024_bz"  # use your bucket
spark.conf.set('temporaryGcsBucket', bucket)

# Register the GCS connector classes for the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for key, impl in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(key, impl)

# Write the final DataFrame to BigQuery, replacing the table's existing contents.
bq_table = 'de2024-435509.labdataset.football_version2'
bq_writer = goals_per_90.write.format('bigquery').option('table', bq_table)
bq_writer.mode("overwrite").save()

In [None]:
# Stop the Spark session created at the start of the notebook now that the
# pipeline has finished.
spark.stop()