In [33]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, avg, count, when, year, to_date, lit
import pandas as pd
import matplotlib.pyplot as plt

In [34]:
#load the first dataset about the fifa players
df = pd.read_csv('data/raw/male_players.csv', low_memory=False)
#set the column gk (goalkeeper) as String
df['gk'] = df['gk'].astype(str)
#transform the dataset into parquet file 
df.to_parquet('data/processed/fifa_player.parquet', engine='pyarrow', compression='snappy')
print("Conversion complete.")

Conversion complete.


In [35]:
#load the second dataset about teams
df = pd.read_csv('data/raw/male_teams.csv', low_memory = False)
#transform the dataset into parquet file
df.to_parquet('data/processed/teams.parquet' , engine = 'pyarrow', compression = 'snappy')
print('conversion complete')

conversion complete


In [36]:
#Load dataset about coaches
df = pd.read_csv('data/raw/male_coaches.csv', low_memory = False)
#transform into parquet
df.to_parquet('data/processed/coaches.parquet' , engine = 'pyarrow', compression = 'snappy')
print('conversion complete')

conversion complete


In [37]:
#start the spark session
spark = SparkSession.builder \
    .appName("FIFA_Analysis") \
    .getOrCreate()

#read the file as parquet
df = spark.read.parquet("data/processed/fifa_player.parquet")
print(f"Player loaded: {df.count()}")

#transform the columns into integers
df = df.withColumn("fifa_version", col("fifa_version").cast("int"))
df = df.withColumn("value_eur", col("value_eur").cast("int"))


Totale giocatori caricati: 180021


In [54]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Check_Columns").getOrCreate()

df_players = spark.read.parquet("data/processed/fifa_player.parquet")
df_teams = spark.read.parquet("data/processed/teams.parquet")
df_coaches = spark.read.parquet("data/processed/coaches.parquet")

def find_keys(df, nome_file):
    cols = df.columns
    # Cerca colonne che contengono parole chiave per il merge
    chiavi = [c for c in cols if "id" in c or "team" in c or "coach" in c]
    print(f"--- Possible keys in {nome_file} ---")
    print(chiavi)
    print("-" * 30)

# 4. Esegui la ricerca
find_keys(df_players, "PLAYERS")
find_keys(df_teams, "TEAMS")
find_keys(df_coaches, "COACHES")

--- Possible keys in PLAYERS ---
['player_id', 'club_team_id', 'league_id', 'club_contract_valid_until_year', 'nationality_id', 'nation_team_id', 'defending_sliding_tackle']
------------------------------
--- Possible keys in TEAMS ---
['team_id', 'team_url', 'team_name', 'league_id', 'nationality_id', 'midfield', 'coach_id', 'rival_team', 'whole_team_average_age', 'def_team_width', 'def_team_depth', 'def_defence_width', 'off_team_width']
------------------------------
--- Possible keys in COACHES ---
['coach_id', 'coach_url', 'coach_face_url']
------------------------------


In [57]:
df_teams_full = df_teams.join(
    df_coaches, 
    on="coach_id", 
    how="left"
)

df_final = df_players.join(
    df_teams_full,
    df_players["club_team_id"] == df_teams_full["team_id"],
    how="left"
)

print(f"Total Players: {df_final.count()}")

Totale Giocatori: 1417641


In [68]:
df_joined = df_players.join(df_teams, df_players.club_team_id == df_teams.team_id, "inner") \
    .filter((col("value_eur") > 20000000) & (col("transfer_budget_eur") > 50000000)) \
    .select(
        col("short_name").alias("Giocatore"),
        col("value_eur").cast('long').alias("Valore"),
        col("club_name").alias("Squadra")
    )
df_final = df_joined.dropDuplicates(['Giocatore']) 

df_final.orderBy(desc("Valore")).show(20, truncate=False)

+----------------+---------+-------------------+
|Giocatore       |Valore   |Squadra            |
+----------------+---------+-------------------+
|E. Haaland      |185000000|Manchester City    |
|K. Mbappé       |181500000|Paris Saint Germain|
|Vini Jr.        |158500000|Real Madrid        |
|J. Musiala      |134500000|FC Bayern München  |
|F. Valverde     |130500000|Real Madrid        |
|V. Osimhen      |126500000|Napoli             |
|H. Kane         |119500000|FC Bayern München  |
|K. Kvaratskhelia|109000000|Napoli             |
|M. Ødegaard     |109000000|Arsenal            |
|L. Martínez     |107000000|Inter              |
|Rúben Dias      |106500000|Manchester City    |
|Rodri           |105500000|Manchester City    |
|Pedri           |105000000|FC Barcelona       |
|F. de Jong      |103500000|FC Barcelona       |
|K. De Bruyne    |103000000|Manchester City    |
|Rafael Leão     |102500000|Milan              |
|J. Bellingham   |100500000|Real Madrid        |
|Neymar Jr       |99

In [74]:
df_players = df_players.withColumnRenamed("short_name", "player_name")
df_coaches = df_coaches.withColumnRenamed("short_name", "coach_name")

In [75]:
df_complex = df_players.join(df_teams, df_players.club_team_id == df_teams.team_id, "inner") \
    .join(df_coaches, df_teams.coach_id == df_coaches.coach_id, "inner") \
    .filter(
        (col("age") < 25) &                       
        (col("value_eur") > 40000000) &           
        (col("transfer_budget_eur") > 0)          
    ) \
    .select(
        col("player_name").alias("Giocatore"),
        col("age").alias("Età"),
        col("club_name").alias("Squadra"),
        col("coach_name").alias("Allenatore"), 
        
        col("value_eur").cast("long").alias("Valore"),
        col("transfer_budget_eur").cast("long").alias("Budget_Club"),
    )
df_final = df_complex.dropDuplicates(['Giocatore'])

print("=== YOUNG TALENTS ===")
df_final.orderBy(desc("Valore")).show(20, truncate=False)

=== GIOVANI TALENTI E I LORO ALLENATORI ===
+----------------+---+-------------------+-------------------+---------+-----------+
|Giocatore       |Età|Squadra            |Allenatore         |Valore   |Budget_Club|
+----------------+---+-------------------+-------------------+---------+-----------+
|E. Haaland      |22 |Manchester City    |J. Guardiola i Sala|185000000|80000000   |
|K. Mbappé       |24 |Paris Saint Germain|C. Galtier         |181500000|120000000  |
|Vini Jr.        |22 |Real Madrid        |C. Ancelotti       |158500000|95000000   |
|J. Musiala      |20 |FC Bayern München  |J. Nagelsmann      |134500000|55000000   |
|F. Valverde     |24 |Real Madrid        |C. Ancelotti       |130500000|95000000   |
|V. Osimhen      |24 |Napoli             |L. Spalletti       |126500000|26000000   |
|F. de Jong      |24 |FC Barcelona       |X. Hernández       |119500000|80000000   |
|K. Kvaratskhelia|22 |Napoli             |L. Spalletti       |109000000|26000000   |
|M. Ødegaard     |24 