In [6]:
import pandas as pd


df = pd.read_csv('male_players.csv', low_memory=False)
df['gk'] = df['gk'].astype(str)

df.to_parquet('fifa_player.parquet', engine='pyarrow', compression='snappy')

print("Conversion complete.")


Conversion complete.


In [7]:
df = pd.read_csv('male_teams.csv', low_memory = False)
df.to_parquet('teams.parquet' , engine = 'pyarrow', compression = 'snappy')
print('conversion complete')

conversion complete


In [8]:
df = pd.read_csv('male_coaches.csv', low_memory = False)
df.to_parquet('coaches.parquet' , engine = 'pyarrow', compression = 'snappy')
print('conversion complete')

conversion complete


In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, avg, count

# --- 1. SETUP: Inizializza Spark ---
spark = SparkSession.builder \
    .appName("FIFA_Analysis") \
    .getOrCreate()

# --- 2. CARICAMENTO: Leggi il file Parquet ---
# Assicurati che il file sia nella cartella montata
df = spark.read.parquet("fifa_player.parquet")

print(f"Totale giocatori caricati: {df.count()}")
df = df.withColumn("fifa_version", col("fifa_version").cast("int"))
df = df.withColumn("value_eur", col("value_eur").cast("int"))


Totale giocatori caricati: 180021


In [11]:
# --- QUERY 1: Chi sono i 5 giocatori più pagati? ---
print(">>> TOP 5 GIOCATORI PER STIPENDIO (Wage):")
df.select("short_name", "age", "club_name", "wage_eur", 'fifa_version') \
  .orderBy(desc("wage_eur")) \
  .show(5)

# --- QUERY 2: Giovani Promesse (Under 21 con valore alto) ---
print(">>> GIOVANI PROMESSE (Under 21, ordinati per Valore di mercato):")
df.filter(col("age") < 21) \
  .select("short_name", "age", "club_name", "value_eur", 'fifa_version') \
  .orderBy(desc("value_eur")) \
  .show(5)

# --- QUERY 3: Statistiche per Club (Stipendio Medio) ---
print(">>> CLUB PIÙ RICCHI (Stipendio medio, min. 10 giocatori):")
df.groupBy("club_name") \
  .agg(
      avg("wage_eur").alias("avg_wage"),
      count("*").alias("num_players")
  ) \
  .filter(col("num_players") >= 10) \
  .orderBy(desc("avg_wage")) \
  .show(5)

# --- QUERY 4: SQL Puro (Per chi preferisce SQL) ---
print(">>> ANALISI CON SQL (Giocatori Italiani):")
df.createOrReplaceTempView("fifa_players")

# Esempio: Seleziona giocatori italiani
spark.sql("""
    SELECT short_name, club_name, overall, fifa_version
    FROM fifa_players 
    WHERE nationality_name = 'Italy' 
    ORDER BY overall DESC 
    LIMIT 5
""").show()

>>> TOP 5 GIOCATORI PER STIPENDIO (Wage):
+-----------------+---+------------+--------+------------+
|       short_name|age|   club_name|wage_eur|fifa_version|
+-----------------+---+------------+--------+------------+
|         L. Messi| 31|FC Barcelona|575000.0|          19|
|Cristiano Ronaldo| 32| Real Madrid|575000.0|          18|
|         L. Messi| 30|FC Barcelona|575000.0|          18|
|Cristiano Ronaldo| 31| Real Madrid|575000.0|          17|
|         L. Messi| 29|FC Barcelona|575000.0|          17|
+-----------------+---+------------+--------+------------+
only showing top 5 rows

>>> GIOVANI PROMESSE (Under 21, ordinati per Valore di mercato):
+----------+---+-----------------+---------+------------+
|short_name|age|        club_name|value_eur|fifa_version|
+----------+---+-----------------+---------+------------+
|E. Haaland| 20|Borussia Dortmund|  1.375E8|          22|
|J. Musiala| 20|FC Bayern München|  1.345E8|          24|
| J. Sancho| 20|Borussia Dortmund|   1.24E8|   

In [12]:
# Selezioniamo solo giocatori giovani (< 23 anni)
df_growth = df.filter(col("age") < 23) \
    .withColumn("growth_margin", col("potential") - col("overall")) \
    .select("short_name", "age", "club_name", "overall", "potential", "growth_margin") \
    .filter(col("growth_margin") > 10) \
    .orderBy(desc("growth_margin"))

print(">>> Giocatori con il margine di miglioramento più alto:")
df_growth.show(5)

>>> Giocatori con il margine di miglioramento più alto:
+------------+---+-----------------+-------+---------+-------------+
|  short_name|age|        club_name|overall|potential|growth_margin|
+------------+---+-----------------+-------+---------+-------------+
|   C. Burton| 17|  Shrewsbury Town|     50|       78|           28|
| L. Gunnerød| 16|           Stabæk|     50|       76|           26|
|B. Arrey-Mbi| 17|Bayern München II|     60|       86|           26|
|   D. Lobban| 16|         Aberdeen|     53|       79|           26|
|   L. Miller| 16|       Motherwell|     59|       85|           26|
+------------+---+-----------------+-------+---------+-------------+
only showing top 5 rows



In [17]:
# --- IMPORT FONDAMENTALI ---
# Devi dire a Python di usare la 'sum' di Spark, non quella di base
from pyspark.sql.functions import sum, avg, count, col, desc

# Ora il codice funzionerà perché 'sum' è la funzione di Spark
df_finance = df.groupBy("club_name") \
    .agg(
        sum("value_eur").alias("total_squad_value"),
        avg("wage_eur").alias("avg_wage"),
        count("*").alias("player_count")
    ) \
    .filter(col("player_count") > 20) \
    .orderBy(desc("total_squad_value"))

print(">>> I 5 Club con la rosa più costosa:")
df_finance.show(5)

>>> I 5 Club con la rosa più costosa:
+-------------------+-----------------+------------------+------------+
|          club_name|total_squad_value|          avg_wage|player_count|
+-------------------+-----------------+------------------+------------+
|        Real Madrid|       8771315000|       137839.0625|         320|
|    Manchester City|       8288705000|109017.50788643533|         317|
|       FC Barcelona|       8161625000|134633.06188925082|         307|
|  FC Bayern München|       7671955000| 94787.87878787878|         264|
|Paris Saint Germain|       7380115000| 80267.56756756757|         296|
+-------------------+-----------------+------------------+------------+
only showing top 5 rows



In [18]:
df_veterans = df.filter(
    (col("overall") >= 80) & 
    (col("age") > 33) & 
    (col("value_eur") < 5000000)
).select("short_name", "age", "club_name", "overall", "value_eur")

print(">>> Veterani forti a basso costo:")
df_veterans.orderBy(desc("overall")).show(5)

>>> Veterani forti a basso costo:
+-------------+---+-------------------+-------+---------+
|   short_name|age|          club_name|overall|value_eur|
+-------------+---+-------------------+-------+---------+
|    G. Buffon| 39|           Juventus|     89|  4500000|
|    G. Buffon| 40|Paris Saint Germain|     88|  4000000|
|S. Handanovič| 37|              Inter|     84|  4700000|
|  A. Barzagli| 37|           Juventus|     84|  2100000|
|     J. Terry| 35|            Chelsea|     84|  4900000|
+-------------+---+-------------------+-------+---------+
only showing top 5 rows



In [19]:
df_nations = df.filter(col("overall") > 75) \
    .groupBy("nationality_name") \
    .count() \
    .withColumnRenamed("count", "top_players_count") \
    .orderBy(desc("top_players_count"))

print(">>> Nazioni con più giocatori sopra il 75 di overall:")
df_nations.show(5)

>>> Nazioni con più giocatori sopra il 75 di overall:
+----------------+-----------------+
|nationality_name|top_players_count|
+----------------+-----------------+
|           Spain|             1824|
|          Brazil|             1383|
|          France|             1222|
|       Argentina|              978|
|         Germany|              923|
+----------------+-----------------+
only showing top 5 rows



In [20]:
from pyspark.sql.functions import col, desc, avg, count, when, year, to_date, lit

In [42]:
print('>>>I COLOSSI:')
df_giants_unique = df.filter((col("height_cm") > 190) & (col("weight_kg") > 90)) \
    .select("short_name", "height_cm", "weight_kg", "club_name", "player_positions",'age','fifa_version') \
    .dropDuplicates(["short_name"]) \
    .orderBy(desc("height_cm"))

df_giants_unique.show(5)

>>>I COLOSSI:
+-----------+---------+---------+---------------+----------------+---+------------+
| short_name|height_cm|weight_kg|      club_name|player_positions|age|fifa_version|
+-----------+---------+---------+---------------+----------------+---+------------+
|K. Van Hout|      207|      110|       Westerlo|              GK| 29|          17|
|K. Gadellaa|      206|       92|     FC Utrecht|              GK| 20|          24|
|   I. Touré|      206|       98|        Lorient|              CB| 20|          24|
|    T. Holý|      206|      102|Carlisle United|              GK| 31|          24|
|        Idé|      204|       96|       Boavista|              ST| 22|          17|
+-----------+---------+---------+---------------+----------------+---+------------+
only showing top 5 rows



In [28]:
print(">>> CONFRONTO PIEDE (Mancini vs Destri):")
# Nota: controlla se la colonna si chiama 'preferred_foot' o 'preferred_foot_name'
df_foot = df.groupBy("preferred_foot") \
    .agg(
        avg("overall").alias("media_voto"),
        count("*").alias("numero_giocatori")
    ) \
    .orderBy(desc("media_voto"))

df_foot.show()

>>> CONFRONTO PIEDE (Mancini vs Destri):
+--------------+-----------------+----------------+
|preferred_foot|       media_voto|numero_giocatori|
+--------------+-----------------+----------------+
|          Left|66.24843606147164|           42361|
|         Right|65.54785703908179|          137660|
+--------------+-----------------+----------------+



In [29]:
print(">>> CAMPIONATI PIÙ GIOVANI (Età media per Lega):")
df_leagues = df.groupBy("league_name") \
    .agg(
        avg("age").alias("avg_age"),
        avg("overall").alias("avg_quality"),
        count("*").alias("total_players")
    ) \
    .filter(col("total_players") > 50) \
    .orderBy("avg_age")  # Dal più giovane al più vecchio

df_leagues.show(5)

>>> CAMPIONATI PIÙ GIOVANI (Età media per Lega):
+------------------+------------------+-----------------+-------------+
|       league_name|           avg_age|      avg_quality|total_players|
+------------------+------------------+-----------------+-------------+
|        Eredivisie|23.347501555048726|66.80053908355795|         4823|
|  Premier Division|23.953707188558525|56.33646970267219|         2657|
|         Superliga|24.051274178692047|62.61068467915259|         3257|
|Jupiler Pro League| 24.13677130044843|            66.95|         4460|
|           Ligue 1|24.157822327044027|69.57861635220125|         5088|
+------------------+------------------+-----------------+-------------+
only showing top 5 rows



In [34]:
print(">>> OCCASIONI DI MERCATO (In scadenza a breve):")
# Filtriamo contratti che scadono, ad esempio, nel 2024 o 2025
df_expiry = df.filter(
        (col("overall") > 75) & 
        (col("club_contract_valid_until_year").isin([2024, 2025]))
    ) \
    .select("short_name", "club_name", "overall", "value_eur", "club_contract_valid_until_year") \
    .orderBy(desc("overall"))

df_expiry.show(5)

>>> OCCASIONI DI MERCATO (In scadenza a breve):
+----------+--------------------+-------+---------+------------------------------+
|short_name|           club_name|overall|value_eur|club_contract_valid_until_year|
+----------+--------------------+-------+---------+------------------------------+
| X. Simons|          RB Leipzig|     79| 38500000|                        2024.0|
|  M. Diaby| Bayer 04 Leverkusen|     84| 61500000|                        2025.0|
| T. Partey|             Arsenal|     84| 36000000|                        2024.0|
|R. Jiménez|Wolverhampton Wan...|     82| 25500000|                        2024.0|
| H. Barnes|      Leicester City|     80| 29000000|                        2025.0|
+----------+--------------------+-------+---------+------------------------------+
only showing top 5 rows



In [35]:
print(">>> DIFENSORI CENTRALI (Anche come secondo ruolo):")
df_defenders = df.filter(col("player_positions").contains("CB")) \
    .select("short_name", "age", "player_positions", "defending", "physic") \
    .orderBy(desc("defending"))

df_defenders.show(5)

>>> DIFENSORI CENTRALI (Anche come secondo ruolo):
+------------+---+----------------+---------+------+
|  short_name|age|player_positions|defending|physic|
+------------+---+----------------+---------+------+
| V. van Dijk| 29|              CB|     91.0|  84.0|
| V. van Dijk| 30|              CB|     91.0|  86.0|
| V. van Dijk| 28|              CB|     91.0|  86.0|
|Sergio Ramos| 32|              CB|     91.0|  84.0|
|G. Chiellini| 33|              CB|     91.0|  82.0|
+------------+---+----------------+---------+------+
only showing top 5 rows



In [36]:
print(">>> CLASSIFICAZIONE PER ALTEZZA:")
df_height_cat = df.withColumn("fascia_altezza", 
      when(col("height_cm") < 170, "Basso")
      .when((col("height_cm") >= 170) & (col("height_cm") < 185), "Medio")
      .otherwise("Alto")
    )

df_height_cat.groupBy("fascia_altezza").count().show()

>>> CLASSIFICAZIONE PER ALTEZZA:
+--------------+------+
|fascia_altezza| count|
+--------------+------+
|         Basso|  6639|
|         Medio|112698|
|          Alto| 60684|
+--------------+------+



In [60]:
df.filter(col("short_name").contains("van Dijk")) \
  .select("short_name", "long_name", "height_cm", "club_name",'fifa_version','age','overall') \
  .orderBy(desc('age')) \
  .show()


+-----------+---------------+---------+-----------+------------+---+-------+
| short_name|      long_name|height_cm|  club_name|fifa_version|age|overall|
+-----------+---------------+---------+-----------+------------+---+-------+
|V. van Dijk|Virgil van Dijk|      193|  Liverpool|          24| 31|     89|
|V. van Dijk|Virgil van Dijk|      193|  Liverpool|          23| 30|     90|
|V. van Dijk|Virgil van Dijk|      193|  Liverpool|          22| 29|     89|
|V. van Dijk|Virgil van Dijk|      193|  Liverpool|          21| 28|     90|
|V. van Dijk|Virgil van Dijk|      193|  Liverpool|          20| 27|     90|
|V. van Dijk|Virgil van Dijk|      193|  Liverpool|          19| 26|     85|
|V. van Dijk|Virgil van Dijk|      193|Southampton|          18| 25|     83|
|V. van Dijk|Virgil van Dijk|      193|Southampton|          17| 24|     79|
|V. van Dijk|Virgil van Dijk|      193|     Celtic|          15| 22|     75|
|L. van Dijk|   Lex van Dijk|      182|  Willem II|          18| 21|     60|