 ## Transformation des données

Nettoyage : suppression ou traitement des valeurs manquantes et incohérentes.
Standardisation : harmonisation des unités, formats de date et noms de colonnes pour garantir la cohérence et la qualité des données.

In [2]:
import os
import pandas as pd

# Build the gold "unique matches" table: one row per fixture, gathered from
# every team's cleaned match file in the silver layer.
SILVER_PATH = "../../data/silver"

# Every sub-directory of the silver layer is treated as one team.
teams = [d for d in os.listdir(SILVER_PATH) if os.path.isdir(os.path.join(SILVER_PATH, d))]

# Rows are accumulated in a plain list and converted to a DataFrame once at the
# end; growing a DataFrame with pd.concat inside the loop copies the whole
# table on every iteration.
match_records = []
# Keys of the fixtures already recorded. The two teams are stored as a
# frozenset so that "A vs B" and "B vs A" on the same date share one key.
seen_matches = set()

for team_dir in teams:
    team_name = team_dir.replace('_', ' ')
    file_path = os.path.join(SILVER_PATH, team_dir, f"{team_dir}_matches_clean.csv")

    # Skip team folders that do not contain a cleaned match file.
    if not os.path.exists(file_path):
        continue

    df = pd.read_csv(file_path)
    df['team'] = team_name

    for _, row in df.iterrows():
        eq1 = row['team']
        eq2 = row['Opponent']
        date = row['Date']

        # A missing opponent or date never compares equal to anything, so such
        # rows are always kept and never registered as "seen".
        if not (pd.isna(eq2) or pd.isna(date)):
            match_key = (frozenset((eq1, eq2)), date)
            if match_key in seen_matches:
                # Same fixture already recorded (possibly from the other team's file).
                continue
            seen_matches.add(match_key)

        match_records.append({
            'eq1': eq1,
            'eq2': eq2,
            'date_match': date,
            'heure': row["Time"],
            'round': row["Round"],
            'jour': row["Day"],
            'arbitre': row["Referee"],
            # Competition and season ids are hard-coded to 1 for every match.
            'idcompetition': 1,
            'idSaison': 1
        })

# Passing the column list keeps the expected layout even when no match was found.
df_final_match = pd.DataFrame(match_records, columns=[
    'eq1', 'eq2', 'date_match', 'heure', 'round', 'jour', 'arbitre', 'idcompetition', 'idSaison'
])

print(df_final_match.head())
print("Nombre total de matchs uniques :", len(df_final_match))


# Save the result in the gold layer.
GOLD_PATH = "../../data/gold"
os.makedirs(GOLD_PATH, exist_ok=True)
file_path = os.path.join(GOLD_PATH, "matches_unique.csv")
df_final_match.to_csv(file_path, index=False)
print(f"✅ DataFrame enregistrée dans {file_path}")


       eq1          eq2  date_match          heure         round jour  \
0  Arsenal       Wolves  2024-08-17          15:00   Matchweek 1  Sat   
1  Arsenal  Aston Villa  2024-08-24          17:30   Matchweek 2  Sat   
2  Arsenal     Brighton  2024-08-31          12:30   Matchweek 3  Sat   
3  Arsenal    Tottenham  2024-09-15          14:00   Matchweek 4  Sun   
4  Arsenal  it Atalanta  2024-09-19  21:00 (20:00)  League phase  Thu   

          arbitre idcompetition idSaison  
0  Jarred Gillett             1        1  
1  Michael Oliver             1        1  
2  Chris Kavanagh             1        1  
3  Jarred Gillett             1        1  
4  Clément Turpin             1        1  
Nombre total de matchs uniques : 559
✅ DataFrame enregistrée dans ../../data/gold\matches_unique.csv


In [1]:
import os
import pandas as pd

# Build the gold "match results" table: one row per team and per match, taken
# from every team's cleaned match file in the silver layer.
SILVER_PATH = "../../data/silver"

# Every sub-directory of the silver layer is treated as one team.
teams = [d for d in os.listdir(SILVER_PATH) if os.path.isdir(os.path.join(SILVER_PATH, d))]

# Rows are accumulated in a plain list and converted to a DataFrame once at the
# end; calling pd.concat inside the loop copies the whole table on every
# iteration and concatenates onto an initially empty frame.
result_records = []

for team_dir in teams:
    team_name = team_dir.replace('_', ' ')
    file_path = os.path.join(SILVER_PATH, team_dir, f"{team_dir}_matches_clean.csv")

    # Skip team folders that do not contain a cleaned match file.
    if not os.path.exists(file_path):
        continue

    df = pd.read_csv(file_path)
    df['team'] = team_name

    for _, row in df.iterrows():
        eq1 = row['team']
        eq2 = row['Opponent']
        date = str(row['Date'])

        # NOTE(review): the identifier is written from this team's point of
        # view, so the two rows describing one fixture get different ids
        # ("A_vs_B" and "B_vs_A") - confirm this is what downstream joins expect.
        match_id = f"{date}_{eq1}_vs_{eq2}"

        result_records.append({
            'match': match_id,
            'equipe': eq1,
            'venue': row['Venue'],
            'butsmarques': row['GF'],
            'butsconcedes': row['GA'],
            'resultat': row['Result'],
            'XG': row['xG'],
            'XGA': row['xGA'],
            'possession': row['Poss'],
            'capitaine': row['Captain'],
            'formation': row['Formation']
        })

# Passing the column list keeps the expected layout even when no row was found.
df_final_result_match = pd.DataFrame(result_records, columns=[
    'match', 'equipe', 'venue', 'butsmarques', 'butsconcedes',
    'resultat', 'XG', 'XGA', 'possession', 'capitaine', 'formation'
])

print(df_final_result_match.head())
print("Nombre total de matchs :", len(df_final_result_match))

# Save the result in the gold layer.
GOLD_PATH = "../../data/gold"
os.makedirs(GOLD_PATH, exist_ok=True)

file_path = os.path.join(GOLD_PATH, "matches_resultat_unique.csv")
# utf-8-sig writes a BOM at the start of the file.
df_final_result_match.to_csv(file_path, index=False, encoding='utf-8-sig')
print(f"✅ DataFrame enregistrée dans {file_path}")


  df_final_result_match = pd.concat([df_final_result_match, pd.DataFrame([{


                               match   equipe venue  butsmarques  \
0       2024-08-17_Arsenal_vs_Wolves  Arsenal  Home          2.0   
1  2024-08-24_Arsenal_vs_Aston Villa  Arsenal  Away          2.0   
2     2024-08-31_Arsenal_vs_Brighton  Arsenal  Home          1.0   
3    2024-09-15_Arsenal_vs_Tottenham  Arsenal  Away          1.0   
4  2024-09-19_Arsenal_vs_it Atalanta  Arsenal  Away          0.0   

   butsconcedes resultat   XG  XGA  possession        capitaine formation  
0           0.0        W  1.2  0.5        53.0  Martin Ødegaard     4-3-3  
1           0.0        W  0.9  1.2        60.0  Martin Ødegaard     4-3-3  
2           1.0        D  2.1  1.7        36.0  Martin Ødegaard     4-3-3  
3           0.0        W  0.7  0.7        37.0         Jorginho     4-4-2  
4           0.0        D  0.8  1.2        46.0    Gabriel Jesus     4-3-3  
Nombre total de matchs : 975
✅ DataFrame enregistrée dans ../../data/gold\matches_resultat_unique.csv


In [2]:
import os
import pandas as pd

# Build the gold "match results" table (one row per team and per match) with
# the result code translated to a French label.
SILVER_PATH = "../../data/silver"

# Translation of the result codes found in the 'Result' column.
result_mapping = {
    'W': 'Victoire',
    'L': 'Défaite',
    'D': 'Nul'
}

# Every sub-directory of the silver layer is treated as one team.
teams = [d for d in os.listdir(SILVER_PATH) if os.path.isdir(os.path.join(SILVER_PATH, d))]

# Rows are accumulated in a plain list and converted to a DataFrame once at the
# end; calling pd.concat inside the loop copies the whole table on every
# iteration and concatenates onto an initially empty frame.
result_records = []

for team_dir in teams:
    team_name = team_dir.replace('_', ' ')
    file_path = os.path.join(SILVER_PATH, team_dir, f"{team_dir}_matches_clean.csv")

    # Skip team folders that do not contain a cleaned match file.
    if not os.path.exists(file_path):
        continue

    df = pd.read_csv(file_path)
    df['team'] = team_name

    for _, row in df.iterrows():
        eq1 = row['team']
        eq2 = row['Opponent']
        date = str(row['Date'])
        # NOTE(review): the identifier is written from this team's point of
        # view, so the two rows describing one fixture get different ids
        # ("A_vs_B" and "B_vs_A") - confirm this is what downstream joins expect.
        match_id = f"{date}_{eq1}_vs_{eq2}"

        # Codes outside the mapping (including a missing value, which becomes
        # the string "nan") fall back to 'Inconnu'.
        result_code = str(row['Result']).strip()
        resultat_fr = result_mapping.get(result_code, 'Inconnu')

        result_records.append({
            'match': match_id,
            'equipe': eq1,
            'venue': row['Venue'],
            'butsmarques': row['GF'],
            'butsconcedes': row['GA'],
            'resultat': resultat_fr,
            'XG': row['xG'],
            'XGA': row['xGA'],
            'possession': row['Poss'],
            'capitaine': row['Captain'],
            'formation': row['Formation']
        })

# Passing the column list keeps the expected layout even when no row was found.
df_final_result_match = pd.DataFrame(result_records, columns=[
    'match', 'equipe', 'venue', 'butsmarques', 'butsconcedes',
    'resultat', 'XG', 'XGA', 'possession', 'capitaine', 'formation'
])

print(df_final_result_match.head())
print("Nombre total de matchs :", len(df_final_result_match))

# Save the result in the gold layer.
GOLD_PATH = "../../data/gold"
os.makedirs(GOLD_PATH, exist_ok=True)

file_path = os.path.join(GOLD_PATH, "matches_resultat_unique.csv")
# utf-8-sig writes a BOM at the start of the file.
df_final_result_match.to_csv(file_path, index=False, encoding='utf-8-sig')
print(f"✅ DataFrame enregistrée dans {file_path}")


  df_final_result_match = pd.concat([df_final_result_match, pd.DataFrame([{


                               match   equipe venue  butsmarques  \
0       2024-08-17_Arsenal_vs_Wolves  Arsenal  Home          2.0   
1  2024-08-24_Arsenal_vs_Aston Villa  Arsenal  Away          2.0   
2     2024-08-31_Arsenal_vs_Brighton  Arsenal  Home          1.0   
3    2024-09-15_Arsenal_vs_Tottenham  Arsenal  Away          1.0   
4  2024-09-19_Arsenal_vs_it Atalanta  Arsenal  Away          0.0   

   butsconcedes  resultat   XG  XGA  possession        capitaine formation  
0           0.0  Victoire  1.2  0.5        53.0  Martin Ødegaard     4-3-3  
1           0.0  Victoire  0.9  1.2        60.0  Martin Ødegaard     4-3-3  
2           1.0       Nul  2.1  1.7        36.0  Martin Ødegaard     4-3-3  
3           0.0  Victoire  0.7  0.7        37.0         Jorginho     4-4-2  
4           0.0       Nul  0.8  1.2        46.0    Gabriel Jesus     4-3-3  
Nombre total de matchs : 975
✅ DataFrame enregistrée dans ../../data/gold\matches_resultat_unique.csv
