In [None]:
# import libraries
import pandas as pd

# load data
# NOTE(review): profiles are read from the 19.01.2026 export while predictions and
# fixtures are read from 10.02.2026 — confirm this is intentional. Because the merges
# below are left joins, a prediction whose user or fixture is absent from the other
# file is kept with NaN in the joined columns.
profiles = pd.read_csv('../csv/19.01.2026/profiles.csv')
predictions = pd.read_csv('../csv/10.02.2026/predictions_rows.csv')
fixtures = pd.read_csv('../csv/10.02.2026/fixtures_rows(1).csv')

# values the filters below act on, named so they are easy to find and extend
inactive_usernames = ["Martinez"]
non_premier_league_game_week_ids = [
    "7d6130ac-980a-4352-8ec0-29a06643525e",
    "ef99d1e9-75a3-4c2a-9ab3-48604f86b4fa",
]
cols_to_convert = ["home_score", "away_score", "home_prediction", "away_prediction"]

# Merge predictions with profiles on user_id
merged = predictions.merge(
    profiles,
    left_on='user_id',
    right_on='id',
    how='left',
    suffixes=('_pred', '_profile'),
)

# Merge with fixtures on fixture_id
merged = merged.merge(
    fixtures,
    left_on='fixture_id',
    right_on='id',
    how='left',
    suffixes=('', '_fixture'),
)

# keep only the columns used by the rest of the notebook
df_shape = merged[[
    "username",
    "club",
    "home_team",
    "away_team",
    "home_prediction",
    "away_prediction",
    "home_score",
    "away_score",
    "game_week_id",
]]

# remove inactive players
# (boolean mask instead of drop-by-index: it selects rows positionally, so it does
# not depend on the index labels being unique)
df_players = df_shape[~df_shape["username"].isin(inactive_usernames)]

# remove non premier league weeks
df_game_weeks = df_players[~df_players["game_week_id"].isin(non_premier_league_game_week_ids)]

# convert all numbers to integers
# A plain int column cannot hold NaN, so report exactly which columns have missing
# values instead of letting the cast fail with a generic error.
missing_counts = df_game_weeks[cols_to_convert].isna().sum()
if missing_counts.any():
    raise ValueError(
        "Cannot convert to int; rows with missing values per column:\n"
        f"{missing_counts[missing_counts > 0]}"
    )

# astype with a column -> dtype mapping returns a new frame, so nothing is assigned
# in place onto a frame that was produced by filtering
df_game_weeks = df_game_weeks.astype({col: int for col in cols_to_convert})

# df_final is the same object as df_game_weeks (not a copy)
df_final = df_game_weeks

# check the final dataframe
print(df_final.shape)
print(df_final.head())
print(df_final.dtypes)

(7480, 9)
       username               club        home_team          away_team  \
0   üç∫The Barman  Tottenham Hotspur  AFC Bournemouth            Arsenal   
1     Stephen O            Arsenal     Leeds United  Tottenham Hotspur   
2   Jim Shirley           West Ham          Everton          Brentford   
3  Steve arnold              Spurs         West Ham            Chelsea   
4    Mjd-‚öíÔ∏è‚öíÔ∏è‚öíÔ∏è   West Ham  united          Burnley            Arsenal   

   home_prediction  away_prediction  home_score  away_score  \
0                0                2           2           3   
1                1                2           1           2   
2                1                1           2           4   
3                1                3           1           5   
4                0                2           0           2   

                           game_week_id  
0  b8e9c830-4504-4e15-9aba-2cf1e3b2de9d  
1  d77ab4fc-0b5e-46dc-8403-2c02eb802dd6  
2  b8e9c830-4504-4e15-9ab

In [29]:
# clean up team names

# Known short / alternate spellings mapped to the canonical club name.
# The keys with leading or trailing spaces are kept unchanged so that any other use
# of this mapping still behaves the same; for the two columns cleaned below they are
# redundant because whitespace is stripped before the lookup.
team_fixes = {
    # Brighton
    "Brighton": "Brighton & Hove Albion",
    "Brighton and Hove Albion": "Brighton & Hove Albion",

    # Bournemouth
    "Bournemouth": "AFC Bournemouth",

    # Wolves
    "Wolves": "Wolverhampton Wanderers",

    # Man United
    "Man Utd": "Manchester United",
    "Man U": "Manchester United",
    "Manchester Utd": "Manchester United",
    "Manchester United ": "Manchester United",

    # Man City
    "Man City": "Manchester City",

    # Aston Villa
    "Aston Villa ": "Aston Villa",

    # West Ham
    "West Ham": "West Ham United",
    " West Ham": "West Ham United",

    # Crystal Palace
    " Crystal Palace": "Crystal Palace",

    # Brentford
    " Brentford": "Brentford",

    # Leeds
    "Leeds": "Leeds United",
    " Leeds": "Leeds United",

    # Newcastle
    "Newcastle": "Newcastle United",

    # Nottingham Forest
    "Nottm Forest": "Nottingham Forest",

    # Tottenham
    "Tottenham": "Tottenham Hotspur",
    " Tottenham Hotspur": "Tottenham Hotspur",

    # Chelsea
    " Chelsea": "Chelsea"
}

# apply fixes
# Strip surrounding whitespace first so padded variants that are not listed in
# team_fixes (for example a name with a trailing space) are normalised too, then map
# the remaining alternate spellings. Assumes both columns hold strings — the .str
# accessor raises on non-string columns.
df_final["home_team"] = df_final["home_team"].str.strip().replace(team_fixes)
df_final["away_team"] = df_final["away_team"].str.strip().replace(team_fixes)

# show unique values
# dropna() keeps sorted() from failing on a str/float comparison if a team name is
# missing; nunique() already ignores missing values, so the count and list agree.
print("Home")
print(df_final["home_team"].nunique())
print(sorted(df_final["home_team"].dropna().unique()))

print("\nAway")
print(df_final["away_team"].nunique())
print(sorted(df_final["away_team"].dropna().unique()))

Home
20
['AFC Bournemouth', 'Arsenal', 'Aston Villa', 'Brentford', 'Brighton & Hove Albion', 'Burnley', 'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Leeds United', 'Liverpool', 'Manchester City', 'Manchester United', 'Newcastle United', 'Nottingham Forest', 'Sunderland', 'Tottenham Hotspur', 'West Ham United', 'Wolverhampton Wanderers']

Away
20
['AFC Bournemouth', 'Arsenal', 'Aston Villa', 'Brentford', 'Brighton & Hove Albion', 'Burnley', 'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Leeds United', 'Liverpool', 'Manchester City', 'Manchester United', 'Newcastle United', 'Nottingham Forest', 'Sunderland', 'Tottenham Hotspur', 'West Ham United', 'Wolverhampton Wanderers']


In [32]:
# Quick look at the cleaned frame: first rows, then how many rows each home team,
# away team and game week contributes. Each heading and its table are emitted by a
# single print call, separated by a newline.
print(df_final.head())
print("home team counts", df_final["home_team"].value_counts(), sep="\n")
print("away team counts", df_final["away_team"].value_counts(), sep="\n")
print("game week counts", df_final["game_week_id"].value_counts(), sep="\n")

       username               club        home_team          away_team  \
0   üç∫The Barman  Tottenham Hotspur  AFC Bournemouth            Arsenal   
1     Stephen O            Arsenal     Leeds United  Tottenham Hotspur   
2   Jim Shirley           West Ham          Everton          Brentford   
3  Steve arnold              Spurs  West Ham United            Chelsea   
4    Mjd-‚öíÔ∏è‚öíÔ∏è‚öíÔ∏è   West Ham  united          Burnley            Arsenal   

   home_prediction  away_prediction  home_score  away_score  \
0                0                2           2           3   
1                1                2           1           2   
2                1                1           2           4   
3                1                3           1           5   
4                0                2           0           2   

                           game_week_id  
0  b8e9c830-4504-4e15-9aba-2cf1e3b2de9d  
1  d77ab4fc-0b5e-46dc-8403-2c02eb802dd6  
2  b8e9c830-4504-4e15-9aba-2cf1e3b2