In [1]:
import requests
import pandas as pd
import unicodedata
import time
import os
import asyncio
from understat import Understat
import aiohttp
from datetime import timedelta
import shutil

In [None]:
# Fetch the data from the FPL API
url = "https://fantasy.premierleague.com/api/bootstrap-static/"
response = requests.get(url)
data = response.json()

# Organize the main data categories into DataFrames
elements_df = pd.DataFrame(data['elements'])  # Player statistics and details
element_types_df = pd.DataFrame(data['element_types'])  # Player positions
teams_df = pd.DataFrame(data['teams'])  # Team information
events_df = pd.DataFrame(data['events'])  # Gameweeks
game_settings_df = pd.DataFrame([data['game_settings']])  # Game settings
phases_df = pd.DataFrame(data['phases'])  # Season phases

# Clean accents from 'First Name' and 'Second Name' in elements_df
def remove_accents(text):
    nfkd_form = unicodedata.normalize('NFKD', text)
    return ''.join([c for c in nfkd_form if not unicodedata.combining(c)])

# Apply accent removal
elements_df['first_name'] = elements_df['first_name'].apply(remove_accents)
elements_df['second_name'] = elements_df['second_name'].apply(remove_accents)

elements_df.to_csv("fpl_elements.csv", index=False)

# Renaming columns in elements_df to more descriptive names for convenience
elements_df = elements_df.rename(columns={
    'team': 'Team ID',
    'element_type': 'Position ID',
    'total_points': 'Total Points',
    'now_cost': 'Current Price',
    'selected_by_percent': 'Selected By (%)'
})

# Merge elements_df with teams_df to get team names for players
elements_with_teams = elements_df.merge(teams_df[['id', 'name']], how='left', left_on='Team ID', right_on='id')
elements_with_teams.rename(columns={'name': 'Team Name'}, inplace=True)

# Drop redundant 'id' column only if it exists
if 'id' in elements_with_teams.columns:
    elements_with_teams.drop(columns=['id'], inplace=True)

# Merge elements_with_teams with element_types_df to get position names for players
elements_final = elements_with_teams.merge(element_types_df[['id', 'singular_name']], how='left', left_on='Position ID', right_on='id')
elements_final.rename(columns={'singular_name': 'Position Name'}, inplace=True)

# Drop redundant 'id' column from element_types_df merge only if it exists
if 'id' in elements_final.columns:
    elements_final.drop(columns=['id'], inplace=True)

# Display the final DataFrame structure
print(elements_final.head())


   chance_of_playing_next_round  chance_of_playing_this_round    code  \
0                           0.0                           0.0  438098   
1                         100.0                         100.0  205651   
2                         100.0                          75.0  226597   
3                         100.0                         100.0  219847   
4                           0.0                           0.0  463748   

   cost_change_event  cost_change_event_fall  cost_change_start  \
0                  0                       0                 -1   
1                  0                       0                 -2   
2                  0                       0                  2   
3                  0                       0                  1   
4                  0                       0                  0   

   cost_change_start_fall  dreamteam_count  Position ID ep_next  ...  \
0                       1                0            3     0.0  ...   
1             

In [None]:
# Load the elements data to get player names and IDs
elements_df = pd.read_csv("fpl_elements.csv")

# Select only the necessary columns for merging later
elements_df = elements_df[['id', 'first_name', 'second_name']]

# Initialize an empty list to collect data for each player per gameweek
all_gameweek_data = []

# Loop through each player's ID in elements_df
for _, row in elements_df.iterrows():
    player_id = row['id']
    first_name = row['first_name']
    last_name = row['second_name']
    
    # Fetch gameweek data for each player
    url = f"https://fantasy.premierleague.com/api/element-summary/{player_id}/"
    response = requests.get(url)
    
    if response.status_code == 200:
        player_data = response.json()
        
        # Extract each gameweek's data and add it to our list
        for gameweek in player_data['history']:
            # Add player ID, first name, and last name to each gameweek record
            gameweek['player_id'] = player_id
            gameweek['first_name'] = first_name
            gameweek['last_name'] = last_name
            all_gameweek_data.append(gameweek)
    
    # Optional: Sleep to avoid rate limiting
    time.sleep(0.5)

# Convert the collected gameweek data into a DataFrame
gameweek_df = pd.DataFrame(all_gameweek_data)

# Rename columns for clarity where applicable
column_renames = {
    'round': 'Gameweek',
    'team_h_score': 'Goals_for_Home_Team',
    'team_a_score': 'Goals_for_Away_Team',
    'bonus': 'Bonus_Points',
    'value': 'Market_Price',
    'ict_index': 'Influence_Creativity_Threat_Index',
    'influence': 'Influence_Score',
    'creativity': 'Creativity Score',
    'threat': 'Threat Score',
}

gameweek_df.rename(columns=column_renames, inplace=True)

# Display the first few rows to confirm
print(gameweek_df.head())


   element  fixture  opponent_team  total_points  was_home  \
0        1        2             20             0      True   
1        1       11              2             0     False   
2        1       21              5             0      True   
3        1       39             18             0     False   
4        1       47             13             0     False   

           kickoff_time  Goals_for_Home_Team  Goals_for_Away_Team  Gameweek  \
0  2024-08-17T14:00:00Z                  2.0                  0.0         1   
1  2024-08-24T16:30:00Z                  0.0                  2.0         2   
2  2024-08-31T11:30:00Z                  1.0                  1.0         3   
3  2024-09-15T13:00:00Z                  0.0                  1.0         4   
4  2024-09-22T15:30:00Z                  2.0                  2.0         5   

   minutes  ...  expected_goal_involvements  expected_goals_conceded  \
0        0  ...                        0.00                     0.00   
1       

In [22]:
# Create csv files
# Seasonal Player Data

# Optionally, save each DataFrame to CSV for further analysis
#elements_df.to_csv("fpl_elements.csv", index=False)
element_types_df.to_csv("fpl_element_types.csv", index=False)
teams_df.to_csv("fpl_teams.csv", index=False)
events_df.to_csv("fpl_events.csv", index=False)
game_settings_df.to_csv("fpl_game_settings.csv", index=False)
phases_df.to_csv("fpl_phases.csv", index=False)


# Optionally, save the DataFrame to a CSV file
# Weekly Player Data
gameweek_df.to_csv("FPL_Gameweek_Data.csv", index=False)

In [None]:
# List all CSV files in the current directory
csv_files = [f for f in os.listdir() if f.endswith('.csv')]

# Dictionary to store columns for each DataFrame
file_columns = {}

# Load each CSV file and store the column names
for file in csv_files:
    try:
        df = pd.read_csv(file, encoding='ISO-8859-1')  # Specify encoding
        file_columns[file] = set(df.columns)
        print(df.shape[0])
    except UnicodeDecodeError:
        print(f"Could not read {file} due to encoding issues.")

# Find columns shared by all files (intersection of all column sets)
shared_columns = set.intersection(*file_columns.values())

# Print shared columns
print("Columns shared by all files:")
print(shared_columns)

# Print unique columns for each file
print("\nUnique columns in each file:")
for file, columns in file_columns.items():
    unique_columns = columns - shared_columns
    print(f"{file}: {unique_columns}")



29725
25447
21790
23679
22560
24365
26505
5878
22467
Columns shared by all files:
{'fixture', 'saves', 'name', 'bonus', 'bps', 'penalties_saved', 'yellow_cards', 'team_a_score', 'goals_conceded', 'value', 'opponent_team', 'goals_scored', 'was_home', 'creativity', 'assists', 'transfers_out', 'transfers_balance', 'team_h_score', 'kickoff_time', 'selected', 'influence', 'red_cards', 'total_points', 'transfers_in', 'GW', 'penalties_missed', 'own_goals', 'ict_index', 'element', 'round', 'minutes', 'clean_sheets', 'threat'}

Unique columns in each file:
merged_gw_2324.csv: {'expected_goals_conceded', 'expected_goals', 'starts', 'xP', 'position', 'team', 'expected_goal_involvements', 'expected_assists'}
merged_gw_2122.csv: {'position', 'xP', 'team'}
merged_gw_1819.csv: {'attempted_passes', 'errors_leading_to_goal_attempt', 'big_chances_missed', 'big_chances_created', 'open_play_crosses', 'ea_index', 'penalties_conceded', 'clearances_blocks_interceptions', 'winning_goals', 'tackles', 'errors_l

In [None]:
async def fetch_understat_data(season):
    async with aiohttp.ClientSession() as session:
        understat = Understat(session)
        
        # Fetch player data for the specified season in the Premier League
        players_data = await understat.get_league_players("epl", season)
        
        # Create an empty list to hold the player gameweek data
        player_gameweek_data = []

        for player in players_data:
            player_id = player['id']
            player_name = player['player_name']
            
            # Fetch player matches (gameweek data) for each player in the season
            matches = await understat.get_player_matches(player_id)
            
            # Check the structure of a single match
            if matches:
                print(f"Sample match data for {player_name} in {season}:", matches[0])
                break

# Run this code to view the structure of a match data item
await fetch_understat_data("2020")

Sample match data for Harry Kane in 2020: {'goals': '2', 'shots': '3', 'xG': '1.38821542263031', 'time': '90', 'position': 'FW', 'h_team': 'Bayern Munich', 'a_team': 'Union Berlin', 'h_goals': '3', 'a_goals': '0', 'date': '2024-11-02', 'id': '27815', 'season': '2024', 'roster_id': '684787', 'xA': '0.8060860633850098', 'assists': '1', 'key_passes': '4', 'npg': '1', 'npxG': '0.6304387450218201', 'xGChain': '1.6077451705932617', 'xGBuildup': '0.1712203174829483'}


In [None]:
async def fetch_understat_data(season):
    async with aiohttp.ClientSession() as session:
        understat = Understat(session)
        
        # Fetch player data for the specified season in the Premier League
        players_data = await understat.get_league_players("epl", season)
        
        # Create an empty list to hold the player gameweek data
        player_gameweek_data = []

        for player in players_data:
            player_id = player['id']
            player_name = player['player_name']
            
            # Fetch player matches (gameweek data) for each player in the season
            matches = await understat.get_player_matches(player_id)
            
            # Extract xG, xA, and calculate expected goal involvements
            for match in matches:
                if match['season'] == season:
                    gameweek_data = {
                        "player_name": player_name,
                        "date": match['date'],
                        "xG": float(match['xG']),
                        "xA": float(match['xA']),
                        "expected_goal_involvements": float(match['xG']) + float(match['xA'])
                    }
                    player_gameweek_data.append(gameweek_data)
        
        # Convert the list to a DataFrame for easy manipulation
        return pd.DataFrame(player_gameweek_data)

# Fetch data for the seasons from 2016/17 to 2021/22
async def main():
    seasons = ["2016", "2017", "2018", "2019", "2020", "2021"]
    for season in seasons:
        print(f"Fetching data for the {season}/{str(int(season) + 1)[-2:]} season...")
        try:
            season_data = await fetch_understat_data(season)
            season_data.to_csv(f"understat_{season}_{str(int(season) + 1)[-2:]}.csv", index=False)
            print(f"Data for {season}/{str(int(season) + 1)[-2:]} saved successfully.")
        except Exception as e:
            print(f"Failed to fetch data for {season}/{str(int(season) + 1)[-2:]}. Error: {e}")
        
        # Wait a few seconds between each season request to avoid overloading the server
        await asyncio.sleep(5)  # Adjust this delay as needed

# Run the asynchronous main function
await main()



Fetching data for the 2016/17 season...
Data for 2016/17 saved successfully.
Fetching data for the 2017/18 season...
Data for 2017/18 saved successfully.
Fetching data for the 2018/19 season...
Data for 2018/19 saved successfully.
Fetching data for the 2019/20 season...
Data for 2019/20 saved successfully.
Fetching data for the 2020/21 season...
Data for 2020/21 saved successfully.
Fetching data for the 2021/22 season...
Data for 2021/22 saved successfully.


Combine Understat data (xG, xA, xGI) with merged_gw files from Vaastav. These will be matched by gameweek, not specific individual date, so if Vaastav files contain multiple rows for a single player in a specific game week, but only one corresponding entry in Understat, the values for the whole gameweek will be imputed with the most chronologically-appropriate estimate from Understat.

In [2]:
def normalize_name(name):
    """Normalize player names by removing special characters and suffixes."""
    name = unicodedata.normalize('NFKD', name)
    name = ''.join(c for c in name if not unicodedata.combining(c))
    if "_" in name:
        # Remove suffix numbers, e.g., '_191'
        name = '_'.join(name.split('_')[:2])
    return name.replace("_", " ")  # Replace underscores with spaces

def assign_gameweeks(understat_df):
    """Assign gameweeks based on the date column in the understat data."""
    # Sort dates chronologically
    understat_df['date'] = pd.to_datetime(understat_df['date'])
    understat_df = understat_df.sort_values(by='date').reset_index(drop=True)

    # Initialize gameweek assignment
    gameweeks = []
    current_gw_start = understat_df['date'].iloc[0]
    current_gw_end = current_gw_start + timedelta(days=6)
    current_gw = 1

    for game_date in understat_df['date']:
        if game_date > current_gw_end:
            # Start a new gameweek
            current_gw_start = game_date
            current_gw_end = current_gw_start + timedelta(days=6)
            current_gw += 1
        gameweeks.append(current_gw)

    understat_df['gameweek'] = gameweeks
    return understat_df

def merge_understat_with_merged_gw(season_start, season_end):
    """Merge understat data with merged_gw data for a given season."""
    understat_file = f"understat_{season_start}_{season_end}.csv"
    merged_gw_file = f"merged_gw_{season_start[-2:]}{season_end[-2:]}.csv"
    output_file = f"20{season_start}-20{season_end} season data.csv"

    # Load the data with proper encoding
    understat_df = pd.read_csv(understat_file, encoding='ISO-8859-1')
    merged_gw_df = pd.read_csv(merged_gw_file, encoding='ISO-8859-1')

    # Normalize names in both datasets
    merged_gw_df['name'] = merged_gw_df['name'].apply(normalize_name)
    understat_df['player_name'] = understat_df['player_name'].apply(normalize_name)

    # Assign gameweeks based on understat dates
    understat_df = assign_gameweeks(understat_df)

    # Convert kickoff_time to date format (YYYY-MM-DD)
    merged_gw_df['kickoff_date'] = pd.to_datetime(merged_gw_df['kickoff_time']).dt.date
    understat_df['date'] = understat_df['date'].dt.date

    # Merge data on name and gameweek
    combined_df = pd.merge(
        merged_gw_df,
        understat_df,
        left_on=['name', 'kickoff_date'],
        right_on=['player_name', 'date'],
        how='left'
    )

    # Drop redundant columns
    combined_df.drop(['player_name', 'date', 'kickoff_date'], axis=1, inplace=True)

    # Save the combined data
    combined_df.to_csv(output_file, index=False, na_rep='NaN')
    print(f"Saved combined data for 20{season_start}-20{season_end} season to {output_file}")

    # Report the number of NaN rows in xA, xG, and xGI columns
    nan_counts = combined_df[['xA', 'xG', 'expected_goal_involvements']].isna().sum()
    print(f"Season 20{season_start}-20{season_end}: NaN counts - xA: {nan_counts['xA']}, xG: {nan_counts['xG']}, xGI: {nan_counts['expected_goal_involvements']}")


# List of seasons
seasons = [
    ("16", "17"),
    ("17", "18"),
    ("18", "19"),
    ("19", "20"),
    ("20", "21"),
    ("21", "22"),
]

# Process each season
for season_start, season_end in seasons:
    merge_understat_with_merged_gw(season_start, season_end)


Saved combined data for 2016-2017 season to 2016-2017 season data.csv
Season 2016-2017: NaN counts - xA: 15178, xG: 15178, xGI: 15178
Saved combined data for 2017-2018 season to 2017-2018 season data.csv
Season 2017-2018: NaN counts - xA: 13966, xG: 13966, xGI: 13966
Saved combined data for 2018-2019 season to 2018-2019 season data.csv
Season 2018-2019: NaN counts - xA: 13814, xG: 13814, xGI: 13814
Saved combined data for 2019-2020 season to 2019-2020 season data.csv
Season 2019-2020: NaN counts - xA: 13562, xG: 13562, xGI: 13562
Saved combined data for 2020-2021 season to 2020-2021 season data.csv
Season 2020-2021: NaN counts - xA: 15749, xG: 15749, xGI: 15749
Saved combined data for 2021-2022 season to 2021-2022 season data.csv
Season 2021-2022: NaN counts - xA: 16899, xG: 16899, xGI: 16899


In [3]:
# List of source and destination file names
files_to_rename = [
    ("merged_gw_2223.csv", "2022-2023 season data.csv"),
    ("merged_gw_2324.csv", "2023-2024 season data.csv"),
    ("merged_gw_2425.csv", "2024-2025 season data.csv"),
]

# Iterate through the list and copy/rename files
for source, destination in files_to_rename:
    try:
        shutil.copy(source, destination)
        print(f"Copied and renamed: {source} -> {destination}")
    except FileNotFoundError:
        print(f"File not found: {source}")
    except Exception as e:
        print(f"An error occurred while processing {source}: {e}")


Copied and renamed: merged_gw_2223.csv -> 2022-2023 season data.csv
Copied and renamed: merged_gw_2324.csv -> 2023-2024 season data.csv
Copied and renamed: merged_gw_2425.csv -> 2024-2025 season data.csv


In [4]:
s1617 = pd.read_csv('2016-2017 season data.csv')
s1718 = pd.read_csv('2017-2018 season data.csv')
s1819 = pd.read_csv('2018-2019 season data.csv')
s1920 = pd.read_csv('2019-2020 season data.csv')
s2021 = pd.read_csv('2020-2021 season data.csv')
s2122 = pd.read_csv('2021-2022 season data.csv')
s2223 = pd.read_csv('2022-2023 season data.csv')
s2324 = pd.read_csv('2023-2024 season data.csv')
s2425 = pd.read_csv('2024-2025 season data.csv')

In [5]:
season_list = [s1617, s1718, s1819, s1920, s2021, s2122, s2223, s2324, s2425]

for df in season_list:
    if 'xA' in df.columns:
        df.rename(columns={'xA': 'expected_assists'}, inplace=True)
    if 'xG' in df.columns:
        df.rename(columns={'xG': 'expected_goals'}, inplace=True)
    if 'ict_index' in df.columns:
        df.rename(columns={'ict_index': 'Influence_Creativity_Threat_Index'}, inplace=True)
    if 'gameweek' in df.columns:
        df.drop(columns=['gameweek'], inplace=True)

In [6]:
# Dictionary to store column names for each DataFrame
df_columns = {}

# Extract column names for each DataFrame
for i, df in enumerate(season_list):
    try:
        season_name = f"Season {16 + i}-{17 + i}"  # Create a season label
        df_columns[season_name] = set(df.columns)
    except Exception as e:
        print(f"An error occurred while processing {season_name}: {e}")

# Find columns shared by all DataFrames (intersection of all column sets)
shared_columns = set.intersection(*df_columns.values())

# Print shared columns
print("Shared columns across all DataFrames:")
print(shared_columns)
print("\n")

# Identify and print unique columns for each DataFrame
for season, columns in df_columns.items():
    unique_columns = columns - shared_columns
    print(f"Season: {season}")
    print(f"Unique columns: {unique_columns if unique_columns else 'None'}")
    print("\n")

Shared columns across all DataFrames:
{'team_h_score', 'transfers_out', 'assists', 'penalties_saved', 'name', 'influence', 'opponent_team', 'yellow_cards', 'fixture', 'clean_sheets', 'was_home', 'total_points', 'expected_goals', 'minutes', 'transfers_in', 'creativity', 'bps', 'penalties_missed', 'expected_goal_involvements', 'element', 'value', 'team_a_score', 'expected_assists', 'GW', 'own_goals', 'goals_conceded', 'selected', 'bonus', 'Influence_Creativity_Threat_Index', 'red_cards', 'saves', 'threat', 'round', 'kickoff_time', 'goals_scored', 'transfers_balance'}


Season: Season 16-17
Unique columns: {'errors_leading_to_goal', 'open_play_crosses', 'dribbles', 'loaned_out', 'big_chances_created', 'offside', 'clearances_blocks_interceptions', 'tackles', 'completed_passes', 'target_missed', 'ea_index', 'key_passes', 'fouls', 'kickoff_time_formatted', 'penalties_conceded', 'big_chances_missed', 'errors_leading_to_goal_attempt', 'id', 'attempted_passes', 'loaned_in', 'recoveries', 'tackl

In [7]:
for df in [s2223, s2324, s2425]:
    df.drop(columns=['expected_goals_conceded', 'starts'], inplace=True)


In [8]:
# Dictionary to store column names for each DataFrame
df_columns = {}
list2 = [s2021, s2122, s2223, s2324, s2425]

# Extract column names for each DataFrame
for i, df in enumerate(list2):
    try:
        season_name = f"Season {20 + i}-{21 + i}"  # Create a season label
        df_columns[season_name] = set(df.columns)
    except Exception as e:
        print(f"An error occurred while processing {season_name}: {e}")

# Find columns shared by all DataFrames (intersection of all column sets)
shared_columns = set.intersection(*df_columns.values())

# Print shared columns
print("Shared columns across all DataFrames:")
print(shared_columns)
print("\n")

# Identify and print unique columns for each DataFrame
for season, columns in df_columns.items():
    unique_columns = columns - shared_columns
    print(f"Season: {season}")
    print(f"Unique columns: {unique_columns if unique_columns else 'None'}")
    print("\n")

Shared columns across all DataFrames:
{'team_h_score', 'transfers_out', 'assists', 'penalties_saved', 'name', 'influence', 'opponent_team', 'yellow_cards', 'fixture', 'clean_sheets', 'was_home', 'total_points', 'expected_goals', 'minutes', 'transfers_in', 'team', 'bps', 'creativity', 'penalties_missed', 'expected_goal_involvements', 'position', 'element', 'value', 'expected_assists', 'team_a_score', 'GW', 'own_goals', 'goals_conceded', 'selected', 'bonus', 'Influence_Creativity_Threat_Index', 'red_cards', 'xP', 'saves', 'threat', 'round', 'kickoff_time', 'goals_scored', 'transfers_balance'}


Season: Season 20-21
Unique columns: None


Season: Season 21-22
Unique columns: None


Season: Season 22-23
Unique columns: None


Season: Season 23-24
Unique columns: None


Season: Season 24-25
Unique columns: None




In [9]:
# List of DataFrames to combine for training data
dataframes = [s2021, s2122, s2223, s2324]

# Combine all DataFrames into a single DataFrame
train = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a CSV file
train.to_csv("train.csv", index=False)

train.head()

Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW,expected_goals,expected_assists,expected_goal_involvements
0,Aaron Connolly,FWD,Brighton,0.5,0,0,-3,0,0.3,78,...,0,0,0,55,True,0,1,0.392763,0.0,0.392763
1,Aaron Cresswell,DEF,West Ham,2.1,0,0,11,0,11.2,435,...,0,0,0,50,True,0,1,0.0,0.0,0.0
2,Aaron Mooy,MID,Brighton,0.0,0,0,0,0,0.0,60,...,0,0,0,50,True,0,1,,,
3,Aaron Ramsdale,GK,Sheffield Utd,2.5,0,0,12,0,0.0,483,...,0,0,0,50,True,0,1,0.0,0.0,0.0
4,Abdoulaye DoucourA©,MID,Everton,1.3,0,0,20,1,44.6,512,...,0,0,0,55,False,0,1,0.0,0.205708,0.205708


In [11]:
print(train.shape)
print(train.isna().any(axis=1).sum())


(106042, 39)
32648


In [12]:
# List of DataFrames to combine
dataframes_old = [s1617, s1718, s1819]

# Combine all DataFrames into a single DataFrame
train_old = pd.concat(dataframes_old, ignore_index=True)

# Save the combined DataFrame to a CSV file
train_old.to_csv("train_old.csv", index=False)

train_old.head()

Unnamed: 0,name,assists,attempted_passes,big_chances_created,big_chances_missed,bonus,bps,clean_sheets,clearances_blocks_interceptions,completed_passes,...,transfers_in,transfers_out,value,was_home,winning_goals,yellow_cards,GW,expected_goals,expected_assists,expected_goal_involvements
0,Aaron Cresswell,0,0,0,0,0,0,0,0,0,...,0,0,55,False,0,0,1,,,
1,Aaron Lennon,0,3,0,0,0,6,0,1,2,...,0,0,60,True,0,0,1,0.0,0.0,0.0
2,Aaron Ramsey,0,26,0,0,0,5,0,2,22,...,0,0,80,True,0,0,1,0.076822,0.0,0.076822
3,Abdoulaye Doucoure,0,0,0,0,0,0,0,0,0,...,0,0,50,False,0,0,1,,,
4,Abdul Rahman Baba,0,0,0,0,0,0,0,0,0,...,0,0,55,True,0,0,1,,,


In [13]:
print(train_old.shape)
print(train_old.isna().any(axis=1).sum())

(67936, 59)
42958


In [14]:
test = s2425.copy()
test.to_csv("test.csv", index=False)
test.head()

Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW
0,Alex Scott,MID,Bournemouth,1.6,0,0,11,0,12.8,77,...,1,0.0,2,0,0,0,50,False,0,1
1,Carlos Miguel dos Santos Pereira,GK,Nott'm Forest,2.2,0,0,0,0,0.0,427,...,1,0.0,0,0,0,0,45,True,0,1
2,Tomiyasu Takehiro,DEF,Arsenal,0.0,0,0,0,0,0.0,22,...,2,0.0,0,0,0,0,50,True,0,1
3,Malcolm Ebiowei,MID,Crystal Palace,0.0,0,0,0,0,0.0,197,...,2,0.0,0,0,0,0,45,False,0,1
4,Ben Brereton Díaz,MID,Southampton,1.0,0,0,-2,0,14.0,584,...,1,16.0,1,0,0,0,55,False,1,1


In [15]:
print(test.shape)
print(test.isna().any(axis=1).sum())

(5878, 39)
0
