In [5]:
import pandas as pd
import numpy as np
import re

In [28]:
defending_df = pd.read_csv('uncleaned_data_csv/defending.csv', header=None)
passing_df = pd.read_csv('uncleaned_data_csv/passing.csv', header=None)

## Quick glance at defending csv

In [9]:
defending_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,,,,,,,Rk,Player,Nation,Position,...,Tackles (Def 3rd),Tackles (Mid 3rd),Tackles (Att 3rd),Dribblers Tackled,Dribbles Challenged,% of Dribblers Tackled,Challenges Lost,Blocks,Shots Blocked,Season
1,1.0,Patrick van Aanholt,nl NED,DF,Crystal Palace,eng Premier League,26,1990,24.3,47,...,18,24,5,19,47,94,64,2,Matches,2017-2018
2,2.0,Rolando Aarons,eng ENG,"MF,FW",Newcastle Utd,eng Premier League,21,1995,1.5,4,...,2,3,0,3,1,5,0,0,Matches,2017-2018
3,3.0,Rolando Aarons,eng ENG,"MF,FW",Hellas Verona,it Serie A,21,1995,5.7,13,...,6,3,1,2,2,15,0,0,Matches,2017-2018
4,4.0,Ignazio Abate,it ITA,DF,Milan,it Serie A,30,1986,11.7,20,...,5,22,2,20,8,28,29,0,Matches,2017-2018


## Quick glance at passing csv

In [10]:
passing_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,,,,,,,,,,Rk,...,Passes Attempted (Short),Pass Completion % (Short),Passes Completed (Medium),Passes Attempted (Medium),Pass Completion % (Medium),Passes Completed (Long),Passes Attempted (Long),Pass Completion % (Long),Assists,Season
1,1.0,Patrick van Aanholt,nl NED,DF,Crystal Palace,eng Premier League,26.0,1990.0,24.3,884,...,2.1,1.8,-1.1,18,63,28,6,92,Matches,2017-2018
2,2.0,Rolando Aarons,eng ENG,"MF,FW",Newcastle Utd,eng Premier League,21.0,1995.0,1.5,29,...,0.0,0.0,0.0,0,2,1,1,3,Matches,2017-2018
3,3.0,Rolando Aarons,eng ENG,"MF,FW",Hellas Verona,it Serie A,21.0,1995.0,5.7,87,...,0.2,0.1,-0.2,3,8,7,1,17,Matches,2017-2018
4,4.0,Ignazio Abate,it ITA,DF,Milan,it Serie A,30.0,1986.0,11.7,625,...,0.5,0.8,-0.5,10,55,20,7,81,Matches,2017-2018


## Understanding the values in each column

In [11]:
for col in defending_df.columns:
    print(f"Column: {col}")
    print(defending_df[col].unique()[:10])  # Print first 10 unique values
    print("\n")

Column: 0
[nan '1' '2' '3' '4' '5' '6' '7' '8' '9']


Column: 1
[nan 'Patrick van Aanholt' 'Rolando Aarons' 'Ignazio Abate'
 'Aymen Abdennour' 'Aly Abeid' 'Mehdi Abeid' 'David Abraham'
 'Tammy Abraham' 'Amir Abrashi']


Column: 2
[nan 'nl NED' 'eng ENG' 'it ITA' 'tn TUN' 'mr MTN' 'dz ALG' 'ar ARG'
 'al ALB' 'gh GHA']


Column: 3
[nan 'DF' 'MF,FW' 'MF' 'FW' 'GK' 'DF,FW' 'Pos' 'DF,MF' 'FW,MF']


Column: 4
[nan 'Crystal Palace' 'Newcastle Utd' 'Hellas Verona' 'Milan' 'Marseille'
 'Levante' 'Dijon' 'Eint Frankfurt' 'Swansea City']


Column: 5
[nan 'eng Premier League' 'it Serie A' 'fr Ligue 1' 'es La Liga'
 'de Bundesliga' 'Comp']


Column: 6
['Rk' '26' '21' '30' '27' '19' '24' '31' '29' '25']


Column: 7
['Player' '1990' '1995' '1986' '1989' '1997' '1992' '1988' '1985' '1987']


Column: 8
['Nation' '24.3' '1.5' '5.7' '11.7' '5.5' '0.9' '13.1' '25.6' '19.2']


Column: 9
['Position' '47' '4' '13' '20' '7' '2' '23' '40' '10']


Column: 10
['Squad' '32' '4' '8' '17' '3' '1' '15' '6' '5']


Co

## Fixing column names in the table

In [11]:
def clean_header_rows(df, correct_column_idx):
    """
    Function to remove header rows within a dataset and reset the index.
    
    Parameters:
    df (pd.DataFrame): The input dataframe to be cleaned.
    correct_column_idx (int): The index of the row that contains the correct column names.
    
    Returns:
    pd.DataFrame: A cleaned DataFrame with internal header rows removed and reset index.
    """
    # Set the correct column names
    correct_column = df.iloc[correct_column_idx]

    #Drop the header column which is incorrect
    df = df.drop(0)
    
    # Drop the row that is intended to become the header
    df = df.drop(correct_column_idx)
    
    # Identify rows that match the correct columns
    header_rows = df.apply(lambda row: all(row == correct_column), axis=1)
    header_row_indices = df[header_rows].index
    
    # Drop the identified header rows
    cleaned_df = df.drop(header_row_indices)
    
    # Reset index
    cleaned_df = cleaned_df.reset_index(drop=True)
    
    # Assign the correct column names
    cleaned_df.columns = correct_column
    
    return cleaned_df

In [29]:
defending_df_cleaned = clean_header_rows(defending_df, 26)
defending_df_cleaned.head()

26,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,90s,Tkl,...,Lost,Blocks,Sh,Pass,Int,Tkl+Int,Clr,Err,Matches,2017-2018
0,1,Patrick van Aanholt,nl NED,DF,Crystal Palace,eng Premier League,26,1990,24.3,47,...,18,24,5,19,47,94,64,2,Matches,2017-2018
1,2,Rolando Aarons,eng ENG,"MF,FW",Newcastle Utd,eng Premier League,21,1995,1.5,4,...,2,3,0,3,1,5,0,0,Matches,2017-2018
2,3,Rolando Aarons,eng ENG,"MF,FW",Hellas Verona,it Serie A,21,1995,5.7,13,...,6,3,1,2,2,15,0,0,Matches,2017-2018
3,4,Ignazio Abate,it ITA,DF,Milan,it Serie A,30,1986,11.7,20,...,5,22,2,20,8,28,29,0,Matches,2017-2018
4,5,Aymen Abdennour,tn TUN,DF,Marseille,fr Ligue 1,27,1989,5.5,7,...,3,5,5,0,4,11,20,0,Matches,2017-2018


In [30]:
passing_df_cleaned = clean_header_rows(passing_df, 26)
passing_df_cleaned.head()

26,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,90s,Cmp,...,xAG,xA,A-xAG,KP,1/3,PPA,CrsPA,PrgP,Matches,2017-2018
0,1,Patrick van Aanholt,nl NED,DF,Crystal Palace,eng Premier League,26,1990,24.3,884,...,2.1,1.8,-1.1,18,63,28,6,92,Matches,2017-2018
1,2,Rolando Aarons,eng ENG,"MF,FW",Newcastle Utd,eng Premier League,21,1995,1.5,29,...,0.0,0.0,0.0,0,2,1,1,3,Matches,2017-2018
2,3,Rolando Aarons,eng ENG,"MF,FW",Hellas Verona,it Serie A,21,1995,5.7,87,...,0.2,0.1,-0.2,3,8,7,1,17,Matches,2017-2018
3,4,Ignazio Abate,it ITA,DF,Milan,it Serie A,30,1986,11.7,625,...,0.5,0.8,-0.5,10,55,20,7,81,Matches,2017-2018
4,5,Aymen Abdennour,tn TUN,DF,Marseille,fr Ligue 1,27,1989,5.5,310,...,0.0,0.0,0.0,0,8,0,0,12,Matches,2017-2018


## Dealing with repeated column names throughout the table

In [15]:
header_rows = defending_df.apply(lambda row: all(row == defending_df.columns), axis=1)
header_row_indices = defending_df[header_rows].index
header_rows
header_row_indices

Index([], dtype='int64')

In [12]:
def remove_internal_header_rows(df, header_row):
    """
    Removes rows in the DataFrame that are exact duplicates of the header row.

    Parameters:
        df (pd.DataFrame): The DataFrame to clean.
        header_row (pd.Series or list): The row to match against (usually the real header).

    Returns:
        pd.DataFrame: Cleaned DataFrame with internal header rows removed.
    """
    header_rows = df.apply(lambda row: all(row == header_row), axis=1)
    return df[~header_rows].reset_index(drop=True)

In [16]:
# View unique entries in the 'Nation' column
print("Unique values in the Nation column:")
print(defending_df_cleaned['Nation'].unique())

# View unique entries in the 'Comp' column
print("\nUnique values in the Comp column:")
print(defending_df_cleaned['Comp'].unique())

Unique values in the Nation column:
['nl NED' 'eng ENG' 'it ITA' 'tn TUN' 'mr MTN' 'dz ALG' 'ar ARG' 'al ALB'
 'gh GHA' 'sct SCO' 'es ESP' 'bj BEN' 'de GER' 'fr FRA' 'iq IRQ' 'cd COD'
 'uy URU' 'ci CIV' 'br BRA' 'ma MAR' 'ch SUI' 'ng NGA' 'at AUT' 'cl CHI'
 'be BEL' 'cm CMR' 'wls WAL' 'py PAR' 'id IDN' 'sn SEN' 'dk DEN' 'ro ROU'
 'jm JAM' 'pt POR' 'ca CAN' 'se SWE' 'ie IRL' 'jp JPN' 'ga GAB' 'cg CGO'
 'gf GUF' 'co COL' 'hr CRO' 'ba BIH' 'cz CZE' 'mk MKD' 'gm GAM' 'rs SRB'
 'ao ANG' 'ht HAI' 'gp GLP' 'cv CPV' 'tg TOG' 'pl POL' 'si SVN' 'ly LBY'
 'sk SVK' 'cw CUW' 'bi BDI' 'is ISL' 'ml MLI' 'cr CRC' 'us USA' 'nir NIR'
 'li LIE' 'tr TUR' 'mq MTQ' 'ec ECU' 'gn GUI' 'ug UGA' 'pe PER' 'kr KOR'
 'ru RUS' 'no NOR' 've VEN' 'bf BFA' 'hu HUN' 'sa KSA' 'za RSA' 'gr GRE'
 'eg EGY' 'mx MEX' 'do DOM' 'bg BUL' 'nc NCL' 'xk KVX' 'il ISR' 'fi FIN'
 'ph PHI' 'gw GNB' 'md MDA' 'me MNE' 'sl SLE' 'la LAO' 'ee EST' 'ne NIG'
 'cf CTA' 'ua UKR' 'kp PRK' 'mg MAD' 'au AUS' 'hn HON' 'km COM' 'lu LUX'
 'gq EQG' '

In [17]:
# Identify all column names
columns = defending_df_cleaned.columns

# Find duplicate "Tkl" columns
tkl_indices = [i for i, col in enumerate(columns) if col == "Tkl"]

# If there are multiple "Tkl" columns, rename the second one
if len(tkl_indices) > 1:
    defending_df_cleaned.columns.values[tkl_indices[1]] = "DribTkl"

# Print to confirm renaming worked
print(defending_df_cleaned.columns)


Index(['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'Born', '90s',
       'Tkl', 'TklW', 'Def 3rd', 'Mid 3rd', 'Att 3rd', 'DribTkl', 'Att',
       'Tkl%', 'Lost', 'Blocks', 'Sh', 'Pass', 'Int', 'Tkl+Int', 'Clr', 'Err',
       'Matches'],
      dtype='object', name=26)


## Original version which works but specifically for defending.csv

In [13]:
### 2️⃣ Function to Rename Duplicate Columns (Handles "Tkl" Issue) ###
def rename_duplicate_columns(df):
    """
    Renames duplicate column names to ensure uniqueness.
    Specifically renames the second occurrence of 'Tkl' to 'DribTkl'.

    Parameters:
        df (pd.DataFrame): The DataFrame with potential duplicate columns.

    Returns:
        pd.DataFrame: Updated DataFrame with unique column names.
    """
    columns = df.columns
    tkl_indices = [i for i, col in enumerate(columns) if col == "Tkl"]
    
    if len(tkl_indices) > 1:
        df.columns.values[tkl_indices[1]] = "DribTkl"
    
    return df

## Version that is modular and can be used for any df. Column names must be supplied

In [14]:
def rename_duplicate_columns_prime(df, col_name, new_names=None):
    """
    Renames duplicate columns to ensure uniqueness.
    By default, appends _1, _2, etc. to duplicates, or uses new_names if provided.

    Parameters:
        df (pd.DataFrame): The DataFrame with potential duplicate columns.
        col_name (str): The column name you expect to be duplicated.
        new_names (list, optional): List of new names for each duplicate (must match number of duplicates).

    Returns:
        pd.DataFrame: Updated DataFrame with unique column names.
    """
    columns = list(df.columns)
    indices = [i for i, col in enumerate(columns) if col == col_name]
    if len(indices) > 1:
        if new_names and len(new_names) == len(indices):
            for idx, new_col in zip(indices, new_names):
                df.columns.values[idx] = new_col
        else:
            # Keep the first as is, rename the rest with suffixes
            for i, idx in enumerate(indices[1:], 1):
                df.columns.values[idx] = f"{col_name}_{i}"
    return df

In [2]:
print(type(defending_df_cleaned["Tkl"]))  # Should be a Series, NOT a DataFrame
print(defending_df_cleaned["Tkl"].head())  # See the first few values

#This check is making sure that the "Tkl" column in your cleaned DataFrame is a Series (a single column of data),
#not a DataFrame (which would mean there are still duplicate columns named "Tkl").

NameError: name 'defending_df_cleaned' is not defined

In [20]:
for col in defending_df_cleaned.columns:
    print(f"Column: {col}")
    print(defending_df_cleaned[col].unique()[:10])  # Print first 10 unique values
    print("\n")

Column: Rk
['1' '2' '3' '4' '5' '6' '7' '8' '9' '10']


Column: Player
['Patrick van Aanholt' 'Rolando Aarons' 'Ignazio Abate' 'Aymen Abdennour'
 'Aly Abeid' 'Mehdi Abeid' 'David Abraham' 'Tammy Abraham' 'Amir Abrashi'
 'Francesco Acerbi']


Column: Nation
['nl NED' 'eng ENG' 'it ITA' 'tn TUN' 'mr MTN' 'dz ALG' 'ar ARG' 'al ALB'
 'gh GHA' 'sct SCO']


Column: Pos
['DF' 'MF,FW' 'MF' 'FW' 'GK' 'DF,FW' 'DF,MF' 'FW,MF' 'MF,DF' 'FW,DF']


Column: Squad
['Crystal Palace' 'Newcastle Utd' 'Hellas Verona' 'Milan' 'Marseille'
 'Levante' 'Dijon' 'Eint Frankfurt' 'Swansea City' 'Freiburg']


Column: Comp
['eng Premier League' 'it Serie A' 'fr Ligue 1' 'es La Liga'
 'de Bundesliga']


Column: Age
['26' '21' '30' '27' '19' '24' '31' '29' '25' '32']


Column: Born
['1990' '1995' '1986' '1989' '1997' '1992' '1988' '1985' '1987' '1998']


Column: 90s
['24.3' '1.5' '5.7' '11.7' '5.5' '0.9' '13.1' '25.6' '19.2' '9.4']


Column: Tkl
['47' '4' '13' '20' '7' '2' '23' '40' '10' '35']


Column: TklW
['32' '4'

In [21]:
# Count the frequency of each unique value in 'Nation_cleaned'
nation_value_counts = defending_df_cleaned['Nation'].value_counts()
print("\nFrequency of each unique value in Nation_cleaned:")
print(nation_value_counts)

# Count the frequency of each unique value in 'Comp_cleaned'
comp_value_counts = defending_df_cleaned['Comp'].value_counts()
print("\nFrequency of each unique value in Comp_cleaned:")
print(comp_value_counts)


Frequency of each unique value in Nation_cleaned:
Nation
es ESP     2837
fr FRA     2396
it ITA     1801
de GER     1656
eng ENG    1366
           ... 
mf SMN        1
la LAO        1
gt GUA        1
kp PRK        1
mt MLT        1
Name: count, Length: 135, dtype: int64

Frequency of each unique value in Comp_cleaned:
Comp
it Serie A            4192
es La Liga            4090
fr Ligue 1            3989
eng Premier League    3786
de Bundesliga         3506
Name: count, dtype: int64


In [22]:
print(defending_df_cleaned.dtypes)

26
Rk         object
Player     object
Nation     object
Pos        object
Squad      object
Comp       object
Age        object
Born       object
90s        object
Tkl        object
TklW       object
Def 3rd    object
Mid 3rd    object
Att 3rd    object
DribTkl    object
Att        object
Tkl%       object
Lost       object
Blocks     object
Sh         object
Pass       object
Int        object
Tkl+Int    object
Clr        object
Err        object
Matches    object
dtype: object


In [15]:
def keep_after_first_capital(s):
    """
    Extracts the substring starting from the first uppercase letter in a string.
    If the input is not a string, it returns an empty string.
    """
    if not isinstance(s, str):  # Check if the input is not a string
        return ''
    
    # Regular expression to match from the first uppercase letter to the end
    match = re.search(r'[A-Z].*', s)
    if match:
        return match.group(0)
    return ''

In [24]:
# Apply the function to 'Nation' and 'Comp' columns
defending_df_cleaned['Nation'] = defending_df_cleaned['Nation'].apply(keep_after_first_capital)
defending_df_cleaned['Comp'] = defending_df_cleaned['Comp'].apply(keep_after_first_capital)

# Display the cleaned columns
print(defending_df_cleaned[['Nation', 'Comp']].head(15))

26 Nation            Comp
0     NED  Premier League
1     ENG  Premier League
2     ENG         Serie A
3     ITA         Serie A
4     TUN         Ligue 1
5     MTN         La Liga
6     ALG         Ligue 1
7     ARG      Bundesliga
8     ENG  Premier League
9     ALB      Bundesliga
10    ITA         Serie A
11    GHA         Serie A
12    SCO  Premier League
13    ESP         La Liga
14    ESP         La Liga


In [16]:
def extract_clean_names(df):
    """
    Extracts country codes and league names from messy text.

    Parameters:
        df (pd.DataFrame): The DataFrame containing 'Nation' & 'Comp' columns.

    Returns:
        pd.DataFrame: Updated DataFrame with cleaned columns.
    """
    def keep_after_first_capital(s):
        if not isinstance(s, str):
            return ''
        match = re.search(r'[A-Z].*', s)
        return match.group(0) if match else ''
    
    df['Nation'] = df['Nation'].apply(keep_after_first_capital)
    df['Comp'] = df['Comp'].apply(keep_after_first_capital)
    
    return df

In [26]:
defending_df_cleaned.to_csv("defending_cleaned.csv", index=False)

In [27]:
print(defending_df_cleaned.dtypes)

26
Rk         object
Player     object
Nation     object
Pos        object
Squad      object
Comp       object
Age        object
Born       object
90s        object
Tkl        object
TklW       object
Def 3rd    object
Mid 3rd    object
Att 3rd    object
DribTkl    object
Att        object
Tkl%       object
Lost       object
Blocks     object
Sh         object
Pass       object
Int        object
Tkl+Int    object
Clr        object
Err        object
Matches    object
dtype: object


In [28]:
# If Tkl is still a DataFrame, select only the first column occurrence
if isinstance(defending_df_cleaned["Tkl"], pd.DataFrame):
    defending_df_cleaned["Tkl"] = defending_df_cleaned["Tkl"].iloc[:, 0]  # Keep only first occurrence

# Force numeric conversion on all intended columns
numeric_columns = ['Age', 'Born', '90s', 'Tkl', 'TklW', 'Def 3rd', 'Mid 3rd', 
                   'Att 3rd', 'Tkl%', 'Lost', 'Blocks', 'Int', 'Tkl+Int', 'Clr', 'Err', 'Sh', 'Pass', 'Att', 'Matches']

for col in numeric_columns:
    if col in defending_df_cleaned.columns:
        defending_df_cleaned[col] = (
            defending_df_cleaned[col]
            .astype(str)
            .str.replace('[^\d.]', '', regex=True)  # Remove all non-numeric characters
            .replace('', '0')  # Replace empty values with 0
        )

# Convert everything to numeric and fill NaNs with 0
for col in numeric_columns:
    if col in defending_df_cleaned.columns:
        defending_df_cleaned[col] = pd.to_numeric(defending_df_cleaned[col], errors='coerce').fillna(0)

# Ensure 'Born' is integer
if 'Born' in defending_df_cleaned.columns:
    defending_df_cleaned['Born'] = defending_df_cleaned['Born'].astype(int)

# Print final data types
print(defending_df_cleaned.dtypes)


26
Rk          object
Player      object
Nation      object
Pos         object
Squad       object
Comp        object
Age          int64
Born         int32
90s        float64
Tkl          int64
TklW         int64
Def 3rd      int64
Mid 3rd      int64
Att 3rd      int64
DribTkl     object
Att          int64
Tkl%       float64
Lost         int64
Blocks       int64
Sh           int64
Pass         int64
Int          int64
Tkl+Int      int64
Clr          int64
Err          int64
Matches      int64
dtype: object


In [41]:
### 4️⃣ Function to Convert Numeric Columns Safely ###
def convert_numeric_columns(df, numeric_columns):
    """
    Converts specified columns to numeric values, handling errors.

    Parameters:
        df (pd.DataFrame): The DataFrame with columns needing conversion.
        numeric_columns (list): List of column names to convert.

    Returns:
        pd.DataFrame: Updated DataFrame with numeric conversions.
    """
    for col in numeric_columns:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace('[^\d.]', '', regex=True).replace('', '0')
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    # Use pandas nullable integer type for 'Born'
    if 'Born' in df.columns:
        df['Born'] = pd.to_numeric(df['Born'], errors='coerce').astype('Int64')
    return df


In [30]:
print(defending_df_cleaned['Sh'].unique())  # Look for non-numeric values

[ 5  0  1  2  6 15 21  3 26  7  4 24 19 13 14  9 16 11 17  8 27 10 22 12
 18 44 29 45 25 23 59 58 36 20 47 37 32 28 40 38 42 35 34 30 43 39 33 31
 50 56 41 48 52 61 78]


In [31]:
print(defending_df_cleaned.head(5))

26 Rk               Player Nation    Pos           Squad            Comp  Age  \
0   1  Patrick van Aanholt    NED     DF  Crystal Palace  Premier League   26   
1   2       Rolando Aarons    ENG  MF,FW   Newcastle Utd  Premier League   21   
2   3       Rolando Aarons    ENG  MF,FW   Hellas Verona         Serie A   21   
3   4        Ignazio Abate    ITA     DF           Milan         Serie A   30   
4   5      Aymen Abdennour    TUN     DF       Marseille         Ligue 1   27   

26  Born   90s  Tkl  ...  Tkl%  Lost  Blocks  Sh Pass  Int  Tkl+Int  Clr  Err  \
0   1990  24.3   47  ...  47.1    18      24   5   19   47       94   64    2   
1   1995   1.5    4  ...  66.7     2       3   0    3    1        5    0    0   
2   1995   5.7   13  ...  14.3     6       3   1    2    2       15    0    0   
3   1986  11.7   20  ...  54.5     5      22   2   20    8       28   29    0   
4   1989   5.5    7  ...  57.1     3       5   5    0    4       11   20    0   

26  Matches  
0         0 

In [32]:
# Normalize metrics
defending_df_cleaned['Interceptions/90'] = defending_df_cleaned['Int'] / (defending_df_cleaned['90s'] / 90)
defending_df_cleaned['Tackles/90'] = defending_df_cleaned['Tkl'] / (defending_df_cleaned['90s'] / 90)
defending_df_cleaned['90s Played'] = defending_df_cleaned['90s'] / 90

In [19]:
### 5️⃣ Function to Normalize Metrics ###
def normalize_metrics(df):
    """
    Normalizes defensive metrics per 90 minutes.

    Parameters:
        df (pd.DataFrame): The DataFrame containing stats.

    Returns:
        pd.DataFrame: Updated DataFrame with normalized metrics.
    """
    df['Interceptions/90'] = df['Int'] / (df['90s'] / 90)
    df['Tackles/90'] = df['Tkl'] / (df['90s'] / 90)
    df['90s Played'] = df['90s'] / 90

    return df

In [18]:
def normalize_metrics_prime(df, per90_columns):
    """
    Normalizes specified columns per 90 minutes.

    Parameters:
        df (pd.DataFrame): The DataFrame containing stats.
        per90_columns (dict): Keys are columns to normalize, values are new column names.

    Returns:
        pd.DataFrame: Updated DataFrame with normalized metrics.
    """
    df['90s Played'] = df['90s'] / 90
    for col, new_col in per90_columns.items():
        if col in df.columns:
            df[new_col] = df[col] / (df['90s'] / 90)
    return df

In [34]:
print(defending_df_cleaned.dtypes)

26
Rk                   object
Player               object
Nation               object
Pos                  object
Squad                object
Comp                 object
Age                   int64
Born                  int32
90s                 float64
Tkl                   int64
TklW                  int64
Def 3rd               int64
Mid 3rd               int64
Att 3rd               int64
DribTkl              object
Att                   int64
Tkl%                float64
Lost                  int64
Blocks                int64
Sh                    int64
Pass                  int64
Int                   int64
Tkl+Int               int64
Clr                   int64
Err                   int64
Matches               int64
Interceptions/90    float64
Tackles/90          float64
90s Played          float64
dtype: object


In [None]:
### 6️⃣ Function to Process Data (Master Function) ###
def process_data(file_path, correct_column_idx, numeric_columns):
    """
    Master function to process and clean the dataset.

    Parameters:
        file_path (str): Path to the CSV file.
        correct_column_idx (int): Row index where correct headers are.
        numeric_columns (list): Columns that need to be numeric.

    Returns:
        pd.DataFrame: Fully cleaned and processed DataFrame.
    """
    df = pd.read_csv(file_path, header=None)
    df = clean_header_rows(df, correct_column_idx)
    df = remove_internal_header_rows(df, df.columns)
    df = rename_duplicate_columns_prime(df)
    df = extract_clean_names(df)
    df = convert_numeric_columns(df, numeric_columns)
    df = normalize_metrics(df)
    
    return df

In [36]:
def process_data_prime(file_path, correct_column_idx, numeric_columns, duplicate_columns_info=None, per90_columns=None):
    """
    Master function to process and clean the dataset.

    Parameters:
        file_path (str): Path to the CSV file.
        correct_column_idx (int): Row index where correct headers are.
        numeric_columns (list): Columns that need to be numeric.
        duplicate_columns_info (dict, optional): 
            Keys are column names you expect to be duplicated.
            Values are lists of new names for those columns (or None to use default suffixes).

    Returns:
        pd.DataFrame: Fully cleaned and processed DataFrame.
    """
    df = pd.read_csv(file_path, header=None)
    df = clean_header_rows(df, correct_column_idx)
    df = remove_internal_header_rows(df, df.columns)
    
    # Rename last column to 'Season'
    columns = list(df.columns)
    columns[-1] = "Season"
    df.columns = columns
    
    # Rename duplicate columns for all specified columns
    if duplicate_columns_info:
        for col_name, new_names in duplicate_columns_info.items():
            df = rename_duplicate_columns_prime(df, col_name, new_names)
    
    # Ensure all numeric columns are Series, not DataFrames
    for col in numeric_columns:
        if col in df.columns and isinstance(df[col], pd.DataFrame):
            df[col] = df[col].iloc[:, 0]
    
    df = extract_clean_names(df)
    df = convert_numeric_columns(df, numeric_columns)
    df = normalize_metrics_prime(df, per90_columns)
    
    return df

In [None]:
### 7️⃣ Run Cleaning Process on Both Datasets ###
numeric_columns_defending = ['Age', 'Born', '90s', 'Tkl', 'TklW', 'Def 3rd', 'Mid 3rd', 
                   'Att 3rd', 'Tkl%', 'Lost', 'Blocks', 'Int', 'Tkl+Int', 
                   'Clr', 'Err', 'Sh', 'Pass', 'Att', 'Matches']

numeric_columns_passing = ["Cmp_Total", "Cmp_Short", "Cmp_Medium", "Cmp_Long",
    "Att_Total", "Att_Short", "Att_Medium", "Att_Long",
    "90s"]

duplicate_columns_def = {"Tkl": ["Tkl", "DribTkl"]}

duplicate_columns_passing = {
    "Cmp":   ["Cmp_Total", "Cmp_Short", "Cmp_Medium", "Cmp_Long"],
    "Att":   ["Att_Total", "Att_Short", "Att_Medium", "Att_Long"],
    "Cmp%":  ["Cmp%_Total", "Cmp%_Short", "Cmp%_Medium", "Cmp%_Long"]
}

per90_cols_passing = {
    "Cmp_Total": "Cmp_Total/90",
    "Cmp_Short": "Cmp_Short/90",
    "Cmp_Medium": "Cmp_Medium/90",
    "Cmp_Long": "Cmp_Long/90",
    "Att_Total": "Att_Total/90",
    "Att_Short": "Att_Short/90",
    "Att_Medium": "Att_Medium/90",
    "Att_Long": "Att_Long/90"
}

per90_cols_defending = {
    "Int": "Interceptions/90",
    "Tkl": "Tackles/90",
    "90s": "90s Played"
}

In [37]:
# Process Defending Data
defending_df_cleaned = process_data('uncleaned_data_csv/defending.csv', 26, numeric_columns)
# Print final data types to confirm
print(defending_df_cleaned.dtypes)

26
Rk                   object
Player               object
Nation               object
Pos                  object
Squad                object
Comp                 object
Age                   int64
Born                  int32
90s                 float64
Tkl                   int64
TklW                  int64
Def 3rd               int64
Mid 3rd               int64
Att 3rd               int64
DribTkl              object
Att                   int64
Tkl%                float64
Lost                  int64
Blocks                int64
Sh                    int64
Pass                  int64
Int                   int64
Tkl+Int               int64
Clr                   int64
Err                   int64
Matches               int64
Interceptions/90    float64
Tackles/90          float64
90s Played          float64
dtype: object


In [20]:
passing_df_cleaned = clean_header_rows(passing_df, 26)
print(list(passing_df_cleaned.columns))

['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'Born', '90s', 'Cmp', 'Att', 'Cmp%', 'TotDist', 'PrgDist', 'Cmp', 'Att', 'Cmp%', 'Cmp', 'Att', 'Cmp%', 'Cmp', 'Att', 'Cmp%', 'Ast', 'xAG', 'xA', 'A-xAG', 'KP', '1/3', 'PPA', 'CrsPA', 'PrgP', 'Matches', '2017-2018']


In [40]:
print(passing_df_cleaned.dtypes)
print(passing_df_cleaned['Born'].isnull().sum())
print(passing_df_cleaned['Born'].unique())

26
Rk           object
Player       object
Nation       object
Pos          object
Squad        object
Comp         object
Age          object
Born         object
90s          object
Cmp          object
Att          object
Cmp%         object
TotDist      object
PrgDist      object
Cmp          object
Att          object
Cmp%         object
Cmp          object
Att          object
Cmp%         object
Cmp          object
Att          object
Cmp%         object
Ast          object
xAG          object
xA           object
A-xAG        object
KP           object
1/3          object
PPA          object
CrsPA        object
PrgP         object
Matches      object
2017-2018    object
dtype: object
5
['1990' '1995' '1986' '1989' '1997' '1992' '1988' '1985' '1987' '1998'
 '2000' '1993' '1981' '1996' '1994' '1999' '1991' '1983' '1984' '2001'
 '1980' '1977' '1982' '1978' '1979' nan 'Born' '2002' '2003' '2004' '2005'
 '2006' '2007' '2008']


In [42]:
passing_df_cleaned = process_data_prime(
    "uncleaned_data_csv/passing.csv",
    26,
    numeric_columns_passing,
    duplicate_columns_info=duplicate_columns_passing,
    per90_columns=per90_cols_passing
)

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [37]:
defending_df_cleaned = process_data_prime(
    'uncleaned_data_csv/defending.csv',
    26,
    numeric_columns_defending,
    duplicate_columns_info=duplicate_columns_def,
    per90_columns=per90_cols_defending
)

In [38]:
# Test for Defending Data
def test_defending_df(defending_df_cleaned):
    print("Testing Defending DataFrame...")
    # Check 'Season' column exists
    assert "Season" in defending_df_cleaned.columns, "Season column missing in defending_df_cleaned"
    # Check for unique columns
    assert defending_df_cleaned.columns.is_unique, "Defending DataFrame columns are not unique"
    print("Defending DataFrame columns:", list(defending_df_cleaned.columns))
    print("Defending DataFrame test passed!\n")

# Test for Passing Data
def test_passing_df(passing_df_cleaned):
    print("Testing Passing DataFrame...")
    # Check 'Season' column exists
    assert "Season" in passing_df_cleaned.columns, "Season column missing in passing_df_cleaned"
    # Check for renamed duplicate columns
    expected_cols = [
        "Cmp_Total", "Cmp_Short", "Cmp_Medium", "Cmp_Long",
        "Att_Total", "Att_Short", "Att_Medium", "Att_Long",
        "Cmp%_Total", "Cmp%_Short", "Cmp%_Medium", "Cmp%_Long"
    ]
    for col in expected_cols:
        assert col in passing_df_cleaned.columns, f"{col} missing in passing_df_cleaned"
    print("Passing DataFrame columns:", [col for col in passing_df_cleaned.columns if "Cmp" in col or "Att" in col])
    print("Passing DataFrame test passed!\n")

# Run the tests
test_defending_df(defending_df_cleaned)
test_passing_df(passing_df_cleaned)

Testing Defending DataFrame...
Defending DataFrame columns: ['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'Born', '90s', 'Tkl', 'TklW', 'Def 3rd', 'Mid 3rd', 'Att 3rd', 'DribTkl', 'Att', 'Tkl%', 'Lost', 'Blocks', 'Sh', 'Pass', 'Int', 'Tkl+Int', 'Clr', 'Err', 'Matches', 'Season', '90s Played', 'Interceptions/90', 'Tackles/90']
Defending DataFrame test passed!

Testing Passing DataFrame...


AssertionError: Season column missing in passing_df_cleaned