In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import os

In [2]:
RAW_DATA_FILE = "data/raw/raw_player_data.csv"

filename = RAW_DATA_FILE


def load():
    player_data = pd.read_csv(filename)

    print(player_data.shape)
    print(player_data.head(1))
    
    return player_data

In [3]:
player_data = load()

player_data

(91672, 88)
   Unnamed: 0         UID        Name    Rec                       DOB  Inf  \
0           0  2002078863  Scott King  - - -  10/9/2004 (17 years old)  Ama   

                     Club                       Based  Nat Height  ... Pres  \
0  Colorado International  U.S.A. (Mountain Division)  USA   5'9"  ...   14   

   Loy Inj Pr Imp M Dirt Amb Ada Cons Cont  Media Handling  
0    9     11    10   15  11   7   14   11  Media-friendly  

[1 rows x 88 columns]


Unnamed: 0.1,Unnamed: 0,UID,Name,Rec,DOB,Inf,Club,Based,Nat,Height,...,Pres,Loy,Inj Pr,Imp M,Dirt,Amb,Ada,Cons,Cont,Media Handling
0,0,2002078863,Scott King,- - -,10/9/2004 (17 years old),Ama,Colorado International,U.S.A. (Mountain Division),USA,"5'9""",...,14,9,11,10,15,11,7,14,11,Media-friendly
1,1,2002078833,José González,- - -,23/10/2004 (17 years old),Ctr,NC Fusion U23,U.S.A. (South Atlantic Division),USA,"5'4""",...,14,9,3,12,14,11,6,9,8,Media-friendly
2,2,2002078815,Chase Crane,- - -,6/12/2004 (17 years old),Ctr,NC Fusion U23,U.S.A. (South Atlantic Division),USA,"6'2""",...,9,11,2,8,15,13,10,8,9,Level-headed
3,3,2002078813,Emmanuel Araiza,- - -,30/11/2004 (17 years old),Ctr,Oly Town FC,U.S.A. (Northwest Division),USA,"6'1""",...,12,10,4,13,12,10,13,10,10,Media-friendly
4,4,2002078799,Yang Xiaofeng,- - -,30/6/2004 (17 years old),Ctr,Oly Town FC,U.S.A. (Northwest Division),CHN,"6'0""",...,9,11,14,7,7,6,11,7,5,Level-headed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91667,91667,20030405,Hunter Gorskie,- - -,27/6/1991 (31 years old),,Monterey Bay,U.S.A. (USSL-C Western Conference),USA,"5'11""",...,10,11,9,13,6,11,12,16,8,Media-friendly
91668,91668,19074835,Marcos Vinícius,- - -,27/6/1991 (31 years old),,Foz do Iguaçu,Brazil (Lower Division),BRA,"6'3""",...,11,13,13,5,5,11,15,10,6,Level-headed
91669,91669,14023371,Germán Pezzella,- - -,27/6/1991 (31 years old),,Real Hispalis,Spain (First Division),ARG,"6'2""",...,12,15,10,13,13,12,14,11,12,Level-headed
91670,91670,37001809,Jordy Clasie,- - -,27/6/1991 (31 years old),,AZ,Netherlands (Eredivisie),NED,"5'7""",...,16,16,9,14,12,14,7,17,8,Level-headed


In [4]:
player_data.columns
player_data["Transfer Value"].sort_values()

0                  0$
48885              0$
16814              0$
16813              0$
48887              0$
             ...     
78985    Not for Sale
88470    Not for Sale
53564    Not for Sale
84431    Not for Sale
89387    Not for Sale
Name: Transfer Value, Length: 91672, dtype: object

In [5]:
# Data Cleaning and Preprocessing

player_data.columns = player_data.columns.str.strip()

# Remove unwanted index column if present
if 'Unnamed: 0' in player_data.columns:
    player_data.drop(columns=['Unnamed: 0'], inplace=True)

# Convert 'DOB' to datetime; note that errors='coerce' will convert invalid dates to NaT
# player_data['DOB'] = pd.to_datetime(player_data['DOB'], format="mixed")

# Step 1: Extract the date part (everything before the space)
player_data['Date'] = player_data['DOB'].str.extract(r'(^[\d/]+)')

# Step 2: Convert to datetime
player_data['DOB'] = pd.to_datetime(player_data['Date'], format='%d/%m/%Y')

today = dt.datetime.strptime("01/01/2022", '%d/%m/%Y')
player_data['calculated_age'] = (today - player_data['DOB']).dt.days // 365

# Some currency columns like 'Transfer Value' are stored as strings. We try to clean them.
def clean_currency(val):
    if pd.isnull(val):
        return np.nan
    # Remove any currency symbols or commas
    val = str(val).replace('$', '').replace('€', '').replace(',', '').strip()
    try:
        return float(val)
    except:
        return np.nan

if 'Transfer Value' in player_data.columns:
    player_data['Transfer Value Clean'] = player_data['Transfer Value'].apply(clean_currency)

# Check the first few rows after cleaning
player_data.sort_values(by="DOB").head()

Unnamed: 0,UID,Name,Rec,DOB,Inf,Club,Based,Nat,Height,Weight,...,Imp M,Dirt,Amb,Ada,Cons,Cont,Media Handling,Date,calculated_age,Transfer Value Clean
77463,5700575,Kim Ki-Bum,- - -,1976-08-14,,Busan TC,South Korea (K3 League),KOR,"5'9""",65 kg,...,14,8,5,15,14,5,Media-friendly,14/8/1976,45,0.0
77464,6300071,Christian González,- - -,1976-08-30,,RANS Nusantara,Indonesia (League One),IDN,"5'10""",78 kg,...,12,13,7,14,9,7,Outspoken,30/8/1976,45,
77465,82000936,Aljame Zuill,- - -,1976-10-08,,Devonshire Colts,Bermuda (First Division),BER,"6'2""",79 kg,...,12,5,12,11,9,5,Media-friendly,8/10/1976,45,
77466,3201306,Hocine Chebaïki,- - -,1976-11-12,Ama,AC Le Roeulx,Belgium (Hainaut Provincial),BEL,"5'9""",74 kg,...,6,5,10,8,11,6,Level-headed,12/11/1976,45,0.0
77467,6704742,Sergio Aragoneses,- - -,1977-02-01,,Esperanza,Spain (Regional Division),ESP,"5'11""",79 kg,...,9,14,8,7,9,12,Level-headed,1/2/1977,44,


In [6]:
series = pd.MultiIndex.from_frame(player_data)
series.stack()

AttributeError: 'MultiIndex' object has no attribute 'stack'