# Data Ingestion Pipeline

- Import the required libraries
- Move the data from the cached kagglehub dataset folder into the project's data/raw folder


In [95]:
import kagglehub
import shutil
from pathlib import Path

In [96]:
# Fetch the FIFA player stats dataset through kagglehub and keep the local folder it returns.
dataset_handle = "bryanb/fifa-player-stats-database"
path = kagglehub.dataset_download(dataset_handle)

In [97]:
# Show where kagglehub placed the dataset files.
label = "Path to dataset files:"
print(label, path)

Path to dataset files: C:\Users\HP\.cache\kagglehub\datasets\bryanb\fifa-player-stats-database\versions\35


In [98]:
# Move the downloaded dataset version folder into the project's raw-data directory,
# unless an earlier run already put it there.
raw_dir = Path("../data/raw")
source_dir = Path(path)  # folder returned by kagglehub.dataset_download
target_dir = raw_dir / source_dir.name  # e.g. ../data/raw/35

# A Path object is always truthy, so the check has to call .exists() explicitly.
if target_dir.exists():
    print("data exists!")
else:
    # Create the parent first: moving into a missing "../data/raw" would rename the
    # version folder to "raw" itself instead of placing it underneath.
    raw_dir.mkdir(parents=True, exist_ok=True)
    shutil.move(str(source_dir), str(target_dir))

data exists!


# Data Transformation Pipeline

- Transforming data (Clean)

In [99]:
import numpy as np
import pandas as pd
import datetime
import re

In [100]:
# Load the raw FIFA 23 player table from the project's raw-data folder.
raw_csv = "../data/raw/35/FIFA23_official_data.csv"
df = pd.read_csv(raw_csv)

In [101]:
# List the column labels of the raw frame.
df.columns

Index(['ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall',
       'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special',
       'Preferred Foot', 'International Reputation', 'Weak Foot',
       'Skill Moves', 'Work Rate', 'Body Type', 'Real Face', 'Position',
       'Joined', 'Loaned From', 'Contract Valid Until', 'Height', 'Weight',
       'Release Clause', 'Kit Number', 'Best Overall Rating'],
      dtype='object')

In [102]:
# Display the raw frame as loaded from the CSV.
df

Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,Real Face,Position,Joined,Loaned From,Contract Valid Until,Height,Weight,Release Clause,Kit Number,Best Overall Rating
0,209658,L. Goretzka,27,https://cdn.sofifa.net/players/209/658/23_60.png,Germany,https://cdn.sofifa.net/flags/de.png,87,88,FC Bayern München,https://cdn.sofifa.net/teams/21/30.png,...,Yes,"<span class=""pos pos28"">SUB","Jul 1, 2018",,2026,189cm,82kg,€157M,8.0,
1,212198,Bruno Fernandes,27,https://cdn.sofifa.net/players/212/198/23_60.png,Portugal,https://cdn.sofifa.net/flags/pt.png,86,87,Manchester United,https://cdn.sofifa.net/teams/11/30.png,...,Yes,"<span class=""pos pos15"">LCM","Jan 30, 2020",,2026,179cm,69kg,€155M,8.0,
2,224334,M. Acuña,30,https://cdn.sofifa.net/players/224/334/23_60.png,Argentina,https://cdn.sofifa.net/flags/ar.png,85,85,Sevilla FC,https://cdn.sofifa.net/teams/481/30.png,...,No,"<span class=""pos pos7"">LB","Sep 14, 2020",,2024,172cm,69kg,€97.7M,19.0,
3,192985,K. De Bruyne,31,https://cdn.sofifa.net/players/192/985/23_60.png,Belgium,https://cdn.sofifa.net/flags/be.png,91,91,Manchester City,https://cdn.sofifa.net/teams/10/30.png,...,Yes,"<span class=""pos pos13"">RCM","Aug 30, 2015",,2025,181cm,70kg,€198.9M,17.0,
4,224232,N. Barella,25,https://cdn.sofifa.net/players/224/232/23_60.png,Italy,https://cdn.sofifa.net/flags/it.png,86,89,Inter,https://cdn.sofifa.net/teams/44/30.png,...,Yes,"<span class=""pos pos13"">RCM","Sep 1, 2020",,2026,172cm,68kg,€154.4M,23.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17655,269526,Deng Xiongtao,19,https://cdn.sofifa.net/players/269/526/23_60.png,China PR,https://cdn.sofifa.net/flags/cn.png,48,61,Meizhou Hakka,https://cdn.sofifa.net/teams/114628/30.png,...,No,"<span class=""pos pos29"">RES","Apr 11, 2022",,2027,190cm,78kg,€218K,35.0,
17656,267946,22 Lim Jun Sub,17,https://cdn.sofifa.net/players/267/946/22_60.png,Korea Republic,https://cdn.sofifa.net/flags/kr.png,48,64,Jeju United FC,https://cdn.sofifa.net/teams/1478/30.png,...,No,"<span class=""pos pos29"">RES","Jan 1, 2022",,2026,195cm,84kg,€188K,21.0,
17657,270567,A. Demir,25,https://cdn.sofifa.net/players/270/567/23_60.png,Turkey,https://cdn.sofifa.net/flags/tr.png,51,56,Ümraniyespor,https://cdn.sofifa.net/teams/113796/30.png,...,No,"<span class=""pos pos29"">RES","Jun 6, 2021",,2023,190cm,82kg,€142K,12.0,
17658,256624,21 S. Czajor,18,https://cdn.sofifa.net/players/256/624/21_60.png,Poland,https://cdn.sofifa.net/flags/pl.png,50,65,Fleetwood Town,https://cdn.sofifa.net/teams/112260/30.png,...,No,"<span class=""pos pos29"">RES","Jan 1, 2020",,2021,187cm,79kg,€214K,40.0,


In [103]:
# Keep only the columns handled by the cleaning steps; .copy() makes df_fifa
# an independent frame so later assignments do not touch df.
selected_columns = [
    "Age",
    "Nationality",
    "Overall",
    "Potential",
    "Club",
    "Joined",
    "Contract Valid Until",
    "Height",
    "Weight",
    "Release Clause",
]
df_fifa = df[selected_columns].copy()

In [104]:
# Display the selected column subset.
df_fifa

Unnamed: 0,Age,Nationality,Overall,Potential,Club,Joined,Contract Valid Until,Height,Weight,Release Clause
0,27,Germany,87,88,FC Bayern München,"Jul 1, 2018",2026,189cm,82kg,€157M
1,27,Portugal,86,87,Manchester United,"Jan 30, 2020",2026,179cm,69kg,€155M
2,30,Argentina,85,85,Sevilla FC,"Sep 14, 2020",2024,172cm,69kg,€97.7M
3,31,Belgium,91,91,Manchester City,"Aug 30, 2015",2025,181cm,70kg,€198.9M
4,25,Italy,86,89,Inter,"Sep 1, 2020",2026,172cm,68kg,€154.4M
...,...,...,...,...,...,...,...,...,...,...
17655,19,China PR,48,61,Meizhou Hakka,"Apr 11, 2022",2027,190cm,78kg,€218K
17656,17,Korea Republic,48,64,Jeju United FC,"Jan 1, 2022",2026,195cm,84kg,€188K
17657,25,Turkey,51,56,Ümraniyespor,"Jun 6, 2021",2023,190cm,82kg,€142K
17658,18,Poland,50,65,Fleetwood Town,"Jan 1, 2020",2021,187cm,79kg,€214K


In [105]:
# Parse the "Joined" strings (e.g. "Jul 1, 2018") into datetime values.
joined_parsed = pd.to_datetime(df_fifa["Joined"])
df_fifa["Joined"] = joined_parsed

In [106]:
# Split the join date into calendar parts. Rows without a join date produce NaN,
# which is why both new columns come out as float.
joined_dt = df_fifa["Joined"].dt
df_fifa["month"] = joined_dt.month
df_fifa["year"] = joined_dt.year


In [107]:
# Display the frame with the parsed Joined dates and the month/year columns.
df_fifa

Unnamed: 0,Age,Nationality,Overall,Potential,Club,Joined,Contract Valid Until,Height,Weight,Release Clause,month,year
0,27,Germany,87,88,FC Bayern München,2018-07-01,2026,189cm,82kg,€157M,7.0,2018.0
1,27,Portugal,86,87,Manchester United,2020-01-30,2026,179cm,69kg,€155M,1.0,2020.0
2,30,Argentina,85,85,Sevilla FC,2020-09-14,2024,172cm,69kg,€97.7M,9.0,2020.0
3,31,Belgium,91,91,Manchester City,2015-08-30,2025,181cm,70kg,€198.9M,8.0,2015.0
4,25,Italy,86,89,Inter,2020-09-01,2026,172cm,68kg,€154.4M,9.0,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...
17655,19,China PR,48,61,Meizhou Hakka,2022-04-11,2027,190cm,78kg,€218K,4.0,2022.0
17656,17,Korea Republic,48,64,Jeju United FC,2022-01-01,2026,195cm,84kg,€188K,1.0,2022.0
17657,25,Turkey,51,56,Ümraniyespor,2021-06-06,2023,190cm,82kg,€142K,6.0,2021.0
17658,18,Poland,50,65,Fleetwood Town,2020-01-01,2021,187cm,79kg,€214K,1.0,2020.0


In [108]:
def replace_char(x):
    """Strip every character except digits and the decimal point.

    Parameters
    ----------
    x : str
        Raw text such as "189cm", "82kg" or "€97.7M".

    Returns
    -------
    str
        Only the digits and "." characters of ``x``, in their original order.
    """
    # Inside a character class "|" is a literal pipe, not alternation, so it is
    # left out of the set; otherwise pipe characters would be kept in the result.
    return re.sub(r"[^0-9.]", "", x)

In [109]:
# Print each column's dtype and non-null count.
df_fifa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17660 entries, 0 to 17659
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Age                   17660 non-null  int64         
 1   Nationality           17660 non-null  object        
 2   Overall               17660 non-null  int64         
 3   Potential             17660 non-null  int64         
 4   Club                  17449 non-null  object        
 5   Joined                16562 non-null  datetime64[ns]
 6   Contract Valid Until  17299 non-null  object        
 7   Height                17660 non-null  object        
 8   Weight                17660 non-null  object        
 9   Release Clause        16509 non-null  object        
 10  month                 16562 non-null  float64       
 11  year                  16562 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(3), object(6)
memory usage: 1.6+ M

In [110]:
# Reduce the textual measurements to their numeric characters (values stay strings).
# Bracket assignment is used for every column, and na_action="ignore" skips missing
# values, which replace_char (re.sub) cannot process.
# NOTE(review): for "Release Clause" this also drops the K/M magnitude suffix, so
# "€157M" and "€218K" become "157" and "218" — confirm whether the values need
# rescaling before they are compared or modelled.
for column in ("Height", "Weight", "Release Clause"):
    df_fifa[column] = df_fifa[column].map(replace_char, na_action="ignore")

In [111]:
# Display the frame after stripping non-numeric characters from Height, Weight and Release Clause.
df_fifa

Unnamed: 0,Age,Nationality,Overall,Potential,Club,Joined,Contract Valid Until,Height,Weight,Release Clause,month,year
0,27,Germany,87,88,FC Bayern München,2018-07-01,2026,189,82,157,7.0,2018.0
1,27,Portugal,86,87,Manchester United,2020-01-30,2026,179,69,155,1.0,2020.0
2,30,Argentina,85,85,Sevilla FC,2020-09-14,2024,172,69,97.7,9.0,2020.0
3,31,Belgium,91,91,Manchester City,2015-08-30,2025,181,70,198.9,8.0,2015.0
4,25,Italy,86,89,Inter,2020-09-01,2026,172,68,154.4,9.0,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...
17655,19,China PR,48,61,Meizhou Hakka,2022-04-11,2027,190,78,218,4.0,2022.0
17656,17,Korea Republic,48,64,Jeju United FC,2022-01-01,2026,195,84,188,1.0,2022.0
17657,25,Turkey,51,56,Ümraniyespor,2021-06-06,2023,190,82,142,6.0,2021.0
17658,18,Poland,50,65,Fleetwood Town,2020-01-01,2021,187,79,214,1.0,2020.0
