In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# ----load data-----
def load_data(filepath: str, **read_csv_kwargs) -> pd.DataFrame:
    """Read a CSV file into a DataFrame and print its dimensions.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.
    **read_csv_kwargs
        Optional keyword arguments forwarded unchanged to ``pd.read_csv``
        (for example ``dtype=...`` or ``low_memory=False``). When none are
        given the call is exactly ``pd.read_csv(filepath)``.

    Returns
    -------
    pd.DataFrame
        The parsed file contents.

    Raises
    ------
    FileNotFoundError
        Re-raised, after printing a hint, when ``filepath`` does not exist.
    """
    # Only the read itself is guarded, so the handler cannot mask errors
    # raised by later statements.
    try:
        df = pd.read_csv(filepath, **read_csv_kwargs)
    except FileNotFoundError:
        print('File Not Found! Please check filepath and try again!')
        raise
    print(f"Data loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns.")
    return df


In [3]:
# ----------dataset overview-----
def dataset_overview(df: pd.DataFrame):
    """Print the frame's dimensions and ``df.info()``, then return its summary.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to describe.

    Returns
    -------
    pd.DataFrame
        The result of ``df.describe(include='all')``.
    """
    n_rows, n_cols = df.shape
    print('Number of observations : ', n_rows)
    print('Number of features : ', n_cols)
    print('--Data Info--')
    # info() prints its report itself; it has no useful return value.
    df.info()
    summary = df.describe(include='all')
    return summary

In [4]:
# -------duplicate data---------
def duplicates(df: pd.DataFrame):
    """Print how many rows are flagged by ``df.duplicated()`` and return them.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to scan.

    Returns
    -------
    pd.DataFrame
        The rows for which ``df.duplicated()`` is True (empty if there are none).
    """
    dup_rows = df.loc[df.duplicated()]
    n_dups = len(dup_rows)
    print('Number of duplicated rows : ', n_dups)
    if not n_dups:
        print('No duplicates found')
    return dup_rows

In [5]:
# -----missing data---------
def missing_data(df: pd.DataFrame):
    """Tabulate null counts and null percentages per column.

    Prints a header, shows the ten rows with the highest missing percentage
    via ``display`` (the name must already be in scope, as it is in an
    IPython/Jupyter session), and returns the full table.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        Indexed by column name, with columns ``'Missing Values'`` and
        ``'Missing Pct'`` (rounded to 2 decimals), sorted by ``'Missing Pct'``
        in descending order.
    """
    null_counts = df.isna().sum()
    null_share = null_counts / len(df) * 100
    report = pd.DataFrame({
        'Missing Values': null_counts,
        'Missing Pct': null_share.round(2),
    })
    report = report.sort_values(by='Missing Pct', ascending=False)
    print('---------Missing Data----------\n')
    display(report.head(10))
    return report

In [6]:
# ---column summaries--------
def column_summaries(df: pd.DataFrame):
    """Print a one-line summary per column and return the column groupings.

    Numeric columns (``np.number`` dtypes) are listed with their min and max;
    all remaining columns are listed with their unique values and the count
    of distinct values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise.

    Returns
    -------
    tuple
        ``(numeric_cols, categorical_cols)`` as returned by the ``.columns``
        attribute of the two ``select_dtypes`` selections.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns

    for position, name in enumerate(numeric_cols, start=1):
        values = df[name]
        low, high = values.min(), values.max()
        print(f'{position:<2}. {name:<17} -Min : {low:<4} -Max : {high}')

    for position, name in enumerate(categorical_cols, start=1):
        values = df[name]
        print(f'{position}. {name} -{values.unique()} - {values.nunique()}')

    return numeric_cols, categorical_cols

In [7]:
# outlier detection using IQR
def check_outliers(df: pd.DataFrame, col: str):
    """Select rows whose ``col`` value lies outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the column.
    col : str
        Name of the column to test.

    Returns
    -------
    tuple
        ``(outlier_rows, upper_bound, lower_bound)``. Note the order: the
        upper fence comes before the lower fence.
    """
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    fence_low = q1 - 1.5 * spread
    fence_high = q3 + 1.5 * spread
    # Strict comparisons: values equal to a fence are not outliers, and NaN
    # compares False on both sides so it is never selected.
    is_outlier = (values < fence_low) | (values > fence_high)
    return df[is_outlier], fence_high, fence_low

def outlier_summary(df: pd.DataFrame, numeric_cols):
    """Print the IQR outlier count and fence range for each listed column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns.
    numeric_cols : iterable
        Column names to check; each is passed to ``check_outliers``.

    Returns
    -------
    None
        Output is printed, one line per column, as
        ``(Range: <lower> - <upper>)``.
    """
    print("\n Outlier Summary (IQR Method):")
    for i, col in enumerate(numeric_cols, 1):
        # check_outliers returns (rows, upper_bound, lower_bound) -- upper
        # first. Unpacking in that order keeps the printed range as
        # "lower - upper" instead of reversed.
        outlier, upper, lower = check_outliers(df, col)
        print(f"{i:<2}. {col:<20} - {len(outlier)} outliers (Range: {lower:.2f} - {upper:.2f})")

In [8]:
def run_basic_eda(filepath: str):
    """Run every basic EDA step on the CSV at ``filepath`` and return the frame.

    Steps, in order: load, dataset overview, duplicate check, missing-value
    table, per-column summaries, IQR outlier summary. Each step prints its
    own report; only the numeric column list from ``column_summaries`` is
    carried forward, the other return values are discarded.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to analyse.

    Returns
    -------
    pd.DataFrame
        The frame returned by ``load_data``.
    """
    frame = load_data(filepath)

    # Whole-frame reports, run in this order for a stable output layout.
    for report in (dataset_overview, duplicates, missing_data):
        report(frame)

    numeric_cols, _ = column_summaries(frame)
    outlier_summary(frame, numeric_cols)
    return frame

In [9]:
# Run the full EDA pass on the FIFA 21 raw CSV and keep the loaded frame in `df`.
if __name__ == '__main__':
    df = run_basic_eda(filepath="fifa21 raw data v2.csv")

  df = pd.read_csv(filepath)


Data loaded successfully: 18979 rows, 77 columns.
Number of observations :  18979
Number of features :  77
--Data Info--
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18979 entries, 0 to 18978
Data columns (total 77 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                18979 non-null  int64 
 1   Name              18979 non-null  object
 2   LongName          18979 non-null  object
 3   photoUrl          18979 non-null  object
 4   playerUrl         18979 non-null  object
 5   Nationality       18979 non-null  object
 6   Age               18979 non-null  int64 
 7   ↓OVA              18979 non-null  int64 
 8   POT               18979 non-null  int64 
 9   Club              18979 non-null  object
 10  Contract          18979 non-null  object
 11  Positions         18979 non-null  object
 12  Height            18979 non-null  object
 13  Weight            18979 non-null  object
 14  Preferred Foot    18979 non-n

Unnamed: 0,Missing Values,Missing Pct
Loan Date End,17966,94.66
Hits,2595,13.67
LongName,0,0.0
Name,0,0.0
ID,0,0.0
Nationality,0,0.0
Age,0,0.0
↓OVA,0,0.0
POT,0,0.0
Club,0,0.0


1 . ID                -Min : 41   -Max : 259216
2 . Age               -Min : 16   -Max : 53
3 . ↓OVA              -Min : 47   -Max : 93
4 . POT               -Min : 47   -Max : 95
5 . BOV               -Min : 48   -Max : 93
6 . Attacking         -Min : 42   -Max : 437
7 . Crossing          -Min : 6    -Max : 94
8 . Finishing         -Min : 3    -Max : 95
9 . Heading Accuracy  -Min : 5    -Max : 93
10. Short Passing     -Min : 7    -Max : 94
11. Volleys           -Min : 3    -Max : 90
12. Skill             -Min : 40   -Max : 470
13. Dribbling         -Min : 5    -Max : 96
14. Curve             -Min : 4    -Max : 94
15. FK Accuracy       -Min : 5    -Max : 94
16. Long Passing      -Min : 5    -Max : 93
17. Ball Control      -Min : 5    -Max : 96
18. Movement          -Min : 122  -Max : 464
19. Acceleration      -Min : 13   -Max : 97
20. Sprint Speed      -Min : 12   -Max : 96
21. Agility           -Min : 14   -Max : 96
22. Reactions         -Min : 24   -Max : 95
23. Balance           -Mi