In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s : %(message)s')

In [3]:
# ----load data-----
def load_data(filepath: str):    
    try:
        df = pd.read_csv(filepath)
        print(f"Data loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns.")
        return df
    except FileNotFoundError:
        print('File Not Found! Please check filepath and try again!')
        raise


In [4]:
# ----------dataset overview-----
def dataset_overview(df: pd.DataFrame):
    logging.info(f'Number of observations : {df.shape[0]}')
    logging.info(f'Number of features : {df.shape[1]}')
    overview = pd.DataFrame({
        "Dtype": df.dtypes,
        "Non-Null Count": df.count(),
        "Null Count": df.isnull().sum(),
        "Unique Values": df.nunique()
    })
    display(overview.head(10))
    return df.describe(include='all')

In [5]:
# -------duplicate data---------
def duplicates(df: pd.DataFrame):
    duplicates = df[df.duplicated()]
    logging.info(f'Number of duplicated rows : {len(duplicates)}')
    if len(duplicates) == 0:
        logging.info(f'No duplicates found')
    return duplicates

In [6]:
# -----missing data---------
def missing_data(df: pd.DataFrame):
    missing_values = df.isnull().sum()
    missing_pct = (missing_values / len(df)) * 100
    missing_data = pd.DataFrame({
        'Missing Values' : missing_values,
        'Missing Pct' : missing_pct.round(2)
    }).sort_values(by='Missing Pct',ascending=False)
    logging.info(f'---------Missing Data(Top 10)----------\n')
    display(missing_data.head(10))
    return missing_data

In [7]:
# ---column summaries--------
def column_summaries(df: pd.DataFrame):
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for i,col in enumerate(numeric_cols,1):
        logging.info(f'{i:<2}. {col:<17} -Min : {df[col].min():<4} -Max : {df[col].max()}')

    categorical_cols = df.select_dtypes(exclude=[np.number]).columns
    for i,col in enumerate(categorical_cols,1):
        uniques = df[col].unique()
        logging.info(f'{i}. {col} | Unique : {df[col].nunique()} | Examples : {uniques[:5]}')
    return numeric_cols, categorical_cols

In [8]:
# outlier detection using IQR
def check_outliers(df: pd.DataFrame, col: str):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outlier = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    return outlier, upper_bound, lower_bound

def outlier_summary(df: pd.DataFrame, numeric_cols):
    results = []
    logging.info(f"\n Outlier Summary (IQR Method):")
    for i, col in enumerate(numeric_cols, 1):
        outlier, lower, upper = check_outliers(df, col)
        results.append({
            'column' : col,
            'Outlier_count' : len(outlier),
            'lower_bound' : lower,
            'upper_bound' : upper
        })
    summary_df = pd.DataFrame(results)
    display(summary_df)
    return summary_df
        

In [9]:
# ---- Save Reports ----
import os
def save_summary(df: pd.DataFrame, name: str):
    os.makedirs("eda_reports", exist_ok=True)
    path = f"eda_reports/{name}.csv"
    df.to_csv(path, index=False)
    logging.info(f"Saved report: {path}")

In [10]:
def run_basic_eda(filepath: str):
    df = load_data(filepath)

    # ---- Basic cleaning ----
    if 'Hits' in df.columns:
        df['Hits'] = df['Hits'].astype(str).str.extract(r'(\d+)').astype(float)
        logging.info("Column 'Hits' cleaned and converted to numeric.")

    overview = dataset_overview(df)
    duplicate = duplicates(df)
    missing = missing_data(df)
    numeric_cols, category_cols = column_summaries(df)
    outlier_df = outlier_summary(df, numeric_cols)

    save_summary(missing, "missing_data")
    save_summary(outlier_df, "outlier_summary")

    return {
        'data': df,
        'overview': overview,
        'duplicate': duplicate,
        'missing': missing,
        'outliers': outlier_df,
        'numeric_cols': numeric_cols,
        'category_cols': category_cols
    }


In [11]:
if __name__ == '__main__':
    df = run_basic_eda("fifa21 raw data v2.csv")

  df = pd.read_csv(filepath)
INFO : Column 'Hits' cleaned and converted to numeric.
INFO : Number of observations : 18979
INFO : Number of features : 77


Data loaded successfully: 18979 rows, 77 columns.


Unnamed: 0,Dtype,Non-Null Count,Null Count,Unique Values
ID,int64,18979,0,18979
Name,object,18979,0,17920
LongName,object,18979,0,18852
photoUrl,object,18979,0,18979
playerUrl,object,18979,0,18979
Nationality,object,18979,0,164
Age,int64,18979,0,29
↓OVA,int64,18979,0,47
POT,int64,18979,0,48
Club,object,18979,0,682


INFO : Number of duplicated rows : 0
INFO : No duplicates found
INFO : ---------Missing Data(Top 10)----------



Unnamed: 0,Missing Values,Missing Pct
Loan Date End,17966,94.66
Hits,2595,13.67
LongName,0,0.0
Name,0,0.0
ID,0,0.0
Nationality,0,0.0
Age,0,0.0
↓OVA,0,0.0
POT,0,0.0
Club,0,0.0


INFO : 1 . ID                -Min : 41   -Max : 259216
INFO : 2 . Age               -Min : 16   -Max : 53
INFO : 3 . ↓OVA              -Min : 47   -Max : 93
INFO : 4 . POT               -Min : 47   -Max : 95
INFO : 5 . BOV               -Min : 48   -Max : 93
INFO : 6 . Attacking         -Min : 42   -Max : 437
INFO : 7 . Crossing          -Min : 6    -Max : 94
INFO : 8 . Finishing         -Min : 3    -Max : 95
INFO : 9 . Heading Accuracy  -Min : 5    -Max : 93
INFO : 10. Short Passing     -Min : 7    -Max : 94
INFO : 11. Volleys           -Min : 3    -Max : 90
INFO : 12. Skill             -Min : 40   -Max : 470
INFO : 13. Dribbling         -Min : 5    -Max : 96
INFO : 14. Curve             -Min : 4    -Max : 94
INFO : 15. FK Accuracy       -Min : 5    -Max : 94
INFO : 16. Long Passing      -Min : 5    -Max : 93
INFO : 17. Ball Control      -Min : 5    -Max : 96
INFO : 18. Movement          -Min : 122  -Max : 464
INFO : 19. Acceleration      -Min : 13   -Max : 97
INFO : 20. Sprint Speed 

Unnamed: 0,column,Outlier_count,lower_bound,upper_bound
0,ID,250,302103.75,154953.75
1,Age,8,41.0,9.0
2,↓OVA,156,83.5,47.5
3,POT,153,87.0,55.0
4,BOV,137,84.5,48.5
5,Attacking,2016,409.5,109.5
6,Crossing,0,100.5,0.5
7,Finishing,0,110.0,-18.0
8,Heading Accuracy,1108,94.0,14.0
9,Short Passing,1748,89.0,33.0


INFO : Saved report: eda_reports/missing_data.csv
INFO : Saved report: eda_reports/outlier_summary.csv


In [12]:
# Temporary compatibility fix for NumPy >= 2.0
import builtins
if not hasattr(np, "VisibleDeprecationWarning"):
    np.VisibleDeprecationWarning = builtins.DeprecationWarning

import sweetviz as sv

from pkg_resources import resource_filename
# create report
data = df['data']
report = sv.analyze(data)

# generate and show report
report.show_html("fifa21_sweetviz_report.html")

logging.info("Sweetviz report saved as eda_reports/fifa21_sweetviz_report.html")


  from .autonotebook import tqdm as notebook_tqdm
  from pkg_resources import resource_filename
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
  figure.savefig(as_raw_bytes, format='png', transparent=True)
Done! Use 'show' commands to display/save.   |██████████| [100%]   00:18 -> (00:00 left)


Report fifa21_sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


INFO : Sweetviz report saved as eda_reports/fifa21_sweetviz_report.html
