In [1]:
import os
from typing import Optional

import pandas as pd

def inspect_df(df: pd.DataFrame, name: Optional[str] = None, n: int = 20) -> None:
    """Print a quick exploratory report about ``df``.

    The report contains, in order: a title, the shape, ``df.info()``, the
    first rows, a seeded random sample, descriptive statistics, per-column
    unique counts, per-column NaN counts, the number of duplicated rows and
    the duplicated rows themselves. Only read-only DataFrame methods are
    called, so ``df`` is left untouched.

    Args:
        df: The DataFrame to inspect.
        name: Optional label (for example the source file name) shown in the
            report title. A falsy value produces a generic title.
        n: Maximum number of rows shown in the "First Rows" and "Random Rows"
            sections. It is capped at ``len(df)`` so frames with fewer than
            ``n`` rows can be inspected as well.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # ``display`` is not imported in this file; it is supplied by the
    # IPython/Jupyter session. Fall back to ``print`` when the name does not
    # exist (for example when the file is run as a plain Python script).
    try:
        show = display
    except NameError:
        show = print

    # Sampling more rows than the frame holds raises ValueError, so cap the
    # requested row count at the number of rows actually available.
    rows = min(n, len(df))

    # Build the title once so the underline has exactly the same width.
    title = f"=== DataFrame Inspection: {name} ===" if name else "=== DataFrame Inspection ==="
    print(f"\n{title}")
    print("=" * len(title))

    print("\n=== Dimension ===")
    print(df.shape)

    print("\n=== DF Info ===")
    df.info()  # writes its report straight to stdout

    print(f"\n=== {rows} First Rows ===")
    show(df.head(rows))  # rendered as an HTML table under Jupyter

    print(f"\n=== {rows} Random Rows ===")
    show(df.sample(rows, random_state=42))  # fixed seed keeps the sample stable across runs

    print("\n=== Descriptive Stats ===")
    if df.shape[1] == 0:
        # describe() refuses to work on a frame that has no columns.
        print("(no columns to describe)")
    else:
        show(df.describe(include="all").T)

    print("\n=== Unique Value ===")
    print(df.nunique())

    print("\n=== Number of NaN Values ===")
    print(df.isna().sum())

    # Compute the duplicate mask once and reuse it for the count and the rows.
    duplicate_mask = df.duplicated()

    print("\n=== Number of Duplicates Rows ===")
    print(duplicate_mask.sum())

    print("\n=== Duplicates Rows ===")
    show(df[duplicate_mask])

def main(path: str = "data/raw/animal_data_dirty_reworked.csv", sep: str = ";") -> None:
    """Load a delimited text file and print an inspection report for it.

    Args:
        path: Location of the file handed to ``pd.read_csv``. Defaults to the
            raw animal data set used by this notebook.
        sep: Field separator of the file. Defaults to ``";"``.
    """
    df = pd.read_csv(path, sep=sep)

    # Label the report with the bare file name rather than the whole path.
    filename = os.path.basename(path)
    inspect_df(df, name=filename)

# Run the inspection only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()




=== Dimension ===
(1011, 11)

=== DF Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1011 entries, 0 to 1010
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Animal_type       1011 non-null   object 
 1   Country           1011 non-null   object 
 2   Weight_kg         984 non-null    float64
 3   Body_Length_cm    984 non-null    float64
 4   Gender            1011 non-null   object 
 5   Animal_code       0 non-null      float64
 6   Latitude          913 non-null    float64
 7   Longitude         913 non-null    float64
 8   Animal_name       1011 non-null   object 
 9   Observation_date  1011 non-null   object 
 10  Data_compiled_by  1011 non-null   object 
dtypes: float64(5), object(6)
memory usage: 87.0+ KB

=== 20 First Rows ===


Unnamed: 0,Animal_type,Country,Weight_kg,Body_Length_cm,Gender,Animal_code,Latitude,Longitude,Animal_name,Observation_date,Data_compiled_by
0,Unknown,Unknown,,,Unknown,,,,Unknown,03.01.2024,James Johnson
1,Unknown,Unknown,,,Unknown,,,,Unknown,03.02.2024,James Johnson
2,European bison,Poland,930.0,335.0,male,,52.828845,23.820144,Szefu,01.03.2024,Anne Anthony
3,European bison,Poland,909.0,311.0,not determined,,52.830509,23.826849,Unknown,01.03.2024,Anne Anthony
4,European bisonâ„¢,Poland,581.0,277.0,female,,52.834109,23.807093,Unknown,01.03.2024,Anne Anthony
5,European bisson,Poland,900.0,295.0,male,,52.834759,23.817201,Unknown,01.03.2024,Anne Anthony
6,European buster,Poland,620.0,250.0,female,,52.83496,23.82075,Unknown,01.03.2024,Anne Anthony
7,lynx,Hungary,24.0,76.0,male,,48.505494,20.551966,Unknown,01.03.2024,Anne Anthony
8,lynx?,Hungary,23.0,82.0,female,,48.506529,20.554548,Unknown,01.03.2024,Anne Anthony
9,red squirel,Poland,0.308,22.0,female,,52.241273,21.055745,Basia,01.03.2024,Anne Anthony



=== 20 Random Rows ===


Unnamed: 0,Animal_type,Country,Weight_kg,Body_Length_cm,Gender,Animal_code,Latitude,Longitude,Animal_name,Observation_date,Data_compiled_by
630,Unknown,Austria,0.255,22.0,male,,,,Unknown,06.04.2024,Bob Bobson
633,hedgehog,Slovakia,0.8,17.0,male,,48.409645,18.872415,Unknown,06.04.2024,John Johnson
685,red squirrel,Poland,0.325,20.0,female,,50.041347,19.950773,Unknown,11.04.2024,Anne Anthony
516,red squirrel,Poland,0.323,19.0,female,,50.041804,19.951361,Unknown,01.04.2024,Anne Anthony
528,hedgehog,Germany,0.8,23.0,male,,48.161452,11.495239,Unknown,02.04.2024,Bob Bobson
687,red squirrel,Hungary,0.285,20.0,male,,47.509655,18.943576,Unknown,11.04.2024,Anne Anthony
819,hedgehog,Germany,0.5,20.0,male,,49.55324,11.074008,Unknown,28.04.2024,Bob Bobson
532,hedgehog,Germany,1.1,23.0,male,,49.559558,11.08115,Unknown,02.04.2024,Bob Bobson
323,hedgehog,Poland,0.6,15.0,female,,52.214745,21.033701,Unknown,17.03.2024,Anne Anthony
70,red squirrel,Germany,0.314,22.0,male,,,,Unknown,02.03.2024,Bob Bobson



=== Descriptive Stats ===


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Animal_type,1011.0,13.0,red squirrel,543.0,,,,,,,
Country,1011.0,15.0,Poland,291.0,,,,,,,
Weight_kg,984.0,,,,39.745503,156.290076,-0.252,0.293,0.3315,0.8,1100.0
Body_Length_cm,984.0,,,,39.107724,58.628601,-19.0,19.0,21.0,23.0,350.0
Gender,1011.0,4.0,male,499.0,,,,,,,
Animal_code,0.0,,,,,,,,,,
Latitude,913.0,,,,49.393369,7.1689,-78.582973,48.186913,49.560723,52.212433,52.853843
Longitude,913.0,,,,18.20328,3.899601,11.074008,14.384559,18.944015,21.033243,34.896734
Animal_name,1011.0,11.0,Unknown,959.0,,,,,,,
Observation_date,1011.0,114.0,01.03.2024,61.0,,,,,,,



=== Unique Value ===
Animal_type          13
Country              15
Weight_kg           195
Body_Length_cm      112
Gender                4
Animal_code           0
Latitude            735
Longitude           743
Animal_name          11
Observation_date    114
Data_compiled_by      4
dtype: int64

=== Number of NaN Values ===
Animal_type            0
Country                0
Weight_kg             27
Body_Length_cm        27
Gender                 0
Animal_code         1011
Latitude              98
Longitude             98
Animal_name            0
Observation_date       0
Data_compiled_by       0
dtype: int64

=== Number of Duplicates Rows ===
167

=== Duplicates Rows ===
       Animal_type Country  Weight_kg  Body_Length_cm   Gender  Animal_code  \
51     red squirel  Poland      0.308            22.0   female          NaN   
52    red squirrel      PL      0.348            20.0   female          NaN   
53    red squirrel  Poland      0.316            20.0     male          NaN   
54 