In [2]:
# DATA FOR PUBLIC CASES
# the main libraries used in the project
import pandas as pd


# Read the public cases file; each record holds a nested "input" object and an
# "expected_output" value.
public_cases_path = "../data/public_cases.json"
publicData = pd.read_json(public_cases_path)

# Expand the nested "input" records into one column per field, then attach the
# expected output as the last column so inputs and target share one table.
df = pd.json_normalize(publicData["input"]).assign(
    expected_output=publicData["expected_output"]
)

# Last expression of the cell: preview the first rows.
df.head()


Unnamed: 0,trip_duration_days,miles_traveled,total_receipts_amount,expected_output
0,3,93.0,1.42,364.51
1,1,55.0,3.6,126.06
2,1,47.0,17.97,128.91
3,2,13.0,4.67,203.52
4,3,88.0,5.78,380.37


In [3]:
# info() prints the column dtypes and non-null counts; describe() is the last
# expression of the cell, so its summary-statistics table is what gets displayed.
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trip_duration_days     1000 non-null   int64  
 1   miles_traveled         1000 non-null   float64
 2   total_receipts_amount  1000 non-null   float64
 3   expected_output        1000 non-null   float64
dtypes: float64(3), int64(1)
memory usage: 31.4 KB


Unnamed: 0,trip_duration_days,miles_traveled,total_receipts_amount,expected_output
count,1000.0,1000.0,1000.0,1000.0
mean,7.043,597.41374,1211.05687,1349.11403
std,3.926139,351.29979,742.85418,470.316464
min,1.0,5.0,1.42,117.24
25%,4.0,275.96,530.3775,1019.2975
50%,7.0,621.0,1171.9,1454.26
75%,10.0,893.0,1881.105,1711.1225
max,14.0,1317.07,2503.46,2337.73


In [4]:

# Read the private cases file. Its records are already flat (one column per
# input field), so no json_normalize step is needed here.
private_cases_path = "../data/private_cases.json"
privateData = pd.read_json(private_cases_path)

# Working frame for the private cases; previewed as the cell's output.
df_private = pd.DataFrame(privateData)
df_private.head()



Unnamed: 0,trip_duration_days,miles_traveled,total_receipts_amount
0,3,191.0,21.87
1,1,81.0,3.93
2,3,204.0,16.76
3,1,64.0,22.47
4,3,204.0,22.19


In [22]:
# info() prints the column dtypes and non-null counts for the private cases;
# describe() is the last expression, so its statistics table is displayed.
df_private.info()
df_private.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trip_duration_days     5000 non-null   int64  
 1   miles_traveled         5000 non-null   float64
 2   total_receipts_amount  5000 non-null   float64
dtypes: float64(2), int64(1)
memory usage: 117.3 KB


Unnamed: 0,trip_duration_days,miles_traveled,total_receipts_amount
count,5000.0,5000.0,5000.0
mean,7.1998,589.076406,1186.129164
std,3.998034,346.67372,719.695557
min,1.0,5.0,0.27
25%,4.0,281.0,567.845
50%,7.0,588.0,1154.81
75%,11.0,886.0,1796.4875
max,14.0,1348.59,2502.71


In [7]:
def check_data_quality(df, name="Dataset", has_output=False):
    """Print a plain-text data-quality report for a cases table.

    The report lists, in order: missing values per column, the number of
    fully duplicated rows, column dtypes, ``describe()`` statistics, and a
    range check that flags any column containing values <= 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect. Must contain ``trip_duration_days``,
        ``miles_traveled`` and ``total_receipts_amount``; a missing column
        raises ``KeyError`` during the range check.
    name : str, default "Dataset"
        Label shown in the report header.
    has_output : bool, default False
        When True and the table has an ``expected_output`` column, that
        column is range-checked as well.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    rule = "-" * 60

    print(f"checking data quality for : {name}")
    print(rule)

    # 1. Count of missing cells per column.
    print("\n missing values:")
    print(df.isna().sum())

    # 2. Rows that are exact duplicates of an earlier row.
    duplicate_count = df.duplicated().sum()
    print(f"duplicates count: {duplicate_count}")

    # 3. Column dtypes.
    print("\n data types")
    print(df.dtypes)

    # 4. Descriptive statistics.
    print("\n statistic summary:")
    print(df.describe())

    # 5. Range validation: every checked column is expected to be strictly
    #    positive. The three input columns are always checked; the target
    #    column only when requested and actually present.
    print("\n range validation")
    columns_to_validate = [
        "trip_duration_days",
        "miles_traveled",
        "total_receipts_amount",
    ]
    if has_output and "expected_output" in df.columns:
        columns_to_validate.append("expected_output")

    for column in columns_to_validate:
        if df[column].le(0).any():
            verdict = "contains non positive values"
        else:
            verdict = "is valid"
        print(f"{column} {verdict}")

    print(rule)

In [36]:
# Run the same quality report on each dataset. Only the public cases carry an
# expected_output column, so has_output is enabled for them alone.
check_data_quality(df, "public_cases", has_output=True)
# The private report must be given df_private; passing df here would re-check
# the public frame under the "private_cases" label.
check_data_quality(df_private, "private_cases", has_output=False)

checking data quality for : public_cases
------------------------------------------------------------

 missing values:
trip_duration_days       0
miles_traveled           0
total_receipts_amount    0
expected_output          0
dtype: int64
duplicates count: 0

 data types
trip_duration_days         int64
miles_traveled           float64
total_receipts_amount    float64
expected_output          float64
dtype: object

 statistic summary:
       trip_duration_days  miles_traveled  total_receipts_amount  \
count         1000.000000      1000.00000             1000.00000   
mean             7.043000       597.41374             1211.05687   
std              3.926139       351.29979              742.85418   
min              1.000000         5.00000                1.42000   
25%              4.000000       275.96000              530.37750   
50%              7.000000       621.00000             1171.90000   
75%             10.000000       893.00000             1881.10500   
max            