In [2]:
import pandas as pd

In [3]:
# Raw CSV exports; their row counts are compared with the preprocessed
# data further down to measure how many rows preprocessing removed.
df_cabride, df_weather = (
    pd.read_csv(csv_path)
    for csv_path in ('../Dataset/cab_rides.csv', '../Dataset/weather.csv')
)

# Preprocessed counterparts of the two datasets.
df_cab_rides_preprocessed, df_weather_preprocessed = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Dataset/cab_ride_preprocessed.csv',
        '../Dataset/weather_preprocessed.csv',
    )
)

In [4]:
def check_missing_values(df):
    """Summarise missing values per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df`` with two columns:
        ``'Missing count'`` (number of null cells in that column) and
        ``'Missing %'`` (that count as a percentage of ``len(df)``,
        rounded to two decimals).
    """
    null_counts = df.isna().sum()
    null_share = null_counts / len(df) * 100
    summary_columns = {
        'Missing count': null_counts,
        'Missing %': null_share.round(2),
    }
    return pd.DataFrame(summary_columns)

# Print a heading followed by the missing-value summary for each
# preprocessed dataset.
for report_title, report_df in (
    ("Missing values in the Cab rides dataset after preprocessing:",
     df_cab_rides_preprocessed),
    ("Missing values in the Weather dataset after preprocessing: ",
     df_weather_preprocessed),
):
    print(report_title)
    print(check_missing_values(report_df))

Missing values in the Cab rides dataset after preprocessing:
                  Missing count  Missing %
distance                      0        0.0
cab_type                      0        0.0
time_stamp                    0        0.0
destination                   0        0.0
source                        0        0.0
price                         0        0.0
surge_multiplier              0        0.0
id                            0        0.0
product_id                    0        0.0
name                          0        0.0
datetime                      0        0.0
date                          0        0.0
time                          0        0.0
category                      0        0.0
Missing values in the Weather dataset after preprocessing: 
           Missing count  Missing %
date_hour              0        0.0
location               0        0.0
temp                   0        0.0
clouds                 0        0.0
pressure               0        0.0
rain              

In [5]:
# Cab rides: row counts before/after preprocessing, plus the number of
# columns in the preprocessed frame.
raw_cab = df_cabride.shape[0]
preprocessed_cab, cols_cab = df_cab_rides_preprocessed.shape
removed_cab = raw_cab - preprocessed_cab
removed_cab_pct = round(removed_cab / raw_cab * 100, 1)

# Weather: same bookkeeping as for the cab rides.
raw_weather = df_weather.shape[0]
preprocessed_weather, cols_weather = df_weather_preprocessed.shape
removed_weather = raw_weather - preprocessed_weather
removed_weather_pct = round(removed_weather / raw_weather * 100, 1)

# One summary record per dataset, keyed by the dataset's display name.
remain_data = {
    'Cab rides': {
        'Raw': raw_cab,
        'Preprocessed': preprocessed_cab,
        'Removed': removed_cab,
        'Removed (%)': removed_cab_pct,
        'Columns': cols_cab,
    },
    'Weather': {
        'Raw': raw_weather,
        'Preprocessed': preprocessed_weather,
        'Removed': removed_weather,
        'Removed (%)': removed_weather_pct,
        'Columns': cols_weather,
    },
}

print(remain_data)

{'Cab rides': {'Raw': 693071, 'Preprocessed': 637976, 'Removed': 55095, 'Removed (%)': 7.9, 'Columns': 14}, 'Weather': {'Raw': 6276, 'Preprocessed': 3960, 'Removed': 2316, 'Removed (%)': 36.9, 'Columns': 8}}


In [7]:
def check_outlier_IQR(data, k=1.5):
    """Count the values of ``data`` that fall outside its IQR fences.

    The fences are ``Q1 - k * IQR`` (lower) and ``Q3 + k * IQR`` (upper),
    where ``Q1``/``Q3`` are the 25th/75th percentiles of ``data`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to inspect. NaN entries are counted on neither
        side, because comparisons against NaN are false.
    k : float, default 1.5
        Multiplier applied to the IQR when placing the fences. The
        default reproduces the previously hard-coded behaviour.

    Returns
    -------
    tuple[int, int]
        ``(below, above)``: how many values are strictly below the lower
        fence and strictly above the upper fence.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr

    # Vectorised comparisons replace a per-element Python loop; int() keeps
    # the return values plain Python ints rather than numpy integers.
    outliers_lower_bound = int((data < lower_bound).sum())
    outliers_upper_bound = int((data > upper_bound).sum())

    return outliers_lower_bound, outliers_upper_bound


# Features read from the cab-ride frame; every other feature in `features`
# is read from the weather frame.
cab_features = ('distance', 'price', 'surge_multiplier')
weather_features = ('temp', 'clouds', 'pressure', 'rain', 'humidity', 'wind')
features = [*cab_features, *weather_features]

# Report the IQR outlier counts for each feature in turn.
for feature in features:
    source_frame = (
        df_cab_rides_preprocessed if feature in cab_features
        else df_weather_preprocessed
    )
    n_below, n_above = check_outlier_IQR(source_frame[feature])
    print(
        f"Feature: {feature}",
        f"  - Outliers below lower bound: {n_below}",
        f"  - Outliers above upper bound: {n_above}",
        "-" * 40,
        sep="\n",
    )

Feature: distance
  - Outliers below lower bound: 0
  - Outliers above upper bound: 6672
----------------------------------------
Feature: price
  - Outliers below lower bound: 0
  - Outliers above upper bound: 5589
----------------------------------------
Feature: surge_multiplier
  - Outliers below lower bound: 0
  - Outliers above upper bound: 20975
----------------------------------------
Feature: temp
  - Outliers below lower bound: 106
  - Outliers above upper bound: 0
----------------------------------------
Feature: clouds
  - Outliers below lower bound: 0
  - Outliers above upper bound: 0
----------------------------------------
Feature: pressure
  - Outliers below lower bound: 0
  - Outliers above upper bound: 0
----------------------------------------
Feature: rain
  - Outliers below lower bound: 0
  - Outliers above upper bound: 575
----------------------------------------
Feature: humidity
  - Outliers below lower bound: 0
  - Outliers above upper bound: 0
----------------