# **Citi Bike Data Engineering - EDA - Missing Values Analysis**

#### Python Packages

In [25]:
import pandas as pd
import numpy as np
import csv
import sys
import os
import importlib
from itertools import combinations
from difflib import get_close_matches

#### Python Scripts

In [26]:
# Resolve the project root as the directory two levels above the current
# working directory, and make it importable if it is not already on sys.path.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)
if project_root not in sys.path:
    sys.path.append(project_root)

# This import relies on project_root being on sys.path, so it must stay
# below the path setup.
from scripts.create_markdown_table import create_markdown_table

#### Import DataFrames

In [27]:
# Load the two pickled dataframes from <project_root>/eda/dataframes.
df_dir = os.path.join(project_root, 'eda', 'dataframes')
newark_airport_df = pd.read_pickle(
    os.path.join(df_dir, 'newark_airport_df.pkl')
)
citibike_df = pd.read_pickle(
    os.path.join(df_dir, 'citibike_df.pkl')
)

# Fail fast if either frame came back empty; otherwise report success.
if any(frame.empty for frame in (newark_airport_df, citibike_df)):
    raise ValueError("One or both dataframes are empty. Please check the data loading process.")
print("Dataframes loaded successfully.")

Dataframes loaded successfully.


#### Identify and quantify the number of missing values in relevant columns

In [28]:
def missing_data_summary(df):
    """
    Summarise missing data per column and guess a likely missingness mechanism.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that contains at least one missing value, with the
        columns "Column Name", "Missing Count", "Missing Data Percentage"
        (0-100, rounded to two decimals) and "Likely Missing Data Type".
        Rows are sorted by "Column Name" in descending order. When no column
        has missing values, an empty frame with those same four columns is
        returned.

    Notes
    -----
    The "Likely Missing Data Type" label comes from simple heuristics
    (share of missing values and column dtype); it is a starting point for
    investigation, not a statistical test.
    """
    result_columns = [
        "Column Name",
        "Missing Count",
        "Missing Data Percentage",
        "Likely Missing Data Type",
    ]
    summary = []
    total_rows = len(df)

    for col in df.columns:
        missing_count = df[col].isna().sum()

        # Skip complete columns before dividing, so an empty frame
        # (total_rows == 0) never triggers a division by zero.
        if missing_count == 0:
            continue

        missing_pct = missing_count / total_rows
        dtype = df[col].dtype

        # np.issubdtype only understands NumPy dtypes and raises TypeError on
        # pandas extension dtypes (categorical, nullable integers, ...), so
        # those are routed through the pandas helper instead.
        if isinstance(dtype, np.dtype):
            is_numeric = np.issubdtype(dtype, np.number)
        else:
            is_numeric = pd.api.types.is_numeric_dtype(dtype)

        # Simple heuristic rules, checked in priority order
        if missing_pct == 1.0:
            missing_type = "Structurally Missing Data (always null)"
        elif dtype == "object" and df[col].nunique() == 1:
            missing_type = "Possibly Structural or Redundant"
        elif missing_pct < 0.05:
            missing_type = "Likely MCAR (Minimal Missingness)"
        elif is_numeric:
            missing_type = "Possible MNAR (e.g. sensitive numeric)"
        else:
            missing_type = "Possibly MAR or MNAR (needs domain check)"

        summary.append({
            "Column Name": col,
            "Missing Count": missing_count,
            "Missing Data Percentage": round(missing_pct * 100, 2),
            "Likely Missing Data Type": missing_type
        })

    # Passing the column list explicitly keeps "Column Name" present even when
    # `summary` is empty, so the sort below cannot raise KeyError.
    result_df = (
        pd.DataFrame(summary, columns=result_columns)
        .sort_values(by="Column Name", ascending=False)
        .reset_index(drop=True)
    )
    return result_df

# Generate and display the missing data summary for the newark_airport_df.
# The result is kept in a variable because a later cell merges it with the
# Citi Bike summary.
newark_airport_missing_summary = missing_data_summary(newark_airport_df)

print("Missing data summary for newark_airport_df:")
# Bare trailing expression: the notebook renders it as the cell's output table.
newark_airport_missing_summary

Missing data summary for newark_airport_df:


Unnamed: 0,Column Name,Missing Count,Missing Data Percentage,Likely Missing Data Type
0,WSF5,2,0.55,Likely MCAR (Minimal Missingness)
1,WDF5,2,0.55,Likely MCAR (Minimal Missingness)
2,TSUN,366,100.0,Structurally Missing Data (always null)
3,PGTM,366,100.0,Structurally Missing Data (always null)


In [None]:
# Generate and display the missing data summary for the citibike_df.
# The result is kept in a variable because a later cell merges it with the
# Newark Airport summary.
citibike_missing_summary = missing_data_summary(citibike_df)
print("Missing data summary for citibike_df:")
# Bare trailing expression: the notebook renders it as the cell's output table.
citibike_missing_summary

Missing data summary for citibike_df:


Unnamed: 0,Column Name,Missing Count,Missing Data Percentage,Likely Missing Data Type
0,User Type,380,0.15,Likely MCAR (Minimal Missingness)
1,Birth Year,18999,7.67,Possible MNAR (e.g. sensitive numeric)


#### Spot and confirm the missing-data patterns

In [30]:
# Stack both summaries into one frame. The concat keys label each row with
# its source dataset; resetting that index level turns the label into a
# regular column, which is then renamed to 'DataFrame'.
merged_missing_summary = (
    pd.concat(
        [newark_airport_missing_summary, citibike_missing_summary],
        keys=['Newark Airport', 'Citi Bike'],
    )
    .reset_index(level=0)
    .rename(columns={'level_0': 'DataFrame'})
)

# Hand the merged summary to the project helper to build the markdown table.
markdown_table = create_markdown_table(merged_missing_summary, "missing_data_summary")