# **Citi Bike Data Engineering - EDA - Duplicate Detection**

#### Python Packages

In [28]:
import pandas as pd
import numpy as np
import csv
import sys
import os
import importlib
from itertools import combinations
from difflib import get_close_matches

#### Python Scripts

In [29]:
# Resolve the project root as the directory two levels above the current
# working directory, then register it on sys.path (once) so that
# project-level packages can be imported from this notebook.
_two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
project_root = os.path.abspath(_two_levels_up)
if project_root not in sys.path:
    sys.path.append(project_root)

from scripts.create_markdown_table import create_markdown_table


#### Import DataFrames

In [30]:
# Both pickled dataframes are read from <project_root>/eda/dataframes.
df_dir = os.path.join(project_root, 'eda', 'dataframes')
newark_airport_path = os.path.join(df_dir, 'newark_airport_df.pkl')
citibike_path = os.path.join(df_dir, 'citibike_df.pkl')

newark_airport_df = pd.read_pickle(newark_airport_path)
citibike_df = pd.read_pickle(citibike_path)

# Fail fast if either dataframe came back without any data;
# otherwise report success.
if any(frame.empty for frame in (newark_airport_df, citibike_df)):
    raise ValueError("One or both dataframes are empty. Please check the data loading process.")
print("Dataframes loaded successfully.")


Dataframes loaded successfully.


#### Assess whether duplicate rows exist

In [36]:
# Count the rows flagged by DataFrame.duplicated() in each dataframe and
# report each count as soon as it is computed.
newark_airport_duplicates = newark_airport_df.duplicated().sum()
print(f"Number of duplicate rows in newark_airport_df: {newark_airport_duplicates}")

citibike_duplicates = citibike_df.duplicated().sum()
print(f"Number of duplicate rows in citibike_df: {citibike_duplicates}")

# Collect both counts in a two-column summary and pass it to the
# create_markdown_table helper under the name 'duplicate_counts_table'.
frame_labels = ['newark_airport_df', 'citibike_df']
frame_counts = [newark_airport_duplicates, citibike_duplicates]
duplicate_counts = {
    'DataFrame': frame_labels,
    'Duplicate Rows': frame_counts,
}
duplicate_counts_df = pd.DataFrame(data=duplicate_counts)
markdown_table = create_markdown_table(duplicate_counts_df, 'duplicate_counts_table')


Number of duplicate rows in newark_airport_df: 0
Number of duplicate rows in citibike_df: 0


#### Determine uniqueness of identifiers

In [32]:
# Empty accumulator for the newark_airport_df primary-key findings; rows are
# appended to it below and it later feeds the markdown table.
_pk_result_columns = ['Dataframe', 'Potential Primary Key(s)']
pk_results_newark_df = pd.DataFrame(columns=_pk_result_columns)

# Write a function that identifies potential primary keys in a dataframe
def find_primary_keys(df):
    """Identify a candidate primary key for ``df``.

    The search runs in two stages:

    1. Return the first single column whose values are all distinct and
       contain no missing values.
    2. Otherwise grow a composite key from the leading columns (the first
       two, then the first three, ...) and return the first prefix whose
       rows are all distinct.

    Only leading-column prefixes are tried in stage 2, not every possible
    combination of columns, so the returned composite key is not guaranteed
    to be minimal. Missing values are permitted inside a composite key.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to inspect.

    Returns
    -------
    list or None
        The column labels forming the candidate key, or ``None`` when no
        single column and no leading-column prefix identifies every row.
    """
    columns = df.columns.tolist()

    # Check 1: a single column that is unique and fully populated
    for col in columns:
        if df[col].is_unique and df[col].notna().all():
            return [col]

    # Check 2: composite keys built from growing prefixes of the column list
    for r in range(2, len(columns) + 1):
        cols = columns[:r]
        # Compare rows by their actual values rather than by a '-'-joined
        # string: joining makes distinct rows such as ('x-y', 'z') and
        # ('x', 'y-z') look identical and would wrongly reject a valid key.
        if not df.duplicated(subset=cols).any():
            return cols

    # Check 3: no unique column or prefix exists
    return None

# Find potential primary keys in newark_airport_df
newark_airport_primary_keys = find_primary_keys(newark_airport_df)
if newark_airport_primary_keys:
    print(f"Potential primary keys in newark_airport_df: {newark_airport_primary_keys}")
else:
    print("No potential primary keys found in newark_airport_df.")

# Add the results to the global dataframe.
# find_primary_keys returns None when nothing qualifies, and ', '.join(None)
# would raise TypeError, so record a placeholder in that case. Labels are
# passed through str() so non-string column labels can be joined as well.
newark_pk_label = (
    ', '.join(map(str, newark_airport_primary_keys))
    if newark_airport_primary_keys
    else 'None found'
)
pk_results_newark_df = pd.concat([pk_results_newark_df, pd.DataFrame({
    'Dataframe': ['newark_airport_df'],
    'Potential Primary Key(s)': [newark_pk_label]
})], ignore_index=True)

# Since column 'DATE' is identified as a potential primary key, we will drop it for the next check
sub_newark_airport_df = newark_airport_df.drop(columns=['DATE'])
test_df_primary_keys = find_primary_keys(sub_newark_airport_df)
if test_df_primary_keys:
    print(f"Potential primary keys in test_df: {test_df_primary_keys}")
else:
    print("No potential primary keys found in test_df.")

# Add the results to the global dataframe (same None guard as above)
newark_sub_pk_label = (
    ', '.join(map(str, test_df_primary_keys))
    if test_df_primary_keys
    else 'None found'
)
pk_results_newark_df = pd.concat([
    pk_results_newark_df,
    pd.DataFrame([{
        'Dataframe': 'newark_airport_df',
        'Potential Primary Key(s)': newark_sub_pk_label
    }])
], ignore_index=True)

Potential primary keys in newark_airport_df: ['DATE']


Potential primary keys in test_df: ['STATION', 'NAME', 'AWND', 'PGTM', 'PRCP', 'SNOW', 'SNWD', 'TAVG', 'TMAX', 'TMIN']


In [34]:
# Create a global dataframe to store the citibike_df results to be used in the markdown table
pk_results_citibike_df = pd.DataFrame(columns=['Dataframe', 'Potential Primary Key(s)'])

# Find potential primary keys in citibike_df
citibike_primary_keys = find_primary_keys(citibike_df)
if citibike_primary_keys:
    print(f"Potential primary keys in citibike_df: {citibike_primary_keys}")
else:
    print("No potential primary keys found in citibike_df.")

# Add the results to the global dataframe.
# find_primary_keys returns None when nothing qualifies, and ', '.join(None)
# would raise TypeError, so record a placeholder in that case. Labels are
# passed through str() so non-string column labels can be joined as well.
citibike_pk_label = (
    ', '.join(map(str, citibike_primary_keys))
    if citibike_primary_keys
    else 'None found'
)
pk_results_citibike_df = pd.concat([pk_results_citibike_df, pd.DataFrame({
    'Dataframe': ['citibike_df'],
    'Potential Primary Key(s)': [citibike_pk_label]
})], ignore_index=True)

# Since column 'index' is identified as a potential primary key, we will drop it for the next check
sub_citibike_df = citibike_df.drop(columns=['index'])
test_df_primary_keys = find_primary_keys(sub_citibike_df)
if test_df_primary_keys:
    print(f"Potential primary keys in test_df: {test_df_primary_keys}")
else:
    print("No potential primary keys found in test_df.")

# Add the results to the global dataframe (same None guard as above)
citibike_sub_pk_label = (
    ', '.join(map(str, test_df_primary_keys))
    if test_df_primary_keys
    else 'None found'
)
pk_results_citibike_df = pd.concat([
    pk_results_citibike_df,
    pd.DataFrame([{
        'Dataframe': 'citibike_df',
        'Potential Primary Key(s)': citibike_sub_pk_label
    }])
], ignore_index=True)


Potential primary keys in citibike_df: ['index']
Potential primary keys in test_df: ['Trip Duration', 'Start Time', 'Stop Time', 'Start Station ID', 'Start Station Name', 'Start Station Latitude', 'Start Station Longitude', 'End Station ID', 'End Station Name', 'End Station Latitude', 'End Station Longitude', 'Bike ID']


In [None]:
# Stack the Newark and Citi Bike findings into one dataframe with a fresh
# 0..n-1 index.
pk_frames = [pk_results_newark_df, pk_results_citibike_df]
pk_results_df = pd.concat(pk_frames, ignore_index=True)

# Pass the combined results to the create_markdown_table helper under the
# name 'potential_primary_keys'.
markdown_table = create_markdown_table(pk_results_df, 'potential_primary_keys')