# **Citi Bike Data Engineering  - EDA - Inspecting Data Structures** 

#### Python Packages

In [53]:
import pandas as pd
import numpy as np
import csv
import sys
import os
import importlib
from difflib import get_close_matches

#### Python Scripts

In [54]:
# Dynamically add the project root to sys.path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from scripts.create_markdown_table import create_markdown_table


#### Load CSV Files into DataFrames

In [55]:
# Create directory to the CSV files
dir_path = '../../data-sources/data/'

# Create a function to read CSV files and return a DataFrame
def read_csv_file(file_name):
    file_path = dir_path + file_name
    try:
        df = pd.read_csv(file_path, index_col=0)
        return df
    except FileNotFoundError:
        print(f"File {file_name} not found in directory {dir_path}.")
        return None

# Assign the CSV files to variables
newark_airport_df = read_csv_file('newark_airport_2016.csv')
citibike_201601 = read_csv_file('JC-201601-citibike-tripdata.csv')
citibike_201602 = read_csv_file('JC-201602-citibike-tripdata.csv')
citibike_201603 = read_csv_file('JC-201603-citibike-tripdata.csv')
citibike_201604 = read_csv_file('JC-201604-citibike-tripdata.csv')
citibike_201605 = read_csv_file('JC-201605-citibike-tripdata.csv')
citibike_201606 = read_csv_file('JC-201606-citibike-tripdata.csv')
citibike_201607 = read_csv_file('JC-201607-citibike-tripdata.csv')
citibike_201608 = read_csv_file('JC-201608-citibike-tripdata.csv')
citibike_201609 = read_csv_file('JC-201609-citibike-tripdata.csv')
citibike_201610 = read_csv_file('JC-201610-citibike-tripdata.csv')
citibike_201611 = read_csv_file('JC-201611-citibike-tripdata.csv')
citibike_201612 = read_csv_file('JC-201612-citibike-tripdata.csv')


In [56]:
# Print the first few rows of the newark_airport DataFrame
print("First few rows of newark_airport_df:")
print(newark_airport_df.head())

# Assess the shape of new_airport DataFrame
print(f"Shape of newark_airport_df: {newark_airport_df.shape}")

First few rows of newark_airport_df:
                                                    NAME        DATE   AWND  \
STATION                                                                       
USW00014734  NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US  2016-01-01  12.75   
USW00014734  NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US  2016-01-02   9.40   
USW00014734  NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US  2016-01-03  10.29   
USW00014734  NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US  2016-01-04  17.22   
USW00014734  NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US  2016-01-05   9.84   

             PGTM  PRCP  SNOW  SNWD  TAVG  TMAX  TMIN  TSUN  WDF2   WDF5  \
STATION                                                                    
USW00014734   NaN   0.0   0.0   0.0    41    43    34   NaN   270  280.0   
USW00014734   NaN   0.0   0.0   0.0    36    42    30   NaN   260  260.0   
USW00014734   NaN   0.0   0.0   0.0    37    47    28   NaN   270  250.0   
USW00014734   NaN   0.0   0.0

In [57]:
'''
Due to the large size of each CSV file under the 'citibike...csv' naming convention, this following code
will evaluate the shape of each DataFrame to assess whether them would present any performance or structural issues.
'''
# List all the citibike DataFrames for easier management
citibike_dfs = [
    citibike_201601, citibike_201602, citibike_201603, citibike_201604,
    citibike_201605, citibike_201606, citibike_201607, citibike_201608,
    citibike_201609, citibike_201610, citibike_201611, citibike_201612
]
# Print and assess the number of rows
row_count = 0
for i, df in enumerate(citibike_dfs, start=1):
    if df is not None:
        row_count += df.shape[0]
        print(f"Shape of citibike_20160{i}: {df.shape}")
    else:
        print(f"citibike_20160{i} DataFrame is None.")
print(f"Total rows across all citibike DataFrames: {row_count}")


Shape of citibike_201601: (7479, 14)
Shape of citibike_201602: (8250, 14)
Shape of citibike_201603: (13511, 14)
Shape of citibike_201604: (16342, 14)
Shape of citibike_201605: (19488, 14)
Shape of citibike_201606: (23947, 14)
Shape of citibike_201607: (24436, 14)
Shape of citibike_201608: (34149, 14)
Shape of citibike_201609: (33425, 14)
Shape of citibike_2016010: (29611, 14)
Shape of citibike_2016011: (21832, 14)
Shape of citibike_2016012: (15114, 14)
Total rows across all citibike DataFrames: 247584


In [58]:
# Assumption: Each citibike DataFrame has a consistent column structure and can be stacked without issues.
# Create a function that confirms whether the Assumption is valid
def check_column_structure_consistency(dfs):
    # If the list of DataFrames is empty, return a message
    if not dfs:
        return "No DataFrames provided."
    # Create a list of column names and data types for the first DataFrame
    init_columns = dfs[0].columns.tolist()
    init_dtypes = dfs[0].dtypes.tolist()
    # If init_columns and init_ are None, return an message
    if init_columns is None or init_dtypes is None:
        return "Initial DataFrame has no columns or data types."
    # Write a for loop that compares init_columns and init_dtypes with the rest of the DataFrames
    for i, df in enumerate(dfs, start=1):
        count = 0
        comp_columns = df.columns.tolist()
        comp_dtypes = df.dtypes.tolist()
        if comp_columns != init_columns or comp_dtypes != init_dtypes:
            return f"DataFrame citibike_20160{i} has a different structure compared to DataFrame citibike_201601."
            count += 1
    if count == 0:
        return "All DataFrames have a consistent column structure and data types."


check_column_structure_consistency(citibike_dfs)

'All DataFrames have a consistent column structure and data types.'

In [59]:
# Now that we have confirmed the structure consistency, we can concatenate the DataFrames
cleaned_dfs = []
for df in citibike_dfs:
    # Reset existing index to avoid 'level_0' or other named index issues
    df = df.reset_index(drop=False)

    # Ensure 'Trip Duration' is not being used as the index
    if 'Trip Duration' not in df.columns:
        raise ValueError("'Trip Duration' column not found after reset. Check source data.")

    cleaned_dfs.append(df)

# Concatenate all the citibike DataFrames into a single DataFrame
citibike_df = pd.concat(cleaned_dfs, ignore_index=True)

# Assign a new index to the concatenated DataFrame
citibike_df.insert(0, "index", range(len(citibike_df)))

# Print the shape of the concatenated DataFrame
print(f"Shape of concatenated citibike DataFrame: {citibike_df.shape}")

# Check the shape of the concatenated DataFrame matches the expected structure
# Minus one column for the original index
if citibike_df.shape[0] == row_count and citibike_df.shape[1] - 1 == len(citibike_201601.columns):
    print("All DataFrames have been concatenated successfully.")
else:
    print("There is a mismatch in the number of rows or columns after concatenation.")

# Print the first few rows of the concatenated DataFrame
print("First few rows of concatenated citibike_df:")
print(citibike_df.head())

Shape of concatenated citibike DataFrame: (247584, 16)
There is a mismatch in the number of rows or columns after concatenation.
First few rows of concatenated citibike_df:
   index  Trip Duration           Start Time            Stop Time  \
0      0            362  2016-01-01 00:02:52  2016-01-01 00:08:54   
1      1            200  2016-01-01 00:18:22  2016-01-01 00:21:42   
2      2            202  2016-01-01 00:18:25  2016-01-01 00:21:47   
3      3            248  2016-01-01 00:23:13  2016-01-01 00:27:21   
4      4            903  2016-01-01 01:03:20  2016-01-01 01:18:24   

   Start Station ID Start Station Name  Start Station Latitude  \
0              3186      Grove St PATH               40.719586   
1              3186      Grove St PATH               40.719586   
2              3186      Grove St PATH               40.719586   
3              3209       Brunswick St               40.724176   
4              3195            Sip Ave               40.730743   

   Start Statio

#### Evaluate consistency with provided data dictionaries

In [60]:
# Create a dictionary of the column descriptions for the Newark Airport dataset, referring to weather.pdf
# Assumption: All values are in American units, no description that confirms within data dictionary
newark_column_descriptions = {
    'STATION': ' The station identification code',
    'NAME': 'Name of the station (Newark Airport)',
    'DATE': 'Date of the observation (YYYY-MM-DD)',
    'AWND': 'Average wind speed (miles per hour)',
    'PGTM': 'Peak gust time (HHMM)',
    'PRCP': 'Precipitation (inches)',
    'SNOW': 'Snowfall (inches)',
    'SNWD': 'Snow depth (inches)',
    'TAVG': 'Average temperature (degrees Fahrenheit)',
    'TMAX': 'Maximum temperature (degrees Fahrenheit)',
    'TMIN': 'Minimum temperature (degrees Fahrenheit)',
    'TSUN': 'Total sunshine (minutes)',
    'WDF2': 'Direction of fastest 2-minute wind (degrees)',
    'WDF5': 'Direction of fastest 5-second wind (degrees)',
    'WSF2': 'Fastest 2-minute wind speed (miles per hour)',
    'WSF5': 'Fastest 5-second wind speed (miles per hour)'
}

In [61]:
# Create a dictionary of the column descriptions for the Newark Airport dataset, referring to weather.pdf
# Assumption: All values are in American units, no description that confirms within data dictionary
citibike_column_descriptions = {
    'index': 'Original index of the DataFrame',
    'Trip Duration': 'Duration of the trip in seconds',
    'Start Time': 'Start time of the trip (YYYY-MM-DD HH:MM:SS)',
    'Stop Time': 'End time of the trip (YYYY-MM-DD HH:MM:SS)',
    'Start Station ID': 'Unique identifier for the start station',
    'Start Station Name': 'Name of the start station',
    'Start Station Latitude': 'Latitude of the start station',
    'Start Station Longitude': 'Longitude of the start station',
    'End Station ID': 'Unique identifier for the end station',
    'End Station Name': 'Name of the end station',
    'End Station Latitude': 'Latitude of the end station',
    'End Station Longitude': 'Longitude of the end station',
    'Bike ID': 'Unique identifier for the bike used in the trip',
    'User Type': 'Type of user (Customer = 24-hour pass or 3-day pass user; Subscriber = Annual Member)',
    'Birth Year': 'Year of birth of the user',
    'Gender': 'Gender of the user (0: Unknown, 1: Male, 2: Female)'
}

#### Create lists of column names, data types and inferred domains

##### Newark Airport DataFrame

In [62]:
'''
Create a Dataframe that provides the following information for newark_airport_df:
    - Column Name
    - Data Type
    - Column Description
    - Value Examples
    - Value Distinct Count
'''

# Create a summary DataFrame for the Newark Airport dataset
# Add the column descriptions to the summary DataFrame
summary = []
newark_airport_df = newark_airport_df.reset_index()
for col in newark_airport_df.columns:
    summary.append({
        'Column Name': col,
        'Description': newark_column_descriptions.get(col, ''),
        'Data Type': newark_airport_df[col].dtype,
        'Value Examples': newark_airport_df[col].dropna().unique()[:3],
        'Value Distinct Count': newark_airport_df[col].nunique()
    })
newark_airport_shape = pd.DataFrame(summary)

# Print the newark_airport_shape DataFrame
print("Newark Airport DataFrame Information:")
print(newark_airport_shape)

Newark Airport DataFrame Information:
   Column Name                                   Description Data Type  \
0      STATION               The station identification code    object   
1         NAME          Name of the station (Newark Airport)    object   
2         DATE          Date of the observation (YYYY-MM-DD)    object   
3         AWND           Average wind speed (miles per hour)   float64   
4         PGTM                         Peak gust time (HHMM)   float64   
5         PRCP                        Precipitation (inches)   float64   
6         SNOW                             Snowfall (inches)   float64   
7         SNWD                           Snow depth (inches)   float64   
8         TAVG      Average temperature (degrees Fahrenheit)     int64   
9         TMAX      Maximum temperature (degrees Fahrenheit)     int64   
10        TMIN      Minimum temperature (degrees Fahrenheit)     int64   
11        TSUN                      Total sunshine (minutes)   float64   


In [63]:
# Save the newark_airport_shape DataFrame as a markdown table for the eda_insights.md file
create_markdown_table(newark_airport_shape, "newark_airport_shape")

"Markdown table 'newark_airport_shape' created successfully in 'outputs/'."

##### Citi Bike DataFrame

In [64]:
'''
Create a Dataframe that provides the following information for citibike_df:
    - Column Name
    - Data Type
    - Column Description
    - Value Examples
    - Value Distinct Count
'''

# Create a summary DataFrame for the Citi Bike DataFrame
# Add the column descriptions to the summary DataFrame
summary = []
for col in citibike_df.columns:
    summary.append({
        'Column Name': col,
        'Description': citibike_column_descriptions.get(col, ''),
        'Data Type': citibike_df[col].dtype,
        'Value Examples': citibike_df[col].dropna().unique()[:3],
        'Value Distinct Count': citibike_df[col].nunique()
    })
citibike_shape = pd.DataFrame(summary)

# Print the citibike_shape DataFrame
print("Citi Bike DataFrame Information:")
print(citibike_shape)

Citi Bike DataFrame Information:
                Column Name  \
0                     index   
1             Trip Duration   
2                Start Time   
3                 Stop Time   
4          Start Station ID   
5        Start Station Name   
6    Start Station Latitude   
7   Start Station Longitude   
8            End Station ID   
9          End Station Name   
10     End Station Latitude   
11    End Station Longitude   
12                  Bike ID   
13                User Type   
14               Birth Year   
15                   Gender   

                                          Description Data Type  \
0                     Original index of the DataFrame     int64   
1                     Duration of the trip in seconds     int64   
2        Start time of the trip (YYYY-MM-DD HH:MM:SS)    object   
3          End time of the trip (YYYY-MM-DD HH:MM:SS)    object   
4             Unique identifier for the start station     int64   
5                           Name of t

In [65]:
# Save the citibike_shape DataFrame as a markdown table for the eda_insights.md file
create_markdown_table(citibike_shape, "citibike_shape")

"Markdown table 'citibike_shape' created successfully in 'outputs/'."

#### Detect column groupings for entity relationships

In [66]:
# Create function that uses get_closest_match to identify columns with similar matches between newark_airport_df and citibike_df
def find_similar_column_names(df1, df2, cutoff=0.75):
    similar_cols = []
    for col1 in df1.columns:
        matches = get_close_matches(col1, df2.columns, n=3, cutoff=cutoff)
        for match in matches:
            similar_cols.append({
                "df1_column": col1,
                "df2_column": match,
                "note": "similar name"
            })
    if not similar_cols:
        print("No similar column names found.")
    else:
        print("Similar column names found:")
        print(similar_cols)

find_similar_column_names(newark_airport_df, citibike_df)

No similar column names found.


In [67]:
# After reviewing the both DataFrames, the best approach to build a relationship would be use 'DATE' from 'newark_airport_df' and 'Start Time' from 'citibike_df'
# Create a new column in citibike_df that contains only the date part of 'Start Time'
citibike_df['Trip Date'] = pd.to_datetime(citibike_df['Start Time']).dt.date
# Ensure that the DATE column in newark_airport_df is also in date format
newark_airport_df['DATE'] = pd.to_datetime(newark_airport_df['DATE']).dt.date

# Create a function that returns a DataFrame of columns with similar values between newark_airpor_df and citibike_df
def possible_relationships(df1, df2, df1_name, df2_name, threshold=0.75):
    relationships = []

    for col1 in df1.columns:
        unique_vals_1 = df1[col1].dropna().unique()

        for col2 in df2.columns:
            unique_vals_2 = df2[col2].dropna().unique()
            # Calculate the containment ratio
            if len(unique_vals_1) == 0:
                continue

            match_ratio = sum(np.isin(unique_vals_1, unique_vals_2)) / len(unique_vals_1)
            if match_ratio >= threshold:
                relationships.append({
                    "df1_column": f"{df1_name}.{col1}",
                    "df2_column": f"{df2_name}.{col2}",
                    "match_ratio": round(match_ratio, 3),
                    "comment": "Possible foreign key match"
                })

    return pd.DataFrame(relationships)

possible_fk_df = possible_relationships(citibike_df, newark_airport_df, "citibike_df", "newark_airport_df", threshold=0.75)

print(possible_fk_df)

              df1_column              df2_column  match_ratio  \
0  citibike_df.Trip Date  newark_airport_df.DATE          1.0   

                      comment  
0  Possible foreign key match  


In [68]:
# Save the result from possible_relationships as a markdown table for the eda_insights.md file
create_markdown_table(possible_fk_df, "possible_fk_relationships")

"Markdown table 'possible_fk_relationships' created successfully in 'outputs/'."

In [69]:
# Finally, save the cleaned DataFrames to pickle files for future use
df_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataframes'))
newark_airport_df.to_pickle(os.path.join(df_dir, "newark_airport_df.pkl"))
citibike_df.to_pickle(os.path.join(df_dir, "citibike_df.pkl"))
# Print a message indicating that the DataFrames have been saved