In [1]:
import pandas as pd
import glob
from tqdm import tqdm

In [2]:
# Folder on Google Drive that holds the trip CSV files
folder_path = "/content/drive/MyDrive/Colab/NU/IE6200/trip_data/"

# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting the
# paths makes the concatenation order, and therefore the result, reproducible.
csv_files = sorted(glob.glob(folder_path + "*.csv"))

# Fail early with a clear message; pd.concat on an empty list would otherwise raise
# a much less helpful "No objects to concatenate" error further down.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# One DataFrame per file, collected here and concatenated once after the loop
dfs = []

# Load each file (with a tqdm progress bar), rename two columns, and collect it
for file in tqdm(csv_files, desc="Loading and Processing CSV Files"):
    df = pd.read_csv(file)
    # Rename the columns
    df = df.rename(columns={
        "member_casual": "rider_type",
        "rideable_type": "bike_type"
    })
    dfs.append(df)

# Combine all dataframes into one with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# Sort by the "started_at" column (values are compared as loaded by read_csv; no
# date parsing is requested above). A stable sort keeps rows with identical
# "started_at" values in their file order, so ties are ordered deterministically.
combined_df = combined_df.sort_values(by="started_at", kind="stable").reset_index(drop=True)

Loading and Processing CSV Files: 100%|██████████| 10/10 [00:26<00:00,  2.67s/it]


In [4]:
# Dimensions of the combined DataFrame as a (rows, columns) tuple
combined_df.shape

(3768997, 13)

In [3]:
# Number of null entries in each column of the combined data
missing_values = combined_df.isna().sum()
missing_values

Unnamed: 0,0
ride_id,0
bike_type,0
started_at,0
ended_at,0
start_station_name,1060
start_station_id,1060
end_station_name,7026
end_station_id,7175
start_lat,0
start_lng,0


In [5]:
# Keep only complete rows: discard every row that has a null in at least one column
combined_df = combined_df.dropna(axis="index", how="any")

In [20]:
# Preview the first five rows of the combined data
combined_df.head()

Unnamed: 0,ride_id,bike_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,rider_type
0,31F12D722DEC2205,classic_bike,2023-12-01 00:00:47,2023-12-01 00:18:32,MIT Carleton St at Amherst St,M32070,Lower Cambridgeport at Magazine St / Riverside Rd,M32022,42.360541,-71.086698,42.357219,-71.113872,member
1,C7B5105B0B65C0F3,classic_bike,2023-12-01 00:02:04,2023-12-01 00:07:54,MIT at Mass Ave / Amherst St,M32006,MIT Vassar St,M32042,42.3581,-71.093198,42.355601,-71.103945,member
2,68B5F2A512F98D31,classic_bike,2023-12-01 00:02:11,2023-12-01 00:03:15,South Station - 700 Atlantic Ave,A32010,Boston Landing,A32045,42.352175,-71.055547,42.356561,-71.141675,member
3,498C4A8C729ED986,classic_bike,2023-12-01 00:04:19,2023-12-01 00:14:00,One Beacon St,B32061,Columbus Ave at W. Canton St,C32077,42.358477,-71.061351,42.344742,-71.076482,member
4,2B9A06F84509E2C0,classic_bike,2023-12-01 00:04:38,2023-12-01 00:16:16,Silber Way,D32032,Dartmouth St at Newbury St,D32045,42.349495,-71.100575,42.350961,-71.077828,member


In [7]:
# Most/least popular stations: number of trips per start station.
# value_counts orders by count, largest first, so the busiest stations are at the
# top of the output and the least used at the bottom.
combined_df["start_station_name"].value_counts()

Unnamed: 0_level_0,count
start_station_name,Unnamed: 1_level_1
MIT at Mass Ave / Amherst St,69503
Central Square at Mass Ave / Essex St,56334
Harvard Square at Mass Ave/ Dunster,49570
MIT Pacific St at Purrington St,39844
Charles Circle - Charles St at Cambridge St,39659
...,...
Winthrop Circle,11
Damrell st at Old Colony Ave,6
Chestnut Hill Ave. at Ledgemere Road,5
Centre St. at Allandale St.,2


In [8]:
def _multi_value_groups(dataframe, key_column, value_column):
    """Return {key: unique values} for each key_column entry paired with more than one distinct value_column entry."""
    unique_values = dataframe.groupby(key_column)[value_column].unique()
    return {key: values for key, values in unique_values.items() if len(values) > 1}


def validate_station_mappings(dataframe):
    """
    Validate Station Name-to-ID mappings for both start and end stations.

    Four checks are run, each on one pair of columns: start name -> IDs, start ID -> names,
    end name -> IDs and end ID -> names. For every check a heading is printed, followed by one
    line per key that is associated with more than one distinct value (the key, the list of
    values, and how many there are), or "nil" when no key has more than one value.

    Start and end columns are checked independently of each other; the function does not
    compare a station's start-column mapping against its end-column mapping.

    Parameters:
        dataframe (pd.DataFrame): The DataFrame containing bike trip data. It must have the
            columns "start_station_name", "start_station_id", "end_station_name" and
            "end_station_id".

    Returns:
        None. The results are printed to the console.

    Raises:
        KeyError: If one of the four required columns is missing.
    """
    # (heading, column to group by, column whose distinct values are collected)
    checks = [
        ("Start Station Names Mapping to Multiple IDs:", "start_station_name", "start_station_id"),
        ("\nStart Station IDs Mapping to Multiple Names:", "start_station_id", "start_station_name"),
        ("\nEnd Station Names Mapping to Multiple IDs:", "end_station_name", "end_station_id"),
        ("\nEnd Station IDs Mapping to Multiple Names:", "end_station_id", "end_station_name"),
    ]

    # Evaluate every check before printing anything, so a missing column raises
    # before partial output has been written.
    results = [
        (heading, _multi_value_groups(dataframe, key_column, value_column))
        for heading, key_column, value_column in checks
    ]

    for heading, conflicts in results:
        print(heading)
        if conflicts:
            for key, values in conflicts.items():
                print(f"{key}: {list(values)} (Instances: {len(values)})")
        else:
            print("nil")

# Print the name/ID mapping inconsistencies found in the combined trip data
validate_station_mappings(combined_df)

Start Station Names Mapping to Multiple IDs:
Somerville Hospital: ['S32020', 'S32052'] (Instances: 2)
Tremont St at Court St: ['A32046', 'A32058'] (Instances: 2)

Start Station IDs Mapping to Multiple Names:
A32046: ['Tremont St at Court St', 'Canal St. at Causeway St.', 'Canal St at Causeway St'] (Instances: 3)
A32058: ['Tremont St. at Court St.', 'Tremont St at Court St'] (Instances: 2)
B32038: ['Chestnut Hill Ave. at Ledgemere Road', 'Chestnut Hill Ave at Ledgemere Rd'] (Instances: 2)
C32109: ['Centre St. at Allandale St.', 'Centre St at Allandale St'] (Instances: 2)
E32003: ['Hyde Square - Barbara St at Centre St', 'Hyde Square - Centre St at Perkins St'] (Instances: 2)
L32007: ['Swan Pl. at Minuteman Bikeway', 'Swan Place at Minuteman Bikeway'] (Instances: 2)
M32019: ['CambridgeSide Galleria - CambridgeSide PL at Land Blvd', 'Cambridgeside Pl at Land Blvd'] (Instances: 2)
S32052: ['Summer St at Quincy St', 'Somerville Hospital'] (Instances: 2)
V32003: ['Everett Square (Broadway at

In [20]:
# Delete trips starting or ending at station ID "S32020" (this station doesn't exist)
starts_at_S32020 = combined_df["start_station_id"] == "S32020"
ends_at_S32020 = combined_df["end_station_id"] == "S32020"
touches_S32020 = starts_at_S32020 | ends_at_S32020

# Per-column tallies (a trip that both starts and ends at S32020 appears in both)
start_S32020 = int(starts_at_S32020.sum())
end_S32020 = int(ends_at_S32020.sum())

# Count every affected trip exactly once. Adding the two tallies above would count
# a trip that starts AND ends at S32020 twice, overstating the number of trips
# reported (and removed) below.
S32020_mismatch = int(touches_S32020.sum())
print(f"{S32020_mismatch} of the trips start or end at station S32020.")

# Keep only the trips that do not touch S32020 on either side
combined_df = combined_df[~touches_S32020]

138 of the trips start or end at station S32020.


In [10]:
# Station ID A32046 is for "Canal St at Causeway St" not "Tremont St at Court St".
# Flag the rows where the ID is paired with the wrong name, separately for the
# start side and the end side of a trip.
wrong_start = (
    (combined_df["start_station_name"] == "Tremont St at Court St")
    & (combined_df["start_station_id"] == "A32046")
)
wrong_end = (
    (combined_df["end_station_name"] == "Tremont St at Court St")
    & (combined_df["end_station_id"] == "A32046")
)

# Number of mismatched occurrences per side, and overall
count_start = int(wrong_start.sum())
count_end = int(wrong_end.sum())
total_count = count_start + count_end
print(f"Total instances of station ID A32046 mismatched with Tremont St at Court St is {total_count}.")

# Overwrite the wrong name on the flagged rows
combined_df.loc[wrong_start, "start_station_name"] = "Canal St at Causeway St"
combined_df.loc[wrong_end, "end_station_name"] = "Canal St at Causeway St"

Total instances of station ID A32046 mismatched with Tremont St at Court St is 34.


In [11]:
def update_station_names(dataframe, old_name, new_name):
    """
    Rename a station wherever it appears as a trip start or a trip end.

    Every cell of the 'start_station_name' and 'end_station_name' columns that is equal to
    old_name is overwritten with new_name. The DataFrame is modified in place and nothing
    is returned.

    Parameters:
        dataframe (pd.DataFrame): The DataFrame containing bike trip data with station names.
        old_name (str): The station name to look for.
        new_name (str): The station name to write in its place.
    """
    # Same replacement on each of the two name columns, start column first
    for name_column in ("start_station_name", "end_station_name"):
        is_old_name = dataframe[name_column] == old_name
        dataframe.loc[is_old_name, name_column] = new_name

# Each entry is [name as it appears in the data, name to store instead]
changes = [
    ["Canal St. at Causeway St.", "Canal St at Causeway St"],
    ["Tremont St. at Court St.", "Tremont St at Court St"],
    ["Chestnut Hill Ave. at Ledgemere Road", "Chestnut Hill Ave at Ledgemere Rd"],
    ["Centre St. at Allandale St.", "Centre St at Allandale St"],
    ["Hyde Square - Barbara St at Centre St", "Hyde Square - Centre St at Perkins St"],
    ["Swan Pl. at Minuteman Bikeway", "Swan Place at Minuteman Bikeway"],
    ["CambridgeSide Galleria - CambridgeSide PL at Land Blvd", "Cambridgeside Pl at Land Blvd"],
    ["Summer St at Quincy St", "Somerville Hospital"],
    ["Everett Square (Broadway at Chelsea St)", "Everett Square (Broadway at Norwood St)"],
    ["Damrell st at Old Colony Ave", "Damrell St at Old Colony Ave"],
]

# Rewrite every listed name in the start and end station columns of combined_df
for old_name, new_name in changes:
    update_station_names(combined_df, old_name=old_name, new_name=new_name)

# Confirmation message
print("Station names updated successfully.")

Station names updated successfully.


In [12]:
# Re-run the mapping check on the updated data to see which mismatches remain
validate_station_mappings(combined_df)

Start Station Names Mapping to Multiple IDs:
nil

Start Station IDs Mapping to Multiple Names:
nil

End Station Names Mapping to Multiple IDs:
nil

End Station IDs Mapping to Multiple Names:
A32046: ['Canal St\xa0at\xa0Causeway\xa0St', 'Canal St at Causeway St'] (Instances: 2)


In [13]:
"""
The \xa0 character is the Unicode representation for a non-breaking space (NBSP). This character
is different from a regular space ( ' ' , Unicode U+0020 ) although they appear the same.
"""

# Canonical spelling of the name for station A32046 (regular spaces only)
standard_name = 'Canal St at Causeway St'

# Replace every non-breaking space (U+00A0) in the station names with a regular
# space. This turns 'Canal St\xa0at\xa0Causeway\xa0St' into standard_name, and it
# covers BOTH the start and the end name column and every station, rather than
# only end_station_name of A32046, so the same invisible-character mismatch cannot
# survive elsewhere in the data.
for name_column in ("start_station_name", "end_station_name"):
    # na=False: a missing name counts as "no NBSP" so the mask is strictly boolean
    has_nbsp = combined_df[name_column].str.contains("\xa0", regex=False, na=False)
    combined_df.loc[has_nbsp, name_column] = (
        combined_df.loc[has_nbsp, name_column].str.replace("\xa0", " ", regex=False)
    )

In [14]:
# Final check: run the mapping validation once more on the current data
validate_station_mappings(combined_df)

Start Station Names Mapping to Multiple IDs:
nil

Start Station IDs Mapping to Multiple Names:
nil

End Station Names Mapping to Multiple IDs:
nil

End Station IDs Mapping to Multiple Names:
nil


In [23]:
from ydata_profiling import ProfileReport

# Profile the combined dataset. `df` is only the leftover loop variable from the
# loading cell (the last CSV that was read, without any of the cleaning applied to
# combined_df), so passing it here would profile a single raw file.
profile = ProfileReport(combined_df, title="Automatic Data Profiling", explorative=True)

# Render the report inline in the notebook
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]