In [25]:
# Import the dependencies
import pandas as pd
import os

In [26]:
# Folder path
folder_path = "/Users/harwinder/Desktop/Tableau_Project/Resources/"
print(os.listdir(folder_path))  # List files in the folder

['Jan_2024.csv', 'nov_2024.csv', '.DS_Store', 'oct_2024.csv', 'July_2024.csv', 'sep_2024.csv']


In [27]:
# File paths
file_names = ['Jan_2024.csv', 'July_2024.csv']
file_paths = [os.path.join(folder_path, file_name) for file_name in file_names]

In [28]:
# Create an empty list to store DataFrames
data_frames = []

# Loop through each file and load only 5000 rows
for file_path in file_paths:
    print(f"Loading file: {file_path}")
    data = pd.read_csv(file_path, nrows=5000)  # Limit to 5000 rows
    data_frames.append(data)  # Append the DataFrame to the list

# Merge all DataFrames into one
merged_dataset = pd.concat(data_frames, ignore_index=True)

Loading file: /Users/harwinder/Desktop/Tableau_Project/Resources/Jan_2024.csv
Loading file: /Users/harwinder/Desktop/Tableau_Project/Resources/July_2024.csv


In [30]:
# Check the merged data
print(f"Number of rows in merged data: {len(merged_dataset)}")
print(merged_dataset.head())

# Save the merged file
output_file = os.path.join(folder_path, "seasonal_citibike_data.csv")
merged_dataset.to_csv(output_file, index=False)
print(f"Merged file saved to: {output_file}")

Number of rows in merged data: 10000
            ride_id  rideable_type               started_at  \
0  5078F3D302000BD2  electric_bike  2024-01-22 18:43:19.012   
1  814337105D37302A  electric_bike  2024-01-11 19:19:18.721   
2  A33A920E2B10710C  electric_bike  2024-01-30 19:17:41.693   
3  A3A5FC0DD7D34D74  electric_bike  2024-01-27 11:27:01.759   
4  6F96728ECEFBDAA4  electric_bike  2024-01-16 15:15:41.000   

                  ended_at                  start_station_name  \
0  2024-01-22 18:48:10.708  Frederick Douglass Blvd & W 145 St   
1  2024-01-11 19:47:36.007                     W 54 St & 6 Ave   
2  2024-01-30 19:32:49.857                     E 11 St & Ave B   
3  2024-01-27 11:38:01.213                     W 54 St & 6 Ave   
4  2024-01-16 15:29:26.156               Madison Ave & E 99 St   

  start_station_id            end_station_name end_station_id  start_lat  \
0          7954.12  St Nicholas Ave & W 126 St        7756.10  40.823072   
1          6771.13             E 74

In [31]:
# Reload the merged file for further processing
merged_dataset = pd.read_csv(output_file)
merged_dataset.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ride_id             10000 non-null  object 
 1   rideable_type       10000 non-null  object 
 2   started_at          10000 non-null  object 
 3   ended_at            10000 non-null  object 
 4   start_station_name  9993 non-null   object 
 5   start_station_id    9993 non-null   object 
 6   end_station_name    9858 non-null   object 
 7   end_station_id      9858 non-null   object 
 8   start_lat           10000 non-null  float64
 9   start_lng           10000 non-null  float64
 10  end_lat             9986 non-null   float64
 11  end_lng             9986 non-null   float64
 12  member_casual       10000 non-null  object 
 13  Unnamed: 0          5000 non-null   float64
dtypes: float64(5), object(9)
memory usage: 1.1+ MB


In [32]:
# Check for missing values
print("Missing values before handling:")
print(merged_dataset.isna().sum())

Missing values before handling:
ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name       7
start_station_id         7
end_station_name       142
end_station_id         142
start_lat                0
start_lng                0
end_lat                 14
end_lng                 14
member_casual            0
Unnamed: 0            5000
dtype: int64


In [33]:
# Handle missing values
merged_dataset['start_station_name'] = merged_dataset['start_station_name'].fillna('Unknown')
merged_dataset['end_station_name'] = merged_dataset['end_station_name'].fillna('Unknown')
merged_dataset['start_station_id'] = merged_dataset['start_station_id'].fillna(-1)
merged_dataset['end_station_id'] = merged_dataset['end_station_id'].fillna(-1)
merged_dataset = merged_dataset.dropna(subset=['end_lat', 'end_lng'])

In [34]:
# Drop unnecessary columns
merged_dataset = merged_dataset.drop(columns=['Unnamed: 0'], errors='ignore')

In [35]:
# Convert to datetime
merged_dataset['started_at'] = pd.to_datetime(merged_dataset['started_at'], errors='coerce')
merged_dataset['ended_at'] = pd.to_datetime(merged_dataset['ended_at'], errors='coerce')

# Ensure IDs are numeric
merged_dataset['start_station_id'] = pd.to_numeric(merged_dataset['start_station_id'], errors='coerce').fillna(-1).astype('int64')
merged_dataset['end_station_id'] = pd.to_numeric(merged_dataset['end_station_id'], errors='coerce').fillna(-1).astype('int64')


In [36]:
# Add new calculated columns
merged_dataset['trip_duration_minutes'] = (merged_dataset['ended_at'] - merged_dataset['started_at']).dt.total_seconds() / 60
merged_dataset['day_of_week'] = merged_dataset['started_at'].dt.day_name()
merged_dataset['hour_of_day'] = merged_dataset['started_at'].dt.hour


In [37]:
# Filter invalid data
merged_dataset = merged_dataset[merged_dataset['trip_duration_minutes'] > 0]

In [38]:
# Standardize text columns
merged_dataset['start_station_name'] = merged_dataset['start_station_name'].str.strip()
merged_dataset['end_station_name'] = merged_dataset['end_station_name'].str.strip()
merged_dataset['rideable_type'] = merged_dataset['rideable_type'].str.lower()
merged_dataset['member_casual'] = merged_dataset['member_casual'].str.lower()

In [39]:
# Rename columns
merged_dataset = merged_dataset.rename(columns={
    'ride_id': 'Ride Id', 'rideable_type': 'Rideable Type', 'started_at': 'Started At',
    'ended_at': 'Ended At', 'start_station_name': 'Start Station Name',
    'start_station_id': 'Start Station Id', 'end_station_name': 'End Station Name',
    'end_station_id': 'End Station Id', 'start_lat': 'Start Lat', 'start_lng': 'Start Lng',
    'end_lat': 'End Lat', 'end_lng': 'End Lng', 'member_casual': 'Member Casual'
})

In [40]:
# Check data types and ensure correctness
expected_dtypes = {
    'Ride Id': 'str',
    'Rideable Type': 'category',
    'Started At': 'datetime64[ns]',
    'Ended At': 'datetime64[ns]',
    'Start Station Name': 'str',
    'Start Station Id': 'int64',
    'End Station Name': 'str',
    'End Station Id': 'int64',
    'Start Lat': 'float64',
    'Start Lng': 'float64',
    'End Lat': 'float64',
    'End Lng': 'float64',
    'Member Casual': 'category',
    'trip_duration_minutes': 'float64',
    'day_of_week': 'str',
    'hour_of_day': 'int64'
}

for column, dtype in expected_dtypes.items():
    if column in merged_dataset.columns and merged_dataset[column].dtype != dtype:
        try:
            merged_dataset[column] = merged_dataset[column].astype(dtype)
        except Exception as e:
            print(f"Could not convert {column} to {dtype}: {e}")

In [41]:
# Verify final data types
print("\nFinal Data Types:\n")
print(merged_dataset.dtypes)





Final Data Types:

Ride Id                          object
Rideable Type                  category
Started At               datetime64[ns]
Ended At                 datetime64[ns]
Start Station Name               object
Start Station Id                  int64
End Station Name                 object
End Station Id                    int64
Start Lat                       float64
Start Lng                       float64
End Lat                         float64
End Lng                         float64
Member Casual                  category
trip_duration_minutes           float64
day_of_week                      object
hour_of_day                       int64
dtype: object


In [43]:
# Save the cleaned dataset
cleaned_file_path = "/Users/harwinder/Desktop/Tableau_Project/final_datasets/seasonal_final_citibike_dataset.csv"
merged_dataset.to_csv(cleaned_file_path, index=False)
print(f"Cleaned data saved to: {cleaned_file_path}")

Cleaned data saved to: /Users/harwinder/Desktop/Tableau_Project/final_datasets/seasonal_final_citibike_dataset.csv
