# Explore & Clean the Data

In [21]:
import glob 
import pandas as pd
import uuid

In [2]:
# Collect every CSV matching the pattern. glob returns paths in arbitrary
# order, so sort them to make the row order of the combined frame reproducible.
files = sorted(glob.glob("../data/JC-2016*.csv"))

# Read each file into its own DataFrame; files with no content are reported and skipped
df_list = list()
for file in files:
    try:
        temp_df = pd.read_csv(file)
        df_list.append(temp_df)
    except pd.errors.EmptyDataError:
        print(f'File {file} is empty and will be skipped.')

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that names the pattern that found nothing.
if not df_list:
    raise FileNotFoundError(
        "No non-empty CSV files matched '../data/JC-2016*.csv'; "
        "check the data directory before running the rest of the notebook."
    )

# Stack all per-file frames into one DataFrame with a fresh 0..n-1 index
citibike_df = pd.concat(df_list, ignore_index=True)

In [3]:
# Report the size of the combined DataFrame (rows first, then columns)
row_count, column_count = citibike_df.shape
print(f"Number of rows in the DataFrame: {row_count}\nNumber of columns in the DataFrame: {column_count}")

Number of rows in the DataFrame: 247584
Number of columns in the DataFrame: 15


In [4]:
# Build a one-row-per-column profile of citibike_df: null count, total row
# count, dtype, number of distinct values and up to five distinct non-null samples.
column_names = citibike_df.columns
total_rows = citibike_df.shape[0]

sample_values = []
for column_name in column_names:
    distinct_non_null = citibike_df[column_name].dropna().unique()
    sample_values.append(distinct_non_null[:5])

description_df = pd.DataFrame({
    "attribute": column_names,
    "null_values": citibike_df.isnull().sum(),
    "number_of_rows": total_rows,
    "data_type": citibike_df.dtypes,
    "unique_values": citibike_df.nunique(),
    "sample_values": sample_values,
})

description_df

Unnamed: 0,attribute,null_values,number_of_rows,data_type,unique_values,sample_values
Trip Duration,Trip Duration,0,247584,int64,6024,"[362, 200, 202, 248, 903]"
Start Time,Start Time,0,247584,object,244407,"[2016-01-01 00:02:52, 2016-01-01 00:18:22, 201..."
Stop Time,Stop Time,0,247584,object,244137,"[2016-01-01 00:08:54, 2016-01-01 00:21:42, 201..."
Start Station ID,Start Station ID,0,247584,int64,51,"[3186, 3209, 3195, 3211, 3187]"
Start Station Name,Start Station Name,0,247584,object,51,"[Grove St PATH, Brunswick St, Sip Ave, Newark ..."
Start Station Latitude,Start Station Latitude,0,247584,float64,51,"[40.71958611647166, 40.7241765, 40.73074262530..."
Start Station Longitude,Start Station Longitude,0,247584,float64,51,"[-74.04311746358871, -74.0506564, -74.06378388..."
End Station ID,End Station ID,0,247584,int64,102,"[3209, 3213, 3203, 3210, 3214]"
End Station Name,End Station Name,0,247584,object,102,"[Brunswick St, Van Vorst Park, Hamilton Park, ..."
End Station Latitude,End Station Latitude,0,247584,float64,102,"[40.7241765, 40.71848892, 40.727595966, 40.742..."


In [None]:
# Discard trips that have no 'User Type' recorded (modifies citibike_df in place)
citibike_df.dropna(subset=['User Type'], inplace=True)

# Impute missing birth years with the mean birth year, truncated to a whole number
birth_year = citibike_df['Birth Year']
mean_birth_year = int(birth_year.mean())
citibike_df['Birth Year'] = birth_year.fillna(mean_birth_year)

# Express 'Trip Duration' (seconds) in minutes, rounded to 2 decimal places
citibike_df['Trip Duration Minutes'] = citibike_df['Trip Duration'].div(60).round(2)

# Translate the numeric gender codes into descriptive labels;
# codes missing from the mapping become NaN
gender_labels = {0: 'Unknown', 1: 'Male', 2: 'Female'}
citibike_df['Gender Name'] = citibike_df['Gender'].map(gender_labels)

In [14]:
# Display the column labels of citibike_df, including the columns derived during cleaning
citibike_df.columns

Index(['Trip Duration', 'Start Time', 'Stop Time', 'Start Station ID',
       'Start Station Name', 'Start Station Latitude',
       'Start Station Longitude', 'End Station ID', 'End Station Name',
       'End Station Latitude', 'End Station Longitude', 'Bike ID', 'User Type',
       'Birth Year', 'Gender', 'Trip Duration Minutes', 'Gender Name'],
      dtype='object')

# Create a DB schema structure - Normalize the data

In [22]:
# Add a 'Trip Id' column holding one random UUID4 string per row.
# The ids are regenerated (and therefore change) every time this cell runs.
trip_ids = []
for _ in range(len(citibike_df)):
    trip_ids.append(str(uuid.uuid4()))
citibike_df['Trip Id'] = trip_ids

## Extract Trips data

In [26]:
# Columns that describe a single trip; station details are referenced only by
# their IDs, and the gender is kept as its numeric code.
trip_columns = [
    'Trip Id',
    'Start Time',
    'Stop Time',
    'Trip Duration Minutes',
    'Trip Duration',
    'Start Station ID',
    'End Station ID',
    'Bike ID',
    'User Type',
    'Birth Year',
    'Gender',
]
trips_df = citibike_df[trip_columns]

## Extract stations data

In [None]:
# Stations appear twice in the trip data (as trip start and as trip end).
# Project both views onto the same four generic column names.
start_stations = citibike_df[
    ['Start Station ID', 'Start Station Name', 'Start Station Latitude', 'Start Station Longitude']
].rename(columns={
    'Start Station ID': 'Station ID',
    'Start Station Name': 'Name',
    'Start Station Latitude': 'Latitude',
    'Start Station Longitude': 'Longitude',
})

end_stations = citibike_df[
    ['End Station ID', 'End Station Name', 'End Station Latitude', 'End Station Longitude']
].rename(columns={
    'End Station ID': 'Station ID',
    'End Station Name': 'Name',
    'End Station Latitude': 'Latitude',
    'End Station Longitude': 'Longitude',
})

# Stack both views and keep the first row seen for each 'Station ID'
# (the original row index is carried over, not reset)
all_station_rows = pd.concat([start_stations, end_stations])
stations_df = all_station_rows.drop_duplicates('Station ID')

## Extract bikes data

In [33]:
# One row per distinct 'Bike ID', ordered by id, with a fresh 0..n-1 index
bikes_df = (
    citibike_df[['Bike ID']]
    .drop_duplicates()
    .sort_values(by='Bike ID')
    .reset_index(drop=True)
)

## Extract Gender data

In [38]:
# Lookup table of gender codes and their labels: one row per distinct
# ('Gender', 'Gender Name') pair, ordered by code, with a fresh 0..n-1 index
genre_df = (
    citibike_df[['Gender', 'Gender Name']]
    .drop_duplicates()
    .sort_values(by='Gender')
    .reset_index(drop=True)
)