In [None]:

#Imports 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [None]:
# Load the trip records into `dataset`. Later cells read the User_Id, Birth_year,
# Origin_Id, Destination_Id, Trip_end and Duration columns from this frame.
dataset = pd.read_csv('data/mibici_2014-2024.csv')
# Print the column index as a quick check of what was loaded.
print(dataset.columns)

In [None]:
# Station lookup table. Later cells use its id, name, latitude and longitude
# columns to attach station details to each trip's origin and destination.
dataset_id_names = pd.read_csv('data/nomenclature_2024.csv')


In [None]:
# Five busiest stations, first as trip origins and then as trip destinations.
for station_column in ('Origin_Id', 'Destination_Id'):
    print(dataset[station_column].value_counts().head(5))

# Five most travelled (origin, destination) pairs.
route_counts = dataset.groupby(['Origin_Id', 'Destination_Id']).size()
print(route_counts.sort_values(ascending=False).head(5))



In [None]:
# Enrich every trip with the name and coordinates of its origin and destination
# stations, looked up by station id in dataset_id_names.
# Adds, in this order: Origin_Name, Origin_Lat, Origin_Lon,
# Destination_Name, Destination_Lat, Destination_Lon.
# Station ids that are missing from the lookup table come out as NaN.

# Build the id-indexed lookup once instead of once per mapped column.
station_lookup = dataset_id_names.set_index('id')

for endpoint in ('Origin', 'Destination'):
    station_ids = dataset[f'{endpoint}_Id']
    dataset[f'{endpoint}_Name'] = station_ids.map(station_lookup['name'])
    dataset[f'{endpoint}_Lat'] = station_ids.map(station_lookup['latitude'])
    dataset[f'{endpoint}_Lon'] = station_ids.map(station_lookup['longitude'])

# Sum of the absolute longitude and latitude differences between the two
# stations, in degrees. This is a rough displacement measure, not route length.
dataset['Total_degree_difference'] = (
    (dataset['Origin_Lon'] - dataset['Destination_Lon']).abs()
    + (dataset['Origin_Lat'] - dataset['Destination_Lat']).abs()
)

print(dataset.head())

In [None]:
def reset_user_trips(data=None, min_birth_year=1960):
    """Build a fresh per-user summary of trip counts and birth years.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Trip records with 'User_Id' and 'Birth_year' columns. When omitted,
        the notebook-level ``dataset`` is used, as before.
    min_birth_year : int, optional
        Exclusive lower bound on birth year; only users born strictly after
        this year are kept. Defaults to 1960, the cutoff that was previously
        hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per user with columns 'User_Id', 'count' (number of trips)
        and 'Birth_Year' (first non-null birth year recorded for the user),
        sorted by 'count' in descending order. Users whose birth year is
        missing or not above the cutoff are dropped.
    """
    if data is None:
        data = dataset

    # A single grouping yields both aggregates, and naming the 'count' column
    # explicitly avoids relying on how value_counts() labels its result.
    birth_years_by_user = data.groupby('User_Id')['Birth_year']
    user_trips = pd.DataFrame({
        'count': birth_years_by_user.size(),
        'Birth_Year': birth_years_by_user.first(),
    }).reset_index()

    # Busiest users first; a stable sort keeps tied users in a deterministic order.
    user_trips = user_trips.sort_values(by='count', ascending=False, kind='stable')

    # NaN birth years fail the comparison and are dropped here as well.
    return user_trips[user_trips['Birth_Year'] > min_birth_year]

In [None]:

# Per-user table (trip count and birth year) shared by the three plots below.
user_trips = reset_user_trips()

# Distribution of users' birth years.
plt.hist(user_trips['Birth_Year'], bins=64)
plt.xlabel('Birth Year')
plt.ylabel('Number of Users')
plt.title('Birth Year of Users')
plt.show()

# Quartiles of birth year: printed values first, then the matching box plot.
print(user_trips['Birth_Year'].quantile([0.25, 0.5, 0.75]))
plt.boxplot(user_trips['Birth_Year'])
plt.ylabel('Birth Year')
plt.title('Quartiles of Birth Years')
plt.show()

# Total trips taken by all users born in each year. The aggregate gets its own
# name so user_trips stays the per-user DataFrame instead of turning into a Series.
trips_by_birth_year = user_trips.groupby('Birth_Year')['count'].sum()
plt.plot(trips_by_birth_year)
plt.xlabel('Birth Year')
plt.ylabel('Total Number of Trips')
# This is a sum over users, not a per-user figure, so the title says so.
plt.title('Total Number of Trips by Birth Year')
plt.show()



In [None]:
# Rebuild the per-user table and collapse it to the mean trip count for each
# birth year; the result is a Series indexed by birth year.
user_trips = reset_user_trips().groupby('Birth_Year')['count'].mean()

# Line chart of mean trips per user against birth year.
plt.plot(user_trips.index, user_trips.values)
plt.title('Average Number of Trips per User by Birth Year')
plt.xlabel('Birth Year')
plt.ylabel('Average Number of Trips')
plt.show()


In [None]:
# Derive each rider's age at the time of the trip: the year in which the trip
# ended minus the rider's birth year.

trip_ended = dataset['Trip_end']

# The year is the text before the first '-' in Trip_end.
year_ended = trip_ended.str.split('-').str.get(0)

dataset['Age'] = year_ended.astype(int).sub(dataset['Birth_year'])

dataset.head()











In [None]:
# Plot the average trip duration, in minutes, for each rider age from 0 to 99.

# Parse Duration into timedeltas so it can be averaged.
# NOTE(review): values presumably look like '0 days 00:11:41' -- confirm against the CSV.
dataset['Duration'] = pd.to_timedelta(dataset['Duration'])

print(dataset['Duration'].head())

# One grouped pass over the frame instead of a full boolean scan per age.
mean_duration_by_age = dataset.groupby('Age')['Duration'].mean()

# Position i holds the mean for age i, converted from seconds to minutes.
# Ages with no trips fall back to NaT, whose total_seconds() is NaN, so they
# show up as gaps in the line.
avg_duration = [
    mean_duration_by_age.get(age, pd.NaT).total_seconds() / 60
    for age in range(100)
]

plt.plot(avg_duration)
plt.xlabel('Age')
plt.ylabel('Average Duration')
plt.title('Average Duration by Age')
plt.show()

In [None]:
# Average origin-to-destination displacement for each rider age from 0 to 99,
# plotted against age.

# Rough miles per degree of latitude. A degree of longitude is shorter away
# from the equator, so the mileage below is only an approximation.
MILES_PER_DEGREE = 69


def _mean_by_age(values, max_age=100):
    """Return a list whose position i is the mean of `values` over trips by riders aged i.

    Ages with no trips yield NaN. `values` must be row-aligned with `dataset`.
    """
    means = values.groupby(dataset['Age']).mean()
    return [means.get(age, np.nan) for age in range(max_age)]


# Per-axis means now use the matching coordinate difference; previously both
# lists were filled with the combined lat+lon total.
avg_diff_lat = _mean_by_age((dataset['Origin_Lat'] - dataset['Destination_Lat']).abs())
avg_diff_lon = _mean_by_age((dataset['Origin_Lon'] - dataset['Destination_Lon']).abs())

avg_diff_total = _mean_by_age(dataset['Total_degree_difference'])

avg_diff_total_miles = [degrees * MILES_PER_DEGREE for degrees in avg_diff_total]

plt.plot(avg_diff_total_miles)
plt.xlabel('Age')
plt.ylabel('Average Difference in Lat and Lon in Miles')
plt.title('Average Difference in Lat and Lon in Miles by Age')
plt.show()




In [None]:
# Drop the ages that had no trips (NaN means). After this the list positions
# no longer correspond to ages.
avg_diff_total_miles = [x for x in avg_diff_total_miles if not np.isnan(x)]

# Quartiles of the per-age average displacement, in miles.
print(np.quantile(avg_diff_total_miles, [0.25, 0.5, 0.75]))

# Box plot of the same values, labelled and shown like the other figures.
plt.boxplot(avg_diff_total_miles)
plt.ylabel('Average Difference in Lat and Lon in Miles')
plt.title('Quartiles of Average Difference in Lat and Lon in Miles')
plt.show()
