In [None]:
pip install download

In [None]:
from download import download
import pandas as pd
import numpy as np

In [None]:
# Fetch the 201806 Citi Bike trip archive and unpack it under /tmp/aq;
# `path` holds whatever location download() reports back.
path = download(
    'https://s3.amazonaws.com/tripdata/201806-citibike-tripdata.zip',
    '/tmp/aq',
    kind='zip',
    replace=True,
)

## 1. Preprocessing

In [None]:
# Load the extracted CSV and give the birth-year column a name without a space,
# so it can be used with attribute access later on.
data = (
    pd.read_csv('/tmp/aq/201806-citibike-tripdata.csv')
      .rename(columns={'birth year': 'birth_year'})
)

In [None]:
# Preview the first rows of the raw frame to check the columns after the rename.
data.head()

In [None]:
# Keep only the trips whose rider has a known birth year.
# The explicit .copy() makes `df` an independent frame: later cells add and
# overwrite columns on it, which would otherwise raise SettingWithCopyWarning
# because `df` would still be tracked as a slice of `data`.
df = data[data['birth_year'].notna()].copy()

In [None]:
# Preview the filtered frame (rows with a missing birth year removed).
df.head()

In [None]:
# Count the missing values in every column of the filtered frame and
# show the counts for the first few columns.
d = df.isnull().sum()
d.head()

In [None]:
# (rows, columns) of the filtered frame.
print(df.shape)


In [None]:
# First rows of the filtered frame.
df.head()

In [None]:
# Last rows of the filtered frame.
df.tail()

## Exploratory Data Analysis


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Circle

#setting plot style to seaborn
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later
# releases dropped the old names, so use whichever name this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

In [None]:
#converting string to datetime object
#assign() builds a new frame, so this does not raise SettingWithCopyWarning on
#the filtered frame and re-running the cell gives the same result
df = df.assign(starttime=pd.to_datetime(df['starttime']))

#the analysis covers a single month: take the month that holds the bulk of the
#trips, so that stray rows stamped in a neighbouring month can be left out
#explicitly instead of blindly dropping the last daily row
trip_month = df['starttime'].dt.to_period('M').value_counts().idxmax()
month_name = trip_month.strftime('%B')

#since we are dealing with a single month, group the trips by day
trips_by_day = df.set_index('starttime').groupby(pd.Grouper(freq='D'))

#count aggregation gives the number of occurrences, i.e. total trips per day
start_time_count = trips_by_day.count()
start_time_count = start_time_count[start_time_count.index.to_period('M') == trip_month]

#sum aggregation gives the total trip duration per day, used for the line plot;
#numeric_only keeps pandas >= 2.0 from trying to add up the text columns
trip_duration_count = trips_by_day.sum(numeric_only=True)
trip_duration_count = trip_duration_count[trip_duration_count.index.to_period('M') == trip_month]

#plotting total rides per day
#using start station id to get the count
fig, ax = plt.subplots(figsize=(25, 10))
ax.bar(start_time_count.index, 'start station id', data=start_time_count, label='Total riders')
#bbox_to_anchor is to position the legend box
ax.legend(loc="lower left", bbox_to_anchor=(0.01, 0.89), fontsize=20)
#the month and year in the labels come from the data rather than being hard-coded
ax.set_xlabel(f'Days of the month {month_name} {trip_month.year}', fontsize=30)
ax.set_ylabel('Riders', fontsize=40)
ax.set_title(f'Bikers trend for the month {month_name}', fontsize=50)

#creating twin x axis to plot the line chart in the same figure
ax2 = ax.twinx()
#plotting total trip duration of all users per day
ax2.plot(trip_duration_count.index, trip_duration_count['tripduration'],
         color='y', label='Total trip duration', marker='o', linewidth=5, markersize=12)
ax2.set_ylabel('Time duration', fontsize=40)
ax2.legend(loc="upper left", bbox_to_anchor=(0.01, 0.9), fontsize=20)

#one tick per plotted day, labelled with the day of the month; taking the labels
#from the index keeps ticks and labels the same length whatever the month length
ax.set_xticks(trip_duration_count.index)
ax.set_xticklabels(trip_duration_count.index.day)

#tweaking x and y tick labels of axes1
ax.tick_params(labelsize=30, labelcolor='#eb4034')
#tweaking x and y tick labels of axes2
ax2.tick_params(labelsize=30, labelcolor='#eb4034')

plt.show()

In [None]:
#plotting total no. of trips per gender code
#x has to be passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
#so countplot('gender', data=df) fails with "multiple values for argument 'data'"
#NOTE(review): order assumes gender is coded 0/1/2 in the order of the tick labels
#set below (Unknown, Male, Female) - confirm against the data dictionary
gender_codes = [0, 1, 2]
splot = sns.countplot(x='gender', data=df, order=gender_codes)

#adding value above each bar:Annotation
for p in splot.patches:
    an = splot.annotate(format(p.get_height(), '.2f'),
                        #bar value is nothing but height of the bar
                        (p.get_x() + p.get_width() / 2., p.get_height()),
                        ha='center',
                        va='center',
                        xytext=(0, 10),
                        textcoords='offset points')
    an.set_size(20)  #text size
splot.axes.set_title("Gender distribution", fontsize=30)
splot.axes.set_xlabel("Gender", fontsize=20)
splot.axes.set_ylabel("Count", fontsize=20)

#adding x tick values; the tick positions are fixed first so that the three
#labels always map onto the three bars drawn above
splot.axes.set_xticks(range(len(gender_codes)))
splot.axes.set_xticklabels(['Unknown', 'Male', 'Female'])
plt.show()

In [None]:
#converting trip duration from seconds to minutes
#the minutes are derived from the untouched `data` frame, so re-running this cell
#does not divide the same column by 60 a second time
trip_minutes = data.loc[df.index, 'tripduration'] / 60

#creating bins (0-30min, 30-60min, 60-120min, 120 and above)
#an open upper edge keeps the edges increasing even if no trip exceeds 120 minutes
duration_edges = [0, 30, 60, 120, np.inf]
df = df.assign(
    tripduration=trip_minutes,
    tripduration_bins=pd.cut(trip_minutes, duration_edges),
)

#the bars show how many trips fall in each bin; countplot draws exactly that
#without bootstrapping a confidence interval over every trip
sns.countplot(x='tripduration_bins', data=df)
plt.title('Usual trip duration', fontsize=30)
plt.xlabel('Trip duration group', fontsize=20)
plt.ylabel('Number of trips', fontsize=20)
plt.show()

In [None]:
# Split the trips into round trips (start and end station share a name)
# and one-way trips (everything else).
round_trip_mask = df['start station name'].eq(df['end station name'])
start_end_same = int(round_trip_mask.sum())
start_end_diff = len(df) - start_end_same

fig, ax = plt.subplots()
ax.pie(
    [start_end_same, start_end_diff],
    labels=['Same', 'Different'],
    autopct='%1.2f%%',
    textprops={'fontsize': 20},
)
ax.set_title('Same start and end location vs Different start and end location', fontsize=20)

# A white disc drawn over the centre turns the pie into a donut chart.
circle = Circle((0, 0), 0.6, facecolor='white')
ax.add_artist(circle)

plt.show()

In [None]:
# The ten stations where the most trips begin.
top_start_station = df['start station name'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(20, 8))
ax.bar(x=top_start_station.index, height=top_start_station.values, color='blue', width=0.5)

# Write each bar's value just above its top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_top:.2f}',
        (bar_centre, bar_top),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
        fontsize=20,
    )

ax.set_title("Top 10 start locations in NEW YORK CITY", fontsize=30)
ax.set_xlabel("Station name", fontsize=20)
ax.set_ylabel("Count", fontsize=20)

# Station names are long, so tilt them by 45 degrees.
ax.set_xticklabels(top_start_station.index, rotation=45, ha="right")
# Common size for the tick labels on both axes.
ax.tick_params(labelsize=15)
plt.show()

In [None]:
# The ten stations where the most trips finish.
top_end_station = df['end station name'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(20, 8))
ax.bar(x=top_end_station.index, height=top_end_station.values, color='red', width=0.5)

# Write each bar's value just above its top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_top:.2f}',
        (bar_centre, bar_top),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
        fontsize=20,
    )

ax.set_title("Top 10 end locations in NY", fontsize=30)
ax.set_xlabel("Station name", fontsize=20)
ax.set_ylabel("Count", fontsize=20)

# Station names are long, so tilt them by 45 degrees.
ax.set_xticklabels(top_end_station.index, rotation=45, ha="right")
# Common size for the tick labels on both axes.
ax.tick_params(labelsize=15)
plt.show()

## Implementation


### 1. Number of trips

In [None]:
# Per-bike summary, indexed by bikeid: how many trips each bike made and
# the mean duration of those trips.
per_bike = df.groupby("bikeid")
trips = pd.DataFrame({
    'no_of_trips': per_bike["bikeid"].count(),
    'avg_duration': per_bike["tripduration"].mean(),
})


In [None]:
# Display the per-bike summary built in the previous cell.
trips

In [None]:
# Names of the summary columns, as a plain Python list.
column_headers = trips.columns.tolist()
column_headers

In [None]:
# Expose the bike id as a regular column (the bar charts below use it as x).
# `trips` was built from groupby("bikeid"), so its index already holds the id of
# each row; copying it from there keeps every id attached to its own counts,
# instead of relying on a separately sorted unique() array having the same
# length and order as the groupby result.
trips['bikeid'] = trips.index
trips

In [None]:
# Only the first 20 bikes are charted, to keep the bar plots readable.
trips_graph = trips.iloc[:20]

In [None]:
# Bar chart: number of trips for each of the selected bikes.
trips_graph.plot(kind="bar", x="bikeid", y="no_of_trips", rot=70, title="Number of trips")

In [None]:
# Bar chart: average trip duration for each of the selected bikes.
trips_graph.plot(kind="bar", x="bikeid", y="avg_duration", rot=70, title="BikeUsage", color="red")

In [None]:
# Same trip-count chart as before, drawn in black with vertical tick labels.
trips_graph.plot(kind="bar", x="bikeid", y="no_of_trips", rot=90, title="BikeUsage", color="black")

In [None]:
# testing for bikeid 14529
# Numerator and denominator both have to come from `df`: `trips` was aggregated
# from `df`, while `data` still contains the rows without a birth year, so
# dividing by the row count of `data` does not reproduce avg_duration.
bike_14529_trips = df[df['bikeid'] == 14529]
print(bike_14529_trips.shape)
# mean() is sum / number of rows, and yields NaN rather than a division by zero
# when the bike has no trips in the filtered frame
print(bike_14529_trips['tripduration'].mean())

In [None]:
# First rows of the per-bike summary.
trips.head()

In [None]:
from datetime import datetime, date

# The trips come from the 201806 file, so a rider's age is measured against 2018
# (subtracting from 2013 made every rider five years too young).
TRIP_YEAR = 2018
age = TRIP_YEAR - df['birth_year']
# assign() returns a new frame, which avoids SettingWithCopyWarning on the
# filtered frame and keeps the cell safe to re-run
df = df.assign(Age=age)
df.head()

In [None]:
# NOTE(review): this empty frame is only a placeholder - `agegroup` is reassigned
# from pd.cut(...).value_counts() in a later cell before anything reads it.
agegroup= pd.DataFrame()
# Display the Age column.
df['Age']

In [None]:
# Largest rider age; it becomes the upper edge of the last age bin.
max_limit = df['Age'].max()
max_limit

In [None]:
# Age bin edges: 0-20, 20-40, 40-60 and 60 up to the oldest rider.
# NOTE(review): the edges are only increasing if max_limit is above 60.
bins = [0,20,40,60,max_limit]

In [None]:
# Put every trip's rider age into one of the bins and count the trips per bin
# (value_counts orders the result by count, not by bin).
agegroup = pd.cut(df['Age'], bins=bins).value_counts()
agegroup

In [None]:
# Pie chart with the share of trips in each age bin, drawn clockwise.
agegroup.plot(kind="pie", autopct="%.1f%%", title='age group differentiation', counterclock=False);

In [None]:
# Count the trips per day of the month. Passing the raw `starttime` column to
# countplot would create one category for every distinct timestamp, which is
# unreadable and extremely slow on a frame with one row per trip.
# pd.to_datetime makes the cell work even if the column is still text.
start_day = pd.to_datetime(df['starttime']).dt.day
plt.figure(figsize=(12,5))
sns.countplot(x=start_day)
plt.xlabel('Day of month')
plt.ylabel('Number of trips') ;

In [None]:
# Column assignment aligns on index labels. `trips` is indexed by bikeid while
# df['starttime'].head(20) carries the row labels of `df`, so the original
# assignment could only fill rows whose bikeid happened to equal one of those
# row labels and left the rest missing.
# NOTE(review): assuming the intent is "when was each bike first used" - confirm.
trips['starttime'] = df.groupby('bikeid')['starttime'].min()