In [None]:
#import statements
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

#load data sets into dataframes
# The three CSV files are read, in this order, from the relative Data_files/ folder.
lyft_df, rideshare_df, uber_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data_files/lyft.csv',
        'Data_files/rideshare_data.csv',
        'Data_files/uber.csv',
    )
)

In [None]:
# Peek at the first five rows to see which column names are available.
rideshare_df.head(n=5)

In [None]:
#Adam's code
# A few days around Christmas carried only a small fraction of the usual traffic and skewed the data.
# Drop every ride whose day-of-month is in the 4..10 window (inclusive); keep day < 4 and day > 10.
# NOTE(review): the original comment named days 4, 9 and 10 specifically - presumably those are the
# only days inside that window in this data set; confirm against the data.
#
# The mask is built directly from the column, so it carries rideshare_df's own index and
# lines up with the frame row-for-row even if the frame's index is not the default 0..n-1.
day_check = (rideshare_df['day'] < 4) | (rideshare_df['day'] > 10)
rideshare_adj = rideshare_df[day_check]

In [None]:
#separating lyft and uber and creating stacked chart showing volumes.
uber_values = rideshare_adj['cab_type'] == 'Uber'
uber_days = rideshare_adj[uber_values]
lyft_values = rideshare_adj['cab_type'] == 'Lyft'
lyft_days = rideshare_adj[lyft_values]
#grouping each service by day; counts are divided by 1000 so the axis reads in thousands of rides
uber_days_grouped = uber_days.groupby(['day'])
uber_day_count = uber_days_grouped.count() / 1000
lyft_days_grouped = lyft_days.groupby(['day'])
lyft_day_count = lyft_days_grouped.count() / 1000
# Put both services in one frame so the bars are aligned on the day value itself.
# Two separate Series bar plots are placed by position, so they silently misalign
# whenever one service is missing a day. A day missing for one service is shown as 0.
daily_rides = pd.DataFrame({'Uber': uber_day_count['id'],
                            'Lyft': lyft_day_count['id']}).fillna(0)
# stacked=True makes this an actual stacked chart (bar height = Uber + Lyft)
# instead of two translucent bars drawn on top of each other.
daily_rides.plot(kind = 'bar', stacked = True, alpha = 0.75, align = 'center', color = ['r', 'b'])
#labeling graph
plt.xlabel('Day of Month')
plt.ylabel('Number of Rides Per 1000')
plt.title('Bar Chart of the Number of Rides Given for Lyft and Uber on Each Day of Month')
plt.tight_layout()
plt.legend(['Uber', 'Lyft'])

In [None]:
# as uber maintained a constant lead in volume, I wanted to check if there was a relation between total volume
# and market share during those periods.
# use a line graph to show each service's daily volume and Uber's lead over Lyft
# (expressed as a percentage of Lyft's volume) across the days.
ride_percent = (uber_day_count - lyft_day_count) / lyft_day_count * 100
# The three Series plots below are drawn one after another onto the same current axes.
lyft_line = lyft_day_count['id'].plot.line()
uber_line = uber_day_count['id'].plot.line()
ride_percent_line = ride_percent['id'].plot.line()
# NOTE(review): the volume lines (rides / 1000) and the percentage line share one y-axis,
# although the axis label only describes the percentage - consider a secondary axis.
plt.xlabel('Day of Month')
plt.ylabel('Percent of Market Uber over Lyft')
plt.title('Line Chart Relating Uber/Lyft Traffic Volume and Days')
# Legend entries are matched to the lines by plotting order: Lyft, Uber, percentage.
plt.legend(['Lyft', 'Uber', 'Percentage'])

In [None]:
#chi-squared test on whether some days see more Uber ride traffic than others
#create dataframe of the observed values for each day
# The test needs the raw ride counts: uber_day_count holds counts divided by 1000 (for plotting),
# and the chi-squared statistic is not scale-invariant, so feeding it the scaled values
# shrinks the statistic by the same factor and gives a wrong p-value.
chi_all_ride_data = pd.DataFrame({'all days count': uber_days_grouped['id'].count()})
#create variables that will calculate expected values, assuming there is no difference between days
day_total = len(chi_all_ride_data)
total_rides = chi_all_ride_data['all days count'].sum()
#Add expected values to dataframe (the same expected count for every day)
chi_all_ride_data['Expected ride amount'] = total_rides / day_total
#chi-squared test
stats.chisquare(chi_all_ride_data['all days count'], chi_all_ride_data['Expected ride amount'])

In [None]:
#Spencer's code
#group by destination and get counts to create a bar graph that will show how frequently each destination is used.
grouped_destination = rideshare_df.groupby(['destination'])
dest_count = grouped_destination.count()
#simple bar graph created with title and labels
dest_count['id'].plot(kind = 'bar', alpha = 0.75, align = 'center', color = 'k')
plt.xlabel('Drop off Point')
plt.ylabel('Number of Rides')
plt.title('Number of Rides to Different \n Areas of Boston')
# Fit the tick labels and axis labels inside the figure before writing the file,
# the same way the stacked destination chart does before its savefig call.
plt.tight_layout()
plt.savefig('destination.png')

In [None]:
#chi-squared test on whether some destinations see more ride share traffic than others
# Observed values: the per-destination 'id' count taken from dest_count.
observed_rides = dest_count['id']
chi_full_data = pd.DataFrame({'observed ride count': observed_rides})
# Expected values assume no difference between drop off points:
# total trips spread evenly over the number of destinations.
dest_total = len(chi_full_data)
total_trips = chi_full_data['observed ride count'].sum()
expected_per_destination = total_trips / dest_total
chi_full_data['Expected ride count'] = expected_per_destination
#run chi-squared test (observed first, expected second)
stats.chisquare(
    chi_full_data['observed ride count'],
    chi_full_data['Expected ride count'],
)

In [None]:
# Build one frame holding, per destination, the 'id' count for Uber and for Lyft.
lyft_count = lyft_df.groupby(by='destination').count()
uber_count = uber_df.groupby(by='destination').count()
merge_count = pd.DataFrame({
    'Uber': uber_count['id'],
    'Lyft': lyft_count['id'],
})
# Stacked bars show how many drop-offs each service completed at each destination.
merge_count.plot.bar(stacked=True, color=['k', 'r'], alpha=.75)
plt.ylabel('Number of Rides')
plt.xlabel('Drop off point')
plt.title('Stacked Bar Chart of the Number of rides to each \n Drop off point for Lyft and Uber')
plt.tight_layout()
plt.savefig('stacked_destination.png')

In [None]:
#run chi_squared test to see if the difference between services is expected or not
#calculate observed difference in drop offs by each service in each destination
merge_count['observed difference'] = merge_count['Uber']-merge_count['Lyft']
#calculate expected difference between the two services in each area if destination has no effect
difference_total = merge_count['observed difference'].sum()
# Divide by the number of rows of merge_count itself rather than by a destination count
# computed from a different frame in another cell: the observed and expected columns
# then always describe the same set of destinations.
expected = difference_total / len(merge_count)
merge_count['expected difference'] = expected
#run chi-squared test
# NOTE(review): the inputs here are signed differences of counts rather than frequencies;
# a test of independence on the Uber/Lyft-by-destination table may be the more
# appropriate tool - confirm with the analysis owner.
stats.chisquare(merge_count['observed difference'], merge_count['expected difference'])

In [None]:
#Rory's code
# Show the first five rows of the combined rideshare frame.
rideshare_df.head(n=5)

In [None]:
# Final DataFrame
# Keep only the columns used by the price comparison below.
final_columns = ["id", "cab_type", "name", "price", "distance"]
final_uber_lyft_df = rideshare_df[final_columns]
final_uber_lyft_df.head(n=5)

In [None]:
# Sort DataFrame
# Display only (the result is not stored): cab_type ascending, then name descending.
final_uber_lyft_df.sort_values(by=['cab_type', 'name'], ascending=[True, False])

In [None]:
# Drop the ride types that are left out of the comparison; every other row is kept.
excluded_names = ["Shared", "Taxi", "Lyft", "UberPool", "WAV"]
keep_rows = ~final_uber_lyft_df['name'].isin(excluded_names)
final_uber_lyft_df = final_uber_lyft_df.loc[keep_rows]

In [None]:
# Average of the numeric columns for each ride name.
# numeric_only=True keeps the text columns (e.g. cab_type) out of the mean: older pandas
# dropped them silently, while pandas >= 2.0 raises a TypeError when asked to average them.
grouped_df = final_uber_lyft_df.groupby(['name']).mean(numeric_only=True)

In [None]:
# Bar chart of the mean price per ride name, drawn with narrow black bars.
width = 0.25
grouped_df['price'].plot.bar(color='k', width=width)


In [None]:
# Name and price columns of the filtered frame (not used by the plot below).
x, y = final_uber_lyft_df['name'], final_uber_lyft_df['price']
# Grouped bar chart of every column in grouped_df, one group of bars per ride name.
grouped_df.plot.bar(align='center', color=['b', 'r'])
plt.ylabel('Price')
plt.xlabel('Specific Cab Type/Name')
plt.title('Uber-Lyft data Lux-Black-XL only')


In [None]:
# Hand-entered value for each ride name, kept in this order for plotting.
data2 = dict([
    ("Uber X", 10),
    ("Lyft Lux", 18),
    ("Uber Black", 20),
    ("Lyft Lux Black", 23),
    ("Uber XL", 15),
    ("Lyft XL", 15),
    ("Uber SUV", 30),
    ("Lyft Lux Black XL", 31),
])
# Series indexed by ride name, in the same order as the dict.
s2 = pd.Series(data=data2)

In [None]:
# Bar chart of the hand-entered values, one bar per ride name.
s2.plot(kind='bar')