In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib as mpl

In [2]:
#Files to load
city_data_to_load = "Resources/city_data.csv"
ride_data_to_load = "Resources/ride_data.csv"

In [3]:
city_data_df = pd.read_csv(city_data_to_load)
ride_data_df = pd.read_csv(ride_data_to_load)

In [4]:
# Get the columns and the rows that are not null.
city_data_df.count()

city            120
driver_count    120
type            120
dtype: int64

In [5]:
# Get the columns and the rows that are not null.
city_data_df.isnull().sum()

city            0
driver_count    0
type            0
dtype: int64

In [6]:
# Get the data types of each column.
city_data_df.dtypes

city            object
driver_count     int64
type            object
dtype: object

In [7]:
# Get the unique values of the type of city.
city_data_df["type"].unique()

array(['Urban', 'Suburban', 'Rural'], dtype=object)

In [8]:
# Get the number of data points from the Urban cities.
sum(city_data_df["type"]=="Urban")

66

In [9]:
# Get the number of data points from the Suburban cities
sum(city_data_df["type"]=="Suburban")

36

In [10]:
# Get the number of data points from the Rural cities
sum(city_data_df["type"]=="Rural")

18

In [11]:
# Get the columns and the rows that are not null.
ride_data_df.count()

city       2375
date       2375
fare       2375
ride_id    2375
dtype: int64

In [12]:
# Get the columns and the rows that are not null.
ride_data_df.isnull().sum()

city       0
date       0
fare       0
ride_id    0
dtype: int64

In [13]:
# Get the data types of each column.
ride_data_df.dtypes

city        object
date        object
fare       float64
ride_id      int64
dtype: object

In [14]:
# Combine the data into a single dataset
pyber_data_df = pd.merge(ride_data_df, city_data_df, how = "left", on = ["city","city"])
#Display the DF 
pyber_data_df.head(10)

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003,57,Urban
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178,34,Urban
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344,46,Urban
5,South Latoya,2019-03-11 12:26:48,9.52,1994999424437,10,Urban
6,New Paulville,2019-02-27 11:17:56,43.25,793208410091,44,Urban
7,Simpsonburgh,2019-04-26 00:43:24,35.98,111953927754,21,Urban
8,South Karenland,2019-01-08 03:28:48,35.09,7995623208694,4,Urban
9,North Jasmine,2019-03-09 06:26:29,42.81,5327642267789,33,Urban


In [15]:
# Create the urban city DataFrame. 
urban_cities_df = pyber_data_df[pyber_data_df["type"]=="Urban"]
urban_cities_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003,57,Urban
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178,34,Urban
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344,46,Urban


In [16]:
# Create the rural city Data Frame
rural_cities_df = pyber_data_df[pyber_data_df["type"]=="Rural"]
rural_cities_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
2250,Randallchester,2019-04-13 11:13:31,43.22,1076079536213,9,Rural
2251,North Holly,2019-02-02 14:54:00,12.42,1985256326182,8,Rural
2252,Michaelberg,2019-03-27 18:27:34,54.85,4421836952718,6,Rural
2253,Lake Latoyabury,2019-02-23 21:12:24,47.9,3269652929887,2,Rural
2254,Lake Latoyabury,2019-05-06 08:57:56,51.8,4018025271936,2,Rural


In [17]:
# Create the suburban df
suburban_cities_df = pyber_data_df[pyber_data_df["type"]=="Suburban"]
suburban_cities_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
1625,Barronchester,2019-01-27 03:08:01,27.79,6653622887913,11,Suburban
1626,East Kentstad,2019-04-07 19:44:19,18.75,6575961095852,20,Suburban
1627,Lake Omar,2019-01-17 21:33:35,21.71,966911700371,22,Suburban
1628,Myersshire,2019-02-27 17:38:39,17.1,5706770909868,19,Suburban
1629,West Hannah,2019-04-19 01:06:59,37.78,2273047151891,12,Suburban


In [18]:
# Get the number of rides for urban cities.
urban_ride_count = urban_cities_df.groupby(["city"]).count()["ride_id"]
urban_ride_count

city
Amandaburgh            18
Barajasview            22
Carriemouth            27
Christopherfurt        27
Deanville              19
                       ..
West Patrickchester    16
West Robert            31
West Samuelburgh       25
Williamsstad           23
Williamsview           20
Name: ride_id, Length: 66, dtype: int64

In [19]:
suburban_ride_count = suburban_cities_df.groupby(["city"]).count()["ride_id"]
rural_ride_count = rural_cities_df.groupby(["city"]).count()["ride_id"]

#Get the average fare for each city 
urban_avg_fare = urban_cities_df.groupby(["city"]).mean()["fare"]
suburban_avg_fare = suburban_cities_df.groupby(["city"]).mean()["fare"]
rural_avg_fare = rural_cities_df.groupby(["city"]).mean()["fare"]

In [20]:
#Get the average number of drivers for each city type
urban_driver_count = urban_cities_df.groupby(["city"]).mean()["driver_count"]
suburban_driver_count = suburban_cities_df.groupby(["city"]).mean()["driver_count"]
rural_driver_count = rural_cities_df.groupby(["city"]).mean()["driver_count"]

In [21]:
# Build the scatter plots for urban cities.
plt.scatter(urban_ride_count,
      urban_avg_fare,
      s=10*urban_driver_count, c="coral",
      edgecolor="black", linewidths=1,
      alpha=0.8, label="Urban")
plt.title("PyBer Ride-Sharing Data (2019)")
plt.ylabel("Average Fare ($)")
plt.xlabel("Total Number of Rides (Per City)")
plt.grid(True)
# Add the legend.
plt.legend()

NameError: name 'urban_avg_fare' is not defined

In [None]:
plt.scatter(suburban_ride_count,
           suburban_avg_fare,
           s=10*suburban_driver_count,
           edgecolor="black", linewidth=1,
           alpha=0.8, label="Suburban",c="skyblue")
plt.title("PyBer Ride-Sharing Data (2019)")
plt.ylabel("Average Fare($)")
plt.xlabel("Total Number of Rides (Per City)")
plt.grid(True)
plt.legend()

In [None]:
plt.scatter(rural_ride_count,
           rural_avg_fare,
           s=10*rural_driver_count,
           edgecolor="black",c="gold",
           linewidths=1, alpha=0.8, label = "Rural")
plt.title("PyBer Ride-Sharing Data (2019)")
plt.ylabel("Average Fare($)")
plt.xlabel("Total Number of Rides (Per City)")
plt.grid(True)
plt.legend()

In [None]:
# Add the scatter charts for each type of city.
plt.subplots(figsize=(10, 6))
plt.scatter(urban_ride_count,
      urban_avg_fare,
      s=10*urban_driver_count, c="coral",
      edgecolor="black", linewidths=1,
      alpha=0.8, label="Urban")

plt.scatter(suburban_ride_count,
      suburban_avg_fare,
      s=10*suburban_driver_count, c="skyblue",
      edgecolor="black", linewidths=1,
      alpha=0.8, label="Suburban")

plt.scatter(rural_ride_count,
      rural_avg_fare,
      s=10*rural_driver_count, c="gold",
      edgecolor="black", linewidths=1,
      alpha=0.8, label="Rural")

# Incorporate the other graph properties
plt.title("PyBer Ride-Sharing Data (2019)", fontsize=20)
plt.ylabel("Average Fare ($)", fontsize=12)
plt.xlabel("Total Number of Rides (Per City)", fontsize=12)
plt.grid(True)

# Add the legend.
lgnd = plt.legend(fontsize="12", mode="Expanded", 
                 scatterpoints=1, loc="best", title = "City Types")
lgnd.legendHandles[0]._sizes = [75]
lgnd.legendHandles[1]._sizes = [75]
lgnd.legendHandles[2]._sizes = [75]
lgnd.get_title().set_fontsize(12)

# Incorporate a text label about circle size. 
plt.text(42, 35, "Note: Circle size correlates with driver count per city.", fontsize="12")

#Save the figure
plt.savefig("analysis/Fig1.png")

# Show the plot
plt.show()


In [None]:
urban_cities_df.describe()

In [None]:
suburban_cities_df.describe()

In [None]:
rural_cities_df.describe()

In [None]:
urban_ride_count.describe()

In [None]:
suburban_ride_count.describe()

In [None]:
rural_ride_count.describe()

In [None]:
#Calculate the mean of the ride amount for each city type
round(urban_ride_count.mean(),2), round(suburban_ride_count.mean(),2), round(rural_ride_count.mean(),2)

In [None]:
round(urban_ride_count.median(),2)

In [None]:
#calculate the mode of the ride count for the urban cities
round(urban_ride_count.mode(), 2)

In [None]:
suburban_ride_count.mode()

In [None]:
# Import NumPy and the stats module from SciPy.
import numpy as np
import scipy.stats as sts

In [None]:
# Calculate the measures of central tendency for the ride count for the urban cities.
mean_urban_ride_count = np.mean(urban_ride_count)
print(f"The mean for the ride counts for urban trips is {mean_urban_ride_count}")
      
median_urban_ride_count = np.median(urban_ride_count)
print(f"The median for the ride counts for urban trip is {median_urban_ride_count}")
      
mode_urban_ride_count = sts.mode(urban_ride_count)
print(f"The mode for the trips for urban rides is {mode_urban_ride_count}")
      

In [None]:
mode_suburban_ride_count = sts.mode(suburban_ride_count)
print(f"The mode for suburban rides is {mode_suburban_ride_count}")

In [None]:
mode_rural_ride_count = sts.mode(rural_ride_count)
print(f"The mode for rural rides is {mode_rural_ride_count}")

In [None]:
# Get the fares for the urban cities. 
urban_fares = urban_cities_df["fare"]
urban_fares.head()

In [None]:
suburban_fares = suburban_cities_df["fare"]
suburban_fares.head()

In [None]:
rural_fares = rural_cities_df["fare"]
rural_fares.head()

In [None]:
#Calculate the measures of central tendency for the average fare for the urban cities 
mean_urban_fares = np.mean(urban_fares)
print(f"The mean fare price in urban cities is ${mean_urban_fares:.2f}.")

median_urban_fares = np.median(urban_fares)
print(f"The median fare for urban cities is ${median_urban_fares:.2f}.")

mode_urban_fares = sts.mode(urban_fares)
print(f"The mode for urban fares is {mode_urban_fares}")

In [None]:
#Calculate the measures of central tendency for the average fare for the suburban cities 
mean_suburban_fares = np.mean(suburban_fares)
print(f"The mean fare price in urban cities is ${mean_suburban_fares:.2f}.")

median_suburban_fares = np.median(suburban_fares)
print(f"The median fare for urban cities is ${median_suburban_fares:.2f}.")

mode_suburban_fares = sts.mode(suburban_fares)
print(f"The mode for urban fares is {mode_suburban_fares}")

In [None]:
#Calculate the measures of central tendency for the average fare for the rural cities 
mean_rural_fares = np.mean(rural_fares)
print(f"The mean fare price in urban cities is ${mean_rural_fares:.2f}.")

median_rural_fares = np.median(rural_fares)
print(f"The median fare for urban cities is ${median_rural_fares:.2f}.")

mode_rural_fares = sts.mode(rural_fares)
print(f"The mode for urban fares is {mode_rural_fares}")

In [None]:
#Get the driver count data from the urban cities. 
urban_drivers = urban_cities_df["driver_count"]
urban_drivers.head()

In [None]:
#Get the driver count data from the suburban cities. 
suburban_drivers = suburban_cities_df["driver_count"]
suburban_drivers.head()

In [None]:
#Get the driver count data from the rural cities. 
rural_drivers = rural_cities_df["driver_count"]
rural_drivers.head()

In [None]:
urban_drivers.describe()

In [None]:
suburban_drivers.describe()

In [None]:
rural_drivers.describe()

In [None]:
# Create a box-and-whisker plot for the urban cities ride count. 
x_lables =["Urban"]
fig, ax = plt.subplots()
ax.boxplot(urban_ride_count, labels = x_lables)
# Add the title, y-axis label and grid.
ax.set_title('Ride Count Data (2019)')
ax.set_ylabel('Number of Rides')
ax.set_yticks(np.arange(10,41, step=2.0))
ax.grid()
plt.show()

In [None]:
# Create a box-and-whisker plot for the suburban cities ride count
x_lables=["Suburban"]
fig, ax = plt.subplots()
ax.boxplot(suburban_ride_count, labels = x_lables)
# Add the title, y-axis label and grid. 
ax.set_title("Ride Count Data(2019)")
ax.set_ylabel("Number of Rides")
ax.set_yticks(np.arange(10,41,step=2.0))
ax.grid()
plt.show()

In [None]:
# Create a box-and-whisker plot for the rural cities ride count
x_lables = ["Rural"]
fig, ax = plt.subplots()
ax.boxplot(rural_ride_count, labels = x_lables)
# Add the title, y-axis label and grid
ax.set_title("Ride Count Data(2019)")
ax.set_ylabel("Number of Rides")
ax.set_yticks(np.arange(10,41, step=2.0))
ax.grid()
plt.show()

In [None]:
# Add all ride count box-and-whisker plots to the same graph.
x_labels = ["Urban", "Suburban","Rural"]
ride_count_data = [urban_ride_count, suburban_ride_count, rural_ride_count]
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Ride Count Data (2019)',fontsize=20)
ax.set_ylabel('Number of Rides',fontsize=14)
ax.set_xlabel("City Types",fontsize=14)
ax.boxplot(ride_count_data, labels=x_labels)
ax.set_yticks(np.arange(0, 45, step=3.0))
ax.grid()
# Save the figure.
plt.savefig("analysis/Fig2.png")
plt.show()

In [None]:
#To get an outlier we can use the following code
# Get the city that matches 39 (outlier)
urban_city_outlier = urban_ride_count[urban_ride_count == 39].index[0]
print(f"{urban_city_outlier} has the highest rider count.")

In [None]:
# Create a box-and-whisker plot for the urban fare data. 
x_labels = ["Urban"]
fig, ax = plt.subplots()
ax.boxplot(urban_fares, labels = x_lables)
# Add the title, y-axis, label and grid. 
ax.set_title("Ride Fare Data (2019)")
ax.set_ylabel("Fare($USD)")
ax.set_yticks(np.arange(0,51, step = 5.0))
ax.grid()
plt.show()
print("Summary Statistics")
urban_fares.describe()

In [None]:
# Create a box-and-whisker plot for the suburban fare data 
x_labels = ["Suburban"]
fig, ax = plt.subplots()
ax.boxplot(suburban_fares, labels = x_labels)
# Add the title, y - axis and lebel and grid. 
ax.set_title('Ride Fare Data (2019)')
ax.set_ylabel('Fare($USD)')
ax.set_yticks(np.arange(0,51, step = 5.0))
ax.grid()
plt.show()
print("Summary Statistics")
suburban_fares.describe()

In [None]:
# Create a box-and-whisker plot for the rural fare data 
x = labels = ["Rural"]
fig, ax = plt.subplots()
ax.boxplot(rural_fares, labels = x_labels)
# Add the title, y - axis and lebel and grid. 
ax.set_title('Ride Fare Data (2019)')
ax.set_ylabel('Fare($USD)')
ax.set_yticks(np.arange(0,51, step = 5.0))
ax.grid()
plt.show()
print("Summary Statistics")
rural_fares.describe()

In [None]:
# Create a box-and-whisker plot for all city type fares. 
x_labels = ["Urban","Suburban","Rural"]
fig, ax = plt.subplots(figsize=(10,6))
fare_count_data = [urban_fares, suburban_fares, rural_fares]
ax.boxplot(fare_count_data, labels = x_labels )
# Add the title, y - axis and lebel and grid. 
ax.set_title('Ride Fare Data (2019)', fontsize=20)
ax.set_ylabel('Fare($USD)', fontsize=14)
ax.set_xlabel('City Types', fontsize=14)
ax.set_yticks(np.arange(0,55, step = 5.0))
ax.grid()
plt.savefig("analysis/Fig3.png")
plt.show()

In [None]:
# Create the box-and-whisker plot for the urban driver count data.
x_labels = ["Urban"]
fig, ax = plt.subplots()
ax.boxplot(urban_drivers,labels=x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Driver Count Data (2019)')
ax.set_ylabel('Number of Drivers)')
ax.set_yticks(np.arange(0, 90, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
urban_drivers.describe()

In [None]:
# Create the box-and-whisker plot for the Suburban driver count data.
x_labels = ["Suburban"]
fig, ax = plt.subplots()
ax.boxplot(suburban_drivers,labels=x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Driver Count Data (2019)')
ax.set_ylabel('Number of Drivers)')
ax.set_yticks(np.arange(0, 90, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
suburban_drivers.describe()

In [None]:
# Create the box-and-whisker plot for the rural driver count data.
x_labels = ["Rural"]
fig, ax = plt.subplots()
ax.boxplot(rural_drivers,labels=x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Driver Count Data (2019)')
ax.set_ylabel('Number of Drivers)')
ax.set_yticks(np.arange(0, 90, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
suburban_drivers.describe()

In [None]:
# Create a box-and-whisker plot for all city type drivers.
x_labels = ["Urban","Suburban","Rural"]
fig, ax = plt.subplots(figsize=(10,6))
driver_count_data = [urban_drivers, suburban_drivers, rural_drivers]
ax.boxplot(driver_count_data, labels = x_labels )
# Add the title, y - axis and lebel and grid. 
ax.set_title('Driver Count Data (2019)', fontsize=20)
ax.set_ylabel('Driver Count', fontsize=14)
ax.set_xlabel('City Types', fontsize=14)
ax.set_yticks(np.arange(0,75, step = 5.0))
ax.grid()
plt.savefig("analysis/Fig4.png")
plt.show()

In [None]:
# Get the sum of the fares for each city type
sum_fares_by_type = pyber_data_df.groupby(["type"]).sum()["fare"]
sum_fares_by_type

In [None]:
# Get the sum of all the fares.
total_fares = pyber_data_df["fare"].sum()
total_fares

In [None]:
# Calculate the percentage of fare for each city type. 
type_percents = 100 * sum_fares_by_type / total_fares
type_percents

In [None]:
# Build the percentage of fares by city type pie chart. 
plt.subplots(figsize=(10,6))
plt.pie(type_percents, 
        labels=["Rural","Suburban","Urban"],
       colors=["gold","lightskyblue","lightcoral"],
       explode=[0,0,0.1],
       autopct="%1.1f%%",
       shadow=True, startangle=150)
plt.title("% of Total Fares by City Type")
mpl.rcParams["font.size"]=14
plt.savefig("analysis/Fig5.png")
plt.show()

In [None]:
# Calculate the percentage of rides for each city type.
ride_percents = 100 * pyber_data_df.groupby(["type"]).count()["ride_id"] / pyber_data_df["ride_id"].count()
ride_percents

In [None]:
#Build the percentage of rides by city type pie chart
plt.subplots(figsize = (10,6))
plt.pie(
    ride_percents,
    labels=["Rural","Urban", "Suburban"],
    colors=["gold","lightskyblue","lightcoral"],
    explode=[0,0,0.1],
    autopct="%1.1f%%",
    shadow = True, startangle=150)
plt.title("% of Total Rides by City type")
# Change the defualt font size
mpl.rcParams["font.size"] = 14
# Save figure
plt.savefig('analysis/Fig6.png')
plt.show()

In [None]:
driver_percents = 100 * city_data_df.groupby(["type"]).sum()["driver_count"] / city_data_df["driver_count"].sum()
driver_percents

In [None]:
#Build the percentage of rides by city type pie chart
plt.subplots(figsize = (10,6))
plt.pie(
    driver_percents,
    labels=["Rural","Urban", "Suburban"],
    colors=["gold","lightskyblue","lightcoral"],
    explode=[0,0,0.1],
    autopct="%1.1f%%",
    shadow = True, startangle=150)
plt.title("% of Total Drivers by City type")
# Change the defualt font size
mpl.rcParams["font.size"] = 14
# Save figure
plt.savefig('analysis/Fig7.png')
plt.show()