In [None]:
# Add Matplotlib inline magic command
%matplotlib inline
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import os


In [None]:
# Relative locations of the two input CSV files (city-level and ride-level data).
city_data_to_load, ride_data_to_load = (
    "Resources/city_data.csv",
    "Resources/ride_data.csv",
)

In [None]:
# Load the city CSV into a pandas DataFrame, then preview its first ten rows.
city_data_df = pd.read_csv(filepath_or_buffer=city_data_to_load)
city_data_df.head(n=10)

In [None]:
# Load the ride CSV into a pandas DataFrame, then preview its first ten rows.
ride_data_df = pd.read_csv(filepath_or_buffer=ride_data_to_load)
ride_data_df.head(n=10)

In [None]:
# Count the non-null entries in each column of the city DataFrame.
city_data_df.count()

In [None]:
# Count the null values in each column of the city DataFrame (check currently disabled):
# city_data_df.isnull().sum()

In [None]:
# Show the data type of each column in the city DataFrame.
city_data_df.dtypes

In [None]:
# List the distinct values found in the "type" column. unique() works on a
# single column, so the column has to be selected by name first.
city_data_df["type"].unique()

In [None]:
# Get the number of data points (cities) for each city type.
# A notebook cell renders only its last expression, so three separate
# sum(...) lines would display just the final (Suburban) count.
# value_counts() reports every type in a single output instead.
city_data_df["type"].value_counts()

In [None]:
# Count the null values in each column of the ride DataFrame
# (isnull() flags each missing cell; sum() totals the flags per column).
ride_data_df.isnull().sum()

In [None]:
# Count the non-null entries in each column of the ride DataFrame.
ride_data_df.count()

In [None]:
# Show the data type of each column in the ride DataFrame.
ride_data_df.dtypes

In [None]:
# Syntax to merge two DataFrames:
# new_df = pd.merge(left_df, right_df, how="left" | "right" | "inner" | "outer", on="shared_column")
# (use left_on=... / right_on=... instead of on=... when the key columns have different names)

In [None]:
# Combine the ride and city data into one DataFrame.
# A left join keeps every ride row and attaches the matching city-level
# columns (e.g. "type"). Both frames share the "city" key, so it is named
# once rather than repeated in a list.
pyber_data_df = pd.merge(ride_data_df, city_data_df, how="left", on="city")

# Preview the first rows instead of rendering the whole merged frame.
pyber_data_df.head()

In [None]:
# Subset of the merged data restricted to rows whose city type is "Urban".
urban_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Urban")]
urban_cities_df.head()

In [None]:
# Subset of the merged data restricted to rows whose city type is "Rural".
rural_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Rural")]
rural_cities_df

In [None]:
# Subset of the merged data restricted to rows whose city type is "Suburban".
suburban_cities_df = pyber_data_df.loc[pyber_data_df["type"].eq("Suburban")]
suburban_cities_df

In [None]:
# Number of rides recorded for each suburban city (non-null ride_id values per city).
sub_ride_count = suburban_cities_df.groupby("city")["ride_id"].count()
sub_ride_count.head()

In [None]:
# Number of rides recorded for each urban city (non-null ride_id values per city).
urban_ride_count = urban_cities_df.groupby("city")["ride_id"].count()
urban_ride_count.head()

In [None]:
# Number of rides recorded for each rural city (non-null ride_id values per city).
rural_ride_count = rural_cities_df.groupby("city")["ride_id"].count()
rural_ride_count.head()

In [None]:
# Get the average fare for each city in the urban cities.
# The "fare" column is selected before aggregating: calling mean() on the
# whole grouped frame tries to average every column, which raises a
# TypeError on non-numeric columns in pandas >= 2.0.
urban_avg_fare = urban_cities_df.groupby("city")["fare"].mean()
urban_avg_fare.head()

In [None]:
# Get the average fare for each city in the rural cities.
# The "fare" column is selected before aggregating: calling mean() on the
# whole grouped frame tries to average every column, which raises a
# TypeError on non-numeric columns in pandas >= 2.0.
rural_avg_fare = rural_cities_df.groupby("city")["fare"].mean()
rural_avg_fare.head()

In [None]:
# Get the average fare for each city in the suburban cities.
# The "fare" column is selected before aggregating: calling mean() on the
# whole grouped frame tries to average every column, which raises a
# TypeError on non-numeric columns in pandas >= 2.0.
sub_avg_fare = suburban_cities_df.groupby("city")["fare"].mean()
sub_avg_fare.head()

In [None]:
# Get the average driver count for each suburban city.
# The "driver_count" column is selected before aggregating: calling mean()
# on the whole grouped frame tries to average every column, which raises a
# TypeError on non-numeric columns in pandas >= 2.0.
sub_driver_count = suburban_cities_df.groupby("city")["driver_count"].mean()
sub_driver_count.head()

In [None]:
# Get the average driver count for each rural city.
# The "driver_count" column is selected before aggregating: calling mean()
# on the whole grouped frame tries to average every column, which raises a
# TypeError on non-numeric columns in pandas >= 2.0.
rural_driver_count = rural_cities_df.groupby("city")["driver_count"].mean()
rural_driver_count.head()

In [None]:
# Get the average driver count for each urban city.
# The "driver_count" column is selected before aggregating: calling mean()
# on the whole grouped frame tries to average every column, which raises a
# TypeError on non-numeric columns in pandas >= 2.0.
urban_driver_count = urban_cities_df.groupby("city")["driver_count"].mean()
urban_driver_count.head()

In [None]:
# First-pass scatter plot for the urban cities: ride count per city against
# average fare, with each marker's area given by that city's driver count.
#
# The series is labelled on the scatter call and plt.legend() runs after it.
# Passing a bare string to plt.legend() treats it as a sequence of per-artist
# labels, and calling legend before anything is drawn gives it no artists.
plt.scatter(urban_ride_count, urban_avg_fare, marker="o", color="black",
            s=urban_driver_count, label="Urban")
plt.xlabel("Ride Count")
plt.ylabel("Avg Fare ($)")
plt.title("Average Fare per Ride")
plt.legend()
plt.show()

In [None]:
# Urban bubble chart: rides per city vs. average fare, marker area scaled by
# ten times the driver count.
#   label      -> text shown for this series in the legend
#   linewidths -> thickness of the outline drawn around each marker
#   alpha      -> transparency of the markers
plt.scatter(
    urban_ride_count,
    urban_avg_fare,
    s=urban_driver_count * 10,
    c="coral",
    edgecolor="black",
    linewidths=1,
    alpha=0.8,
    label="Urban",
)

# Axis labels, title and grid.
plt.xlabel("Total Number of Rides (Per City)")
plt.ylabel("Average Fare ($)")
plt.title("PyBer Ride-Sharing Data (2019)")
plt.grid(True)

# The legend picks up the label passed to scatter above.
plt.legend()
plt.show()

In [None]:
# Suburban bubble chart: rides per city vs. average fare, marker area scaled
# by ten times the driver count.
# The legend label names the series ("Suburban") instead of a placeholder,
# and the chart carries the same title as the urban and rural charts.
plt.scatter(sub_ride_count, sub_avg_fare, color="blue", marker="o", label="Suburban",
            s=sub_driver_count * 10, alpha=.5, edgecolor="black")

plt.title("PyBer Ride-Sharing Data (2019)")
plt.ylabel("Average Fare ($)")
plt.xlabel("Total Rides per City")

plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Rural bubble chart: rides per city vs. average fare, marker area scaled by
# ten times the driver count.
plt.scatter(rural_ride_count,
      rural_avg_fare,
      s=10*rural_driver_count, c="gold",
      edgecolor="black", linewidths=1,
      alpha=0.8, label="Rural")
plt.title("PyBer Ride-Sharing Data (2019)")
plt.ylabel("Average Fare ($)")
plt.xlabel("Total Number of Rides (Per City)")
plt.grid(True)
# Add the legend.
plt.legend()
# End with plt.show(), like the other chart cells, so the cell renders only
# the figure and not the repr of the Legend object returned by plt.legend().
plt.show()

In [None]:
# Combined bubble chart of all three city types: rides per city vs. average
# fare, with marker area driven by each city's driver count.
#
# The explicit Figure/Axes interface is used throughout. In particular the
# axis labels are set with ax.set_xlabel()/ax.set_ylabel(): writing
# `plt.ylabel = "..."` does not label anything and instead replaces the
# pyplot function with a string, breaking every later plt.ylabel(...) call
# until the kernel is restarted.
fig, ax = plt.subplots(figsize=(8, 6))

# NOTE(review): the size multipliers differ between series (3, 3, 4), so
# bubble areas are not strictly comparable across city types -- confirm
# whether a single common scale factor was intended.
# Rural marker size uses the driver count (not the average fare) so that it
# matches the other two series and the note printed beside the chart.
ax.scatter(rural_ride_count, rural_avg_fare, alpha=0.8, color="gold",
           s=rural_driver_count * 3, linewidths=1, edgecolor="black", label="Rural")

ax.scatter(urban_ride_count, urban_avg_fare, alpha=.7, color="coral",
           s=urban_driver_count * 3, linewidths=1, edgecolor="black", label="Urban")

ax.scatter(sub_ride_count, sub_avg_fare, alpha=.8, color="skyblue",
           s=sub_driver_count * 4, linewidths=1, edgecolor="black", label="Suburban")

# Incorporate the other graph properties.
ax.set_title("PyBer Ride-Sharing Data (2019)", fontsize=20)
ax.set_ylabel("Average Fare ($)", fontsize=12)
ax.set_xlabel("Total Number of Rides (Per City)", fontsize=12)
ax.grid(True)

# Create the legend once. The only recognised non-default `mode` is "expand",
# so the unrecognised value "Expanded" had no effect and is dropped.
lgnd = ax.legend(fontsize=12, scatterpoints=1, loc="best", title="City Types")

# Give every legend marker the same size regardless of the plotted bubble sizes.
# `legend_handles` is the current attribute name; fall back to the older
# `legendHandles` spelling on matplotlib releases that predate it.
legend_markers = lgnd.legend_handles if hasattr(lgnd, "legend_handles") else lgnd.legendHandles
for marker_handle in legend_markers:
    marker_handle.set_sizes([75])
lgnd.get_title().set_fontsize(12)

# Incorporate a text label about circle size (positioned in data coordinates).
ax.text(42, 35, "Note: Circle size correlates with driver count per city.", fontsize=12)

# Make sure the output folder exists before saving. bbox_inches="tight"
# grows the saved canvas to include the note, which presumably sits to the
# right of the plotted data and could otherwise be cut off.
os.makedirs("analysis", exist_ok=True)
fig.savefig("analysis/Fig1.png", bbox_inches="tight")
plt.show()