In [1]:
#Load the CSV files  - import the Pandas and Matplotlib library with the Pyplot Module and run the cell:
# Add Matplotlib inline magic command
%matplotlib inline
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
#declare variables that connect to the CSV files in the Resources folder
# Files to load
city_data_to_load = "Resources/city_data.csv"
ride_data_to_load = "Resources/ride_data.csv"

In [3]:
# Read the city data file and store it in a pandas DataFrame.
city_data_df = pd.read_csv(city_data_to_load)
city_data_df.head(10)

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban
3,Rodneyfort,34,Urban
4,West Robert,39,Urban
5,West Anthony,70,Urban
6,West Angela,48,Urban
7,Martinezhaven,25,Urban
8,Karenberg,22,Urban
9,Barajasview,26,Urban


In [4]:
#Read the Ride Data file
# Read the ride data file and store it in a pandas DataFrame.
ride_data_df = pd.read_csv(ride_data_to_load)
ride_data_df.head(10)

Unnamed: 0,city,date,fare,ride_id
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344
5,South Latoya,2019-03-11 12:26:48,9.52,1994999424437
6,New Paulville,2019-02-27 11:17:56,43.25,793208410091
7,Simpsonburgh,2019-04-26 00:43:24,35.98,111953927754
8,South Karenland,2019-01-08 03:28:48,35.09,7995623208694
9,North Jasmine,2019-03-09 06:26:29,42.81,5327642267789


In [5]:
#Inspecting the City Data Frame involves the below
#When we inspect data such as the city data frame we must consider:
#Get all the rows that contain null values
#Make sure the driver_count column has an integer data type
#Find out how many data points there are for each type of city

In [6]:
#Use the df.count() to find the names of our columns and the numbers of rows that are not null
# Get the columns and the rows that are not null.
city_data_df.count()

city            120
driver_count    120
type            120
dtype: int64

In [7]:
#make sure that there are no null values
# Get the columns and the rows that are not null.
city_data_df.isnull().sum()

city            0
driver_count    0
type            0
dtype: int64

In [8]:
#See if the driver_count column has a numerical data type as we plan to perform math calculations on this column
#To get the data types of each column, we use the dtypes on the DataFrame
# Get the data types of each column.
city_data_df.dtypes

city            object
driver_count     int64
type            object
dtype: object

In [9]:
#Check how many data points there are for each type of city using sum()
#We can use unique() on a specific column which will return an array or list, of all the unique values of the column
# Get the unique values of the type of city.
city_data_df["type"].unique()

array(['Urban', 'Suburban', 'Rural'], dtype=object)

In [10]:
#We can use the sum() method on the city_data_df for the type of column where the conditions equal either Urban, Suburban and Rural
# Get the number of data points from the Urban cities.
sum(city_data_df["type"]=="Urban")

66

In [11]:
#Get the number of data points from the Suburban cities
sum(city_data_df["type"]=="Suburban")

36

In [12]:
#Get the number of data points from the Rural cities
sum(city_data_df["type"]=="Rural")

18

In [13]:
#Inspect the Ride DataFrame
#Get all the rows that contain null values
#Make sure the fare and ride_id columns are numerical data types
# Get the columns and the rows that are not null.
ride_data_df.count()

city       2375
date       2375
fare       2375
ride_id    2375
dtype: int64

In [14]:
#Make sure there are no null values
#Get the columns and the rows that are not null.
ride_data_df.isnull().sum()

city       0
date       0
fare       0
ride_id    0
dtype: int64

In [15]:
#Determin if the fare and ride_id columns are numerical data types so that we can perform math calculations on these columns
# Get the data types of each column.
ride_data_df.dtypes

city        object
date        object
fare       float64
ride_id      int64
dtype: object

In [16]:
#The 2 Dataframes have 'city' as a similar column between the 2 dataframes
#Merge the two Dataframes on the city column, then add the city_data_df to the end of the ride_data_df
# Combine the data into a single dataset
pyber_data_df = pd.merge(ride_data_df, city_data_df, how="left", on=["city", "city"])

# Display the DataFrame
pyber_data_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003,57,Urban
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178,34,Urban
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344,46,Urban


In [17]:
#Get the average fare using mean() for each "fare" column for the y-axis
avg_fare = pyber_data_df["fare"].mean()
avg_fare

26.753111578947426

In [18]:
#Get the total number of rides for each type city on the x-axis
city_rides = pyber_data_df["ride_id"].count()
city_rides

2375

In [19]:
#DataFrame 1 - Urban
urban_cities_df = pyber_data_df[pyber_data_df["type"] == "Urban"]
urban_cities_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003,57,Urban
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178,34,Urban
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344,46,Urban


In [20]:
#DataFrame 2 - 

In [21]:
#Get the total number of Urban rides - DataFrame
ride_type_urban = pyber_data_df.loc[(pyber_data_df["type"] == "Urban") & pyber_data_df["ride_id"]]
ride_type_urban

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003,57,Urban
5,South Latoya,2019-03-11 12:26:48,9.52,1994999424437,10,Urban
6,New Paulville,2019-02-27 11:17:56,43.25,793208410091,44,Urban
...,...,...,...,...,...,...
1616,Simpsonburgh,2019-03-03 08:45:54,7.63,4176780124147,21,Urban
1617,West Angela,2019-05-07 01:03:12,12.35,2050512349119,48,Urban
1621,Christopherfurt,2019-03-13 01:47:52,32.05,2788817929605,41,Urban
1622,Raymondhaven,2019-03-20 02:20:34,38.53,8658255136559,11,Urban


In [22]:
#Get the total number value of urban rides 
ride_type_urban_count = len(ride_type_urban["ride_id"].unique())
ride_type_urban_count = ride_type_urban["ride_id"].count()
ride_type_urban_count

829

In [23]:
#Get the total number of Suburban rides (the woopwoop) - DataFrame
ride_type_suburban = pyber_data_df.loc[(pyber_data_df["type"] == "Suburban") & pyber_data_df["ride_id"]]
ride_type_suburban

Unnamed: 0,city,date,fare,ride_id,driver_count,type
1625,Barronchester,2019-01-27 03:08:01,27.79,6653622887913,11,Suburban
1627,Lake Omar,2019-01-17 21:33:35,21.71,966911700371,22,Suburban
1629,West Hannah,2019-04-19 01:06:59,37.78,2273047151891,12,Suburban
1633,Josephside,2019-03-30 09:45:11,44.74,5592692102803,25,Suburban
1634,Brandonfort,2019-01-14 12:35:27,28.02,8100258078935,10,Suburban
...,...,...,...,...,...,...
2238,South Teresa,2019-03-05 19:18:37,32.29,4981692995989,21,Suburban
2241,Sotoville,2019-03-13 16:06:10,14.36,8629533024103,10,Suburban
2242,West Hannah,2019-01-21 19:54:08,41.95,16507766547,12,Suburban
2243,South Teresa,2019-03-09 09:54:33,25.08,8136313906059,21,Suburban


In [24]:
ride_type_suburban_count = len(ride_type_suburban["ride_id"].unique())
ride_type_suburban_count = ride_type_suburban["ride_id"].count()
ride_type_suburban_count

312

In [25]:
#Get the total number of Suburban rides (the woopwoop) - DataFrame
ride_type_rural = pyber_data_df.loc[(pyber_data_df["type"] == "Rural") & pyber_data_df["ride_id"]]
ride_type_rural

Unnamed: 0,city,date,fare,ride_id,driver_count,type
2250,Randallchester,2019-04-13 11:13:31,43.22,1076079536213,9,Rural
2253,Lake Latoyabury,2019-02-23 21:12:24,47.90,3269652929887,2,Rural
2255,Taylorhaven,2019-04-06 01:01:29,45.31,2275364435623,1,Rural
2256,Garzaport,2019-02-18 18:27:48,36.16,2928337561347,7,Rural
2257,New Ryantown,2019-01-27 17:33:41,42.68,7994603753131,2,Rural
...,...,...,...,...,...,...
2363,Michaelberg,2019-03-13 14:40:18,37.72,8842606115175,6,Rural
2364,Lake Jamie,2019-02-25 23:46:02,28.65,5815763839331,4,Rural
2365,Lake Jamie,2019-04-19 04:32:47,16.29,3518682119233,4,Rural
2368,Lake Jamie,2019-04-29 01:58:44,54.22,2489264790267,4,Rural


In [27]:
ride_type_rural_count = len(ride_type_rural["ride_id"].unique())
ride_type_rural_count = ride_type_rural["ride_id"].count()
ride_type_rural_count

61

In [28]:
ride_type_rural.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
2250,Randallchester,2019-04-13 11:13:31,43.22,1076079536213,9,Rural
2253,Lake Latoyabury,2019-02-23 21:12:24,47.9,3269652929887,2,Rural
2255,Taylorhaven,2019-04-06 01:01:29,45.31,2275364435623,1,Rural
2256,Garzaport,2019-02-18 18:27:48,36.16,2928337561347,7,Rural
2257,New Ryantown,2019-01-27 17:33:41,42.68,7994603753131,2,Rural
