In [26]:
# IPython magic: select Matplotlib's inline backend so that figures are
# embedded in the notebook output below the cell that draws them.
%matplotlib inline

In [27]:
# Dependencies and Setup

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import statistics

In [28]:
# Load the city data set (one row per city: name, driver count, city type).
# The path is relative to the directory the notebook is run from.
PyBer_city_data = os.path.join(
    '..',
    'PyBer_Analysis',
    'Resources',
    'city_data.csv',
)
city_data_df = pd.read_csv(PyBer_city_data, encoding="UTF-8")

# Preview the first five rows.
city_data_df.head()

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban
3,Rodneyfort,34,Urban
4,West Robert,39,Urban


In [29]:
# Load the ride data set (one row per ride: city, date, fare, ride id).
# The path is relative to the directory the notebook is run from.
PyBer_ride_data = os.path.join(
    '..',
    'PyBer_Analysis',
    'Resources',
    'ride_data.csv',
)
ride_data_df = pd.read_csv(PyBer_ride_data, encoding="UTF-8")

# Preview the first ten rows.
ride_data_df.head(10)

Unnamed: 0,city,date,fare,ride_id
0,Lake Jonathanshire,14/01/2019 10:14,13.83,5739410935873
1,South Michelleport,04/03/2019 18:24,30.24,2343912425577
2,Port Samanthamouth,24/02/2019 04:29,33.44,2005065760003
3,Rodneyfort,10/02/2019 23:22,23.44,5149245426178
4,South Jack,06/03/2019 04:28,34.58,3908451377344
5,South Latoya,11/03/2019 12:26,9.52,1994999424437
6,New Paulville,27/02/2019 11:17,43.25,793208410091
7,Simpsonburgh,26/04/2019 00:43,35.98,111953927754
8,South Karenland,08/01/2019 03:28,35.09,7995623208694
9,North Jasmine,09/03/2019 06:26,42.81,5327642267789


# Inspecting the data in both files

In [30]:
## Inspect the city_data

# Count the missing values in each column. Only the final expression of a
# cell is displayed, so the per-column totals are computed directly instead
# of first building a boolean mask whose result would be thrown away.
# A total of 0 for every column means there is nothing to clean.
city_data_df.isnull().sum()

city            0
driver_count    0
type            0
dtype: int64

In [31]:
# Check that driver_count has an integer data type by listing the dtype of
# every column. This is the only expression in the cell because a notebook
# displays just the last expression; earlier bare expressions are computed
# and silently discarded.
city_data_df.dtypes

city            object
driver_count     int64
type            object
dtype: object

In [32]:
# Count the data points (cities) recorded for each city type.
# groupby(...).count() returns, per type, the number of non-null entries in
# each remaining column, so the "driver_count" column shown below holds the
# number of rows per type -- not a number of drivers.
type_city_count = city_data_df.groupby("type").count()

# Keep a single count column and list the types from fewest to most rows.
type_city_count[["driver_count"]].sort_values(by="driver_count")

Unnamed: 0_level_0,driver_count
type,Unnamed: 1_level_1
Rural,18
Suburban,36
Urban,66


In [33]:
## Inspect the ride_data

# Count the missing values in each column (this reports totals per column,
# it does not return the rows that contain nulls). Only the final expression
# of a cell is displayed, so the totals are computed directly.
# A total of 0 for every column means there is nothing to clean.
ride_data_df.isnull().sum()

city       0
date       0
fare       0
ride_id    0
dtype: int64

In [36]:
# Make sure the fare and ride_id columns are numerical data types.
# NOTE(review): the saved output lists ride_id as int32, yet the ride_data
# preview earlier in the notebook shows IDs such as 5739410935873, which do
# not fit in 32 bits, and the execution counter skips from 33 to 36.
# Presumably a since-removed cell downcast the column -- confirm with
# Restart & Run All, and keep ride_id as int64 so the IDs are not truncated.
ride_data_df.dtypes

city        object
date        object
fare       float64
ride_id      int32
dtype: object

# Merge DataFrames

In [37]:
# Combine the data into a single dataset.
# A left join keeps every row of ride_data_df and attaches the matching
# city's driver_count and type; "city" is the one shared key column, so it
# is named once rather than repeated in a list.
pyber_data_df = pd.merge(ride_data_df, city_data_df, how="left", on="city")

# Display the first five rows of the merged DataFrame
pyber_data_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,14/01/2019 10:14,13.83,1334628417,5,Urban
1,South Michelleport,04/03/2019 18:24,30.24,-1139718039,72,Urban
2,Port Samanthamouth,24/02/2019 04:29,33.44,-683967229,57,Urban
3,Rodneyfort,10/02/2019 23:22,23.44,-420361726,34,Urban
4,South Jack,06/03/2019 04:28,34.58,31137984,46,Urban
