<a href="https://colab.research.google.com/github/ChrisBarsolai/autolib-data/blob/master/AutoLib_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing our Modules

In [None]:
# Importing pandas
#
import pandas as pd

# Importing our CSV files and creating our dataset.

In [None]:
# Read the raw Autolib CSV from the local dataset folder into a DataFrame
# and preview its first rows.
url = "./dataset/autolib.csv"
df = pd.read_csv(filepath_or_buffer=url)
df.head()

# View our data info

In [None]:
# Getting information from our dataset: df.info() prints a summary of the
# frame (its columns, their dtypes and non-null counts).
#
df.info()

In [None]:
# Summary statistics give a general overview of how the data is distributed.
#
summary_stats = df.describe()
summary_stats

# Data Preparation/Cleaning


># 1.1 Validity

In [None]:
# Procedure 1:
# Data Cleaning Action: Dropping the irrelevant 'Scheduled at' column.
# Explanation: We are dropping it since it is not useful in our analysis.
#
# `drop(..., inplace=True)` returns None, so assigning its result left df_new
# empty. We rebind `df` to the frame returned by `drop` instead, and
# errors='ignore' keeps the cell safe to re-run once the column is gone.
df = df.drop(columns=['Scheduled at'], errors='ignore')
df_new = df
df_new.head()

In [None]:
# Procedure 2:
# Data Cleaning Action: Dropping the irrelevant 'Subscription status' column.
# Explanation: We are dropping it since it is not useful in our analysis.
#
# `drop(..., inplace=True)` returns None, so assigning its result left df_new
# empty. We rebind `df` to the frame returned by `drop` instead, and
# errors='ignore' keeps the cell safe to re-run once the column is gone.
df = df.drop(columns=['Subscription status'], errors='ignore')
df_new = df
df_new.head()

In [None]:
# Procedure 3:
# Data Cleaning Action: Dropping the irrelevant 'Status' column.
# Explanation: We are dropping it since it is not useful in our analysis.
#
# `drop(..., inplace=True)` returns None, so assigning its result left df_new
# empty. We rebind `df` to the frame returned by `drop` instead, and
# errors='ignore' keeps the cell safe to re-run once the column is gone.
df = df.drop(columns=['Status'], errors='ignore')
df_new = df
df_new.head()

In [None]:
# Procedure 4:
# Data Cleaning Action: Dropping the 'Displayed comment' column.
# Explanation: We are dropping it since it is not useful in our analysis.
#
# `drop(..., inplace=True)` returns None, so assigning its result left df_new
# empty. We rebind `df` to the frame returned by `drop` instead, and
# errors='ignore' keeps the cell safe to re-run once the column is gone.
df = df.drop(columns=['Displayed comment'], errors='ignore')
df_new = df
df_new.head()

># 1.2 Accuracy

In [None]:
# Procedure 1:
# Data Cleaning Action: Flag, per column, whether it holds any null value.
# Explanation: The resulting boolean Series tells us which columns contain
# at least one missing entry.
#
null_mask = df.isnull()
df_null = null_mask.any()
df_null

># 1.3 Completeness

In [None]:
# Procedure 1:
# Data Cleaning Action: Count the duplicated rows.
# Explanation: `duplicated()` marks every row that repeats an earlier one;
# summing the marks gives the number of duplicate rows.
#
duplicate_mask = df.duplicated()
df_dup = duplicate_mask.sum()
df_dup

># 1.4 Consistency

In [None]:
# Procedure 1:
# Data Cleaning Action: Dropping rows in which every value is null.
# Explanation: how="all" removes only rows that are completely empty; rows
# with some missing values are kept.
#
# The result is assigned back to `df` so the following cleaning steps and the
# exported CSV actually use the cleaned frame. Previously it was stored only
# in df_new, which no later cell in this notebook reads.
df = df.dropna(how="all")
df_new = df
df_new.head()

># 1.5 Uniformity

In [None]:
# Procedure 1:
# Data Cleaning Action: Strip trailing whitespace from the column names and
# convert them to lowercase.
# Explanation: This gives the dataframe uniform column names.
#
trimmed_names = df.columns.str.rstrip()
df.columns = trimmed_names.str.lower()
df.head(3)

In [None]:
# Procedure 2:
# Data Cleaning Action: Converting row values to lowercase characters.
# Explanation: Every column is first cast to string, then lowercased, so all
# values in the dataframe are uniform lowercase text.
#
df_text = df.astype(str)
df = df_text.apply(lambda column: column.str.lower())
df.head(3)

In [None]:
# Replace the spaces in the column names with underscores.
underscored_names = df.columns.str.replace(" ", "_", regex=True)
df.columns = underscored_names
df.head(2)

In [None]:
# TODO: Split the geopoint column into two columns (not implemented yet).
#


# Exporting our clean csv file.

In [None]:
# Export the cleaned frame. index=False keeps the row index out of the file,
# so reading it back does not produce a spurious "Unnamed: 0" column.
df.to_csv("autolib_clean.csv", index=False)

# Answering Questions


#  *Using Blue cars*

> # *Challenge 1*
> 1. What is the most popular hour for returning cars?

In [None]:
# Challenge 1
# Finding the most popular hour for returning cars.
#
# Reload the cleaned export so the analysis starts from the saved file.
url = "autolib_clean.csv"
df = pd.read_csv(url)
df_x = df[["bluecar_counter", "day", "hour", "minute"]]
# NOTE(review): the original filter (bluecar_counter == 0) is kept as-is;
# whether it identifies returned cars cannot be verified from this notebook
# - TODO confirm against the dataset description.
df_car = df_x[df_x["bluecar_counter"] == 0]
# "Most popular" means most frequent: value_counts() orders the hours by how
# many rows they have (descending), so the first entry is the hour with the
# most rows together with its count. The previous max() only returned the
# largest hour value present.
df_car["hour"].value_counts().head(1)

> # *Challenge 2*
>  What station is the most popular?
      > * Overall?

In [None]:
# Challenge 2
# Rank the stations (addresses) by their highest bluecar_counter value and
# show the top one.
#
df_p = (
    df.groupby(by=['address'])['bluecar_counter']
    .max()
    .sort_values(ascending=False)
)

df_p.head(1)

> # *Challenge 3*
> 3. What station is the most popular?
      > * At the most popular picking hour?

In [None]:
# Challenge 3
# Largest `hour` value recorded per address, sorted from highest to lowest.
# NOTE(review): this ranks stations by their latest recorded hour rather than
# by activity at the most popular picking hour - confirm the intended measure.
#
df_p = (
    df.groupby(by=['address'])['hour']
    .max()
    .sort_values(ascending=False)
)

df_p.head(1)

> # *Challenge 4*
  > 4. What postal code is the most popular for picking up Blue cars? Does the most popular station belong to that postal code?
   > * Overall?

In [None]:
# Challenge 4
# Show the address and postal code of the row with the highest
# bluecar_counter value, to compare the postal code with the station.
#
df_station = df.loc[:, ['address', 'postal_code', 'bluecar_counter']]
df_postal = df_station.sort_values(by='bluecar_counter', ascending=False)
df_postal.head(1)

 > # *Challenge 5*
   > * At the most popular picking hour?

In [None]:
# Challenge 5
# Show the address and postal code of the row with the largest `hour` value.
# NOTE(review): sorting by `hour` surfaces the latest hour in the data, not
# the most popular picking hour - confirm the intended measure.
#
df_station = df.loc[:, ['address', 'postal_code', 'hour']]
df_postal = df_station.sort_values(by='hour', ascending=False)
df_postal.head(1)

> # *Challenge 6*
> 5. Sorting the non-existent, operational and broken charging status in Descending Order.


In [None]:
# Challenge 6
# Total bluecar_counter per charging status, sorted in descending order.
#
df_city = (
    df.groupby(by=['charging_status'])['bluecar_counter']
    .sum()
    .sort_values(ascending=False)
)
df_city.head(16)

> # Using Utilib counter and Utilib 1.4 counter

> # *Challenge 1*
>  What station is the most popular?
      > * Overall?

> a). utilib_counter

In [None]:
# Challenge 1.1
# Rank the stations (addresses) by their highest utilib_counter value and
# show the top one.
#
df_p = (
    df.groupby(by=['address'])['utilib_counter']
    .max()
    .sort_values(ascending=False)
)
df_p.head(1)

> b). utilib_1.4_counter

In [None]:
# Challenge 1.2
# Rank the stations (addresses) by their highest utilib_1.4_counter value and
# show the top one.
#
df_p = (
    df.groupby(by=['address'])['utilib_1.4_counter']
    .max()
    .sort_values(ascending=False)
)
df_p.head(1)

> # *Challenge 2*
  > 2. What postal code is the most popular for picking up Utilib_counter and Utilib_1.4_counter? Does the most popular station belong to that postal code?
   > * Overall?

> a). utilib_counter

In [None]:
# Challenge 2.1
# Show the address and postal code of the row with the highest
# utilib_counter value, to compare the postal code with the station.
#
df_station = df.loc[:, ['address', 'postal_code', 'utilib_counter']]
df_postal = df_station.sort_values(by='utilib_counter', ascending=False)
df_postal.head(1)

> b). utilib_1.4_counter

In [None]:
# Challenge 2.2
# Show the address and postal code of the row with the highest
# utilib_1.4_counter value, to compare the postal code with the station.
#
df_station = df.loc[:, ['address', 'postal_code', 'utilib_1.4_counter']]
df_postal = df_station.sort_values(by='utilib_1.4_counter', ascending=False)
df_postal.head(1)

> # *Challenge 3*
> 5. Sorting the non-existent, operational and broken charging status in Descending Order.


> a). utilib_counter

In [None]:
# Challenge 3.1
# Total utilib_counter per charging status, sorted in descending order.
#
df_city = (
    df.groupby(by=['charging_status'])['utilib_counter']
    .sum()
    .sort_values(ascending=False)
)
df_city

> b). utilib_1.4_counter

In [None]:
# Challenge 3.2
# Total utilib_1.4_counter per charging status, sorted in descending order.
#
df_city = (
    df.groupby(by=['charging_status'])['utilib_1.4_counter']
    .sum()
    .sort_values(ascending=False)
)
df_city

# Research Question

> * *Identify the most popular hour of the day for picking up a shared electric car (Bluecar) in the city of Paris over the month of April 2018.*

In [None]:
# Research question: for the rows whose city is 'paris', take the largest
# `hour` seen for each bluecar_counter value and sort those hours descending.
# NOTE(review): this yields the latest hour per counter value, which is not a
# frequency measure of the most popular picking hour - confirm the intended
# computation.
df_p = df[['city', 'bluecar_counter', 'hour']]  # overwritten below
df_paris = df.loc[df['city'] == 'paris']
df_p = (
    df_paris.groupby(by=['bluecar_counter'])['hour']
    .max()
    .sort_values(ascending=False)
)
df_p