# Scooter Project

In [1]:
import os
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
from sqlalchemy import create_engine, text

In [None]:
# Connection settings for the PostgreSQL instance. Each value can be overridden
# through an environment variable so that real credentials never have to be
# written into the notebook; the defaults reproduce the original local setup.
database_name = 'scooters'

db_user = os.environ.get('SCOOTERS_DB_USER', 'postgres')
db_password = os.environ.get('SCOOTERS_DB_PASSWORD', 'postgres')
db_host = os.environ.get('SCOOTERS_DB_HOST', 'localhost')
db_port = os.environ.get('SCOOTERS_DB_PORT', '5433')

# quote_plus keeps characters such as '@' or '/' in the user name or password
# from breaking the URL structure.
connection_string = (
    f"postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{database_name}"
)

In [None]:
# Echo the connection string to confirm the target database.
# Note: the displayed value includes the password embedded in the URL.
connection_string

In [None]:
# Build the SQLAlchemy engine from the connection string; this one engine is
# reused by every query cell below.
engine = create_engine(connection_string)

In [None]:
# Query for a sample of up to 3,000,000 rows from the scooters table.
# The query is only defined here and is executed once, by the pd.read_sql call
# in the following cell (running it a second time just to discard the result
# would fetch the same rows twice).
# NOTE(review): there is no ORDER BY, so which rows LIMIT returns is not guaranteed.
query = '''
SELECT *
FROM scooters
LIMIT 3000000;
'''

In [None]:
# Run the scooters query on a short-lived connection and load the rows into a DataFrame.
with engine.connect() as connection:
    scooters = pd.read_sql(sql=text(query), con=connection)

In [None]:
# Preview the first five rows of the scooters sample.
scooters.head(n=5)

In [None]:
# (rows, columns) of the scooters sample that was loaded.
scooters.shape

In [None]:
# The trips table is read through the engine that was already created for the
# 'scooters' database (the engine is never rebuilt, and scooters and trips are
# later joined in a single query), so the database name stays 'scooters' and
# no new connection string is needed.
database_name = 'scooters'

In [None]:
# Echo the connection string (the displayed value includes the embedded password).
# NOTE(review): the engine is not rebuilt from this value, so the queries below
# still run through the engine created earlier.
connection_string

In [None]:
# Query for every row of the trips table.
# The query is only defined here and is executed once, by the pd.read_sql call
# in the following cell (running it a second time just to discard the result
# would fetch the whole table twice).
query = '''
SELECT *
FROM trips;
'''

In [None]:
# Run the trips query on a short-lived connection and load the rows into a DataFrame.
with engine.connect() as connection:
    trips = pd.read_sql(sql=text(query), con=connection)

In [None]:
# Preview the first five rows of the trips table.
trips.head(n=5)

In [None]:
# (rows, columns) of the trips table.
trips.shape

# Data cleaning

Checking for null values in the scooters and trips tables.

In [None]:
# Number of missing values in each column of scooters.
scooters.isnull().sum()

To deal with the missing data, I use the dropna() function.

In [None]:
# Drop the rows whose chargelevel is missing. dropna() returns a new object, so
# the result has to be assigned back; calling it on its own leaves `scooters`
# unchanged. Re-running this cell is harmless (there is nothing left to drop).
scooters = scooters.dropna(subset=['chargelevel'])

# Confirm that no missing chargelevel values remain.
scooters['chargelevel'].isna().sum()

Checking datatypes of data in both tables

In [None]:
#Investigate any values that seem odd.

In [None]:
# Column names, non-null counts and dtypes for the scooters table.
scooters.info()

In [None]:
# Column names, non-null counts and dtypes for the trips table.
trips.info()

    In the trips table, the startdate and enddate columns need to be converted to datetime format

In [None]:
# Convert startdate to datetime and write it back into the trips table; without
# the assignment the column inside `trips` keeps its original dtype.
trips['startdate'] = pd.to_datetime(trips['startdate'])

# Keep the standalone name available as well, then display the converted column.
startdate = trips['startdate']
startdate

In [None]:
# Convert enddate to datetime and write it back into the trips table; without
# the assignment the column inside `trips` keeps its original dtype.
trips['enddate'] = pd.to_datetime(trips['enddate'])

# Keep the standalone name available as well, then display the converted column.
enddate = trips['enddate']
enddate

In [None]:
# Check the dtype of every column in the trips table.
trips.dtypes

In [None]:
# Range of each date column

In [None]:
# Summary of the pubdatetime column; the earliest and latest values give the range noted below.
scooters['pubdatetime'].describe()

    pubdatetime range is from 2019-07-25 04:19:43 to 2019-07-29 02:38:39.347000

In [None]:
# trips table date time range

In [None]:
# Summary of the pubtimestamp column; the earliest and latest values give the range noted below.
trips['pubtimestamp'].describe()

pubtimestamp range is from 2019-05-01 00:00:55.423000 to 2019-08-01 07:04:00

In [None]:
# Summary of the create_dt column; the earliest and latest values give the range noted below.
trips['create_dt'].describe()

The create_dt column ranges from 2019-05-02 05:30:23.780000 to 2019-08-02 11:30:29.923000

In [None]:
# We can also use the min() and max() functions

In [None]:
# It is a 24 hour format.

In [None]:
#trips['starttime'] = trips['starttime'].dt.strftime(strftime('%H:%M:%S.%f'))
#trips['starttime']

In [None]:
# What values are there in the sumdgroup column?

    Values in the sumdgroup column are:

In [None]:
# Number of rows for each distinct sumdgroup value.
scooters['sumdgroup'].value_counts()

In [None]:
# Are there any that are not of interest for this project? 

    Yes, 986 records belong to the bicycle group, which is not of interest for this scooter project

In [None]:
# Uniformity between the scooter labels

In [None]:
# Rows whose sumdgroup is anything other than exactly 'Scooter'.
scooters.loc[scooters['sumdgroup'].ne('Scooter')]

In [None]:
# Make the labels uniform by capitalizing the first letter of 'scooter'

In [None]:
# Rewrite the lower-case 'scooter' label as 'Scooter' so both spellings fall
# into one group, then display the updated column.
scooters['sumdgroup'] = scooters['sumdgroup'].str.replace(
    pat='scooter',
    repl='Scooter',
    regex=False,
)
scooters['sumdgroup']

In [None]:
# Percentage of each group for this project

In [None]:
# Share of rows in each sumdgroup (normalize=True returns proportions instead of counts).
scooters['sumdgroup'].value_counts(normalize = True)

In [None]:
# What are the minimum and maximum values for all the latitude and longitude columns? 

In [None]:
# Summary statistics for scooters; the min and max rows answer the latitude/longitude question.
scooters.describe()

In [None]:
# scooters['latitude'].min() = 0.000000e+00

In [None]:
# scooters['latitude'].max()= 3.629644e+01

In [None]:
# scooters['longitude'].min()= -9.744388e+01

In [None]:
#scooters['longitude'].max() =0.000000e+00

In [None]:
# Do these ranges make sense, or is there anything surprising?

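Not entirely. A minimum latitude of 0 and a maximum longitude of 0 are surprising: they sit far away from the rest of the coordinates and look like placeholder or missing values that should be investigated. The minimum longitude of about -97.44 is a valid value in itself, because longitude ranges from -180 to 180 degrees (it is latitude that is limited to -90 to 90 degrees).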

In [None]:
# What is the range of values for trip duration and trip distance? 
# Do these values make sense? Explore values that might seem questionable.

In [None]:
# Summary statistics for trip duration; min and max give the range asked about above.
trips['tripduration'].describe()

In [None]:
# Summary statistics for trip distance; min and max give the range asked about above.
trips['tripdistance'].describe()

Exploring values: it seems there are some outliers in both columns.

In [None]:
# Box plot of trip distance; points far from the box are the candidate outliers.
# A title is set so the figure can be understood on its own.
ax = sns.boxplot(data=trips, x='tripdistance')
ax.set_title('Distribution of trip distance');

In [None]:
# Box plot of trip duration; points far from the box are the candidate outliers.
# A title is set so the figure can be understood on its own.
ax = sns.boxplot(data=trips, x='tripduration')
ax.set_title('Distribution of trip duration');

In [None]:
# Both box plots show outliers

In [None]:
# Single-column frame holding only the trip distances, displayed for inspection.
x = trips[['tripdistance']]
x

In [None]:
# Trip distances from largest to smallest, to see the most extreme values first.
(
    x
    .reset_index(drop=True)
    .sort_values('tripdistance', ascending=False)
)

In [None]:
# Check out how the values for the company name column in the scooters table compare to those of the trips table.
# What do you notice?

In [None]:
# Number of rows per company name in the scooters table.
scooters['companyname'].value_counts()

In [None]:
# Distinct company names present in the scooters table.
pd.unique(scooters['companyname'])

In [None]:
# Number of rows per company name in the trips table.
trips['companyname'].value_counts()

In [None]:
# Distinct company names present in the trips table.
pd.unique(trips['companyname'])

In [None]:
# I noticed Bolt Mobility appears only in trips
# SPIN and JUMP are in capital letters as well


# Exploratory Analysis

In [None]:
# Joining both_tables

In [None]:
# Query joining scooters and trips on sumdid, limited to 100,000 rows.
# The query is only defined here and is executed once, by the pd.read_sql call
# in the following cell (running it a second time just to discard the result
# would repeat the join for nothing).
query = '''
SELECT *
FROM scooters inner join trips using (sumdid)
LIMIT 100000;
'''


In [None]:
# Run the join query on a short-lived connection and load the rows into a DataFrame.
with engine.connect() as connection:
    both_tables = pd.read_sql(sql=text(query), con=connection)

In [None]:
# Preview the first five rows of the joined result.
both_tables.head(n=5)

In [None]:
# (rows, columns) of the joined result.
both_tables.shape

In [None]:
# Looking at the Lime company:
# maximum tripdistance per scooter per day, sumdgroup, count of sumdgroup

In [None]:
# Aggregate query over the scooters/trips join, restricted to Lime in both tables.
# The query is only defined here and is executed once, by the pd.read_sql call
# in the following cell (running it a second time just to discard the result
# would repeat the join and aggregation for nothing).
# NOTE(review): the query groups by tripdistance itself, so max(tripdistance)
# equals tripdistance in every output row. A maximum per scooter per day would
# need to group by the scooter id and the trip date instead -- confirm the
# intended grouping before relying on max_distance.
query = '''
SELECT tripdistance, max(tripdistance) as max_distance, sumdgroup, count(sumdgroup) as number_sumdgroup
FROM scooters inner join trips using (sumdid)
WHERE scooters.companyname = 'Lime'
   AND trips.companyname= 'Lime'
GROUP BY tripdistance, sumdgroup
LIMIT 100000;
'''


In [None]:
# Run the Lime aggregate query and load the result; this rebinds `both_tables`
# to the aggregated rows.
with engine.connect() as connection:
    both_tables = pd.read_sql(sql=text(query), con=connection)

In [None]:
# (rows, columns) of the Lime aggregate; `both_tables` now holds this result rather than the earlier join.
both_tables.shape

In [None]:
# 