In [None]:
'''In this notebook we visualize Uber's trip dataset using Plotly, a data-visualization library, for
effective and interactive plotting.'''

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
# Load the raw trip log; the path is resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='UberDataset.csv')

In [3]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
0,01-01-2016 21:11,01-01-2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,01-02-2016 01:25,01-02-2016 01:37,Business,Fort Pierce,Fort Pierce,5.0,
2,01-02-2016 20:25,01-02-2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,01-05-2016 17:31,01-05-2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,01-06-2016 14:42,01-06-2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


In [4]:
# Preview the last five rows (tail() defaults to n=5); the output shows that the
# file ends with a 'Totals' summary row rather than an actual trip.
df.tail()

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
1151,12/31/2016 13:24,12/31/2016 13:42,Business,Kar?chi,Unknown Location,3.9,Temporary Site
1152,12/31/2016 15:03,12/31/2016 15:38,Business,Unknown Location,Unknown Location,16.2,Meeting
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site
1154,12/31/2016 22:08,12/31/2016 23:51,Business,Gampaha,Ilukwatta,48.2,Temporary Site
1155,Totals,,,,,12204.7,


In [5]:
# Size of the frame as loaded, as a (rows, columns) tuple.
df.shape

(1156, 7)

In [6]:
# List the column labels available for analysis.
df.columns

Index(['START_DATE', 'END_DATE', 'CATEGORY', 'START', 'STOP', 'MILES',
       'PURPOSE'],
      dtype='object')

In [7]:
# Per-column dtype and non-null count; the output shows the two date columns are
# loaded as plain strings (object dtype), not datetimes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   START_DATE  1156 non-null   object 
 1   END_DATE    1155 non-null   object 
 2   CATEGORY    1155 non-null   object 
 3   START       1155 non-null   object 
 4   STOP        1155 non-null   object 
 5   MILES       1156 non-null   float64
 6   PURPOSE     653 non-null    object 
dtypes: float64(1), object(6)
memory usage: 63.3+ KB


In [8]:
# Count missing values per column before any cleaning.
df.isna().sum()

START_DATE      0
END_DATE        1
CATEGORY        1
START           1
STOP            1
MILES           0
PURPOSE       503
dtype: int64

In [14]:
# Keep trips with no recorded purpose by labelling them 'Unknown', then discard
# every row that still has a gap in any other column.
df = df.fillna({'PURPOSE': 'Unknown'}).dropna()

In [15]:
# Re-count missing values per column to confirm the cleaning step left none.
df.isna().sum()

START_DATE    0
END_DATE      0
CATEGORY      0
START         0
STOP          0
MILES         0
PURPOSE       0
dtype: int64

In [16]:
# Remove fully identical rows, keeping the first occurrence of each.
df = df.drop_duplicates()
# Verification: the number of remaining duplicate rows is displayed.
df.duplicated().sum()

0

In [19]:
# Exploratory Data Analysis
# Bar chart: how many trips were logged under each purpose.
purpose_counts = df['PURPOSE'].value_counts()
fig = px.bar(
    x=purpose_counts.index,
    y=purpose_counts.values,
    title='Trip Purposes',
    # No data frame is passed, so the axes are addressed by argument name.
    labels={'x': 'Purpose', 'y': 'Count'},
)
# Tilt the category labels so the longer purpose names stay readable.
fig.update_layout(xaxis={'tickangle': -50})
fig.show()

In [20]:
# Histogram of trip lengths in miles.
fig = px.histogram(df, x='MILES', title='Trip Distances', nbins=30)
# Replace the default axis titles with reader-friendly ones.
fig.update_layout(xaxis_title='Miles', yaxis_title='Count')
fig.show()

In [21]:
# Feature Engineering
from dateutil.parser import parse

# START_DATE arrives as text in more than one layout (the previews show both
# '01-01-2016 21:11' and '12/31/2016 13:24'), so each value is parsed on its own.
# parse() only accepts strings, so the conversion is skipped once the column is
# already datetime-typed; this keeps the cell safe to re-run.
if not pd.api.types.is_datetime64_any_dtype(df['START_DATE']):
    df['START_DATE'] = pd.to_datetime(df['START_DATE'].map(parse))

# Weekday name of each trip's start. day_name() returns English names by default,
# whereas strftime('%A') depends on the process locale.
df['start_day'] = df['START_DATE'].dt.day_name()

In [22]:
# Display the derived weekday-name column to check the result of the parsing step.
df['start_day']

0          Friday
1        Saturday
2        Saturday
3         Tuesday
4       Wednesday
          ...    
1150     Saturday
1151     Saturday
1152     Saturday
1153     Saturday
1154     Saturday
Name: start_day, Length: 1154, dtype: object

In [24]:
# Plot the number of trips for each day of the week.
day_counts = df['start_day'].value_counts().reset_index()
# Assign explicit column names instead of relying on the names value_counts() produces.
day_counts.columns = ['Day_of_Week', 'Count']
colors = px.colors.qualitative.Plotly[:10]
# When a data frame is supplied, `labels` is keyed by COLUMN NAME; the previous
# 'x'/'y' keys matched no column, so the intended axis titles were ignored.
fig = px.bar(day_counts, x='Day_of_Week', y='Count',
             color_discrete_sequence=colors,
             labels={'Day_of_Week': 'Day of the Week', 'Count': 'Number of Trips'},
             title='Distribution of Trips by Day of the Week')
fig.show()

In [25]:
# Mean trip length per purpose, as a two-column frame whose headers are
# already presentation-ready.
avg_distance_by_purpose = (
    df.groupby('PURPOSE')['MILES']
    .mean()
    .rename_axis('Purpose')
    .reset_index(name='Average Miles')
)

In [26]:
# Bar chart of the average trip distance for each purpose.
# No `labels` mapping is passed: with a data frame supplied, labels are keyed by
# column name, so the former 'x'/'y' keys had no effect. The column names
# ('Purpose', 'Average Miles') already serve as the axis titles.
fig = px.bar(avg_distance_by_purpose, x='Purpose', y='Average Miles',
             title='Average Distance Traveled by Purpose')

fig.show()