# 1. Data Cleaning
### i.Checking for missing values
### ii.Checking for duplicates
### iii.Type conversion
### iv.Handling Outliers
### v.Consistency check

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Dataset_Uber Traffic.csv')

In [7]:
#Check for missing values
print(df.isnull().sum())

DateTime    0
Junction    0
Vehicles    0
ID          0
dtype: int64


In [9]:
#There are no missing values

In [10]:
#Checking for duplicate rows
duplicates = df.duplicated()
print(f'Number of duplicate rows: {duplicates.sum()}')

Number of duplicate rows: 0


In [11]:
#There are no duplicate rows

In [13]:
# Ensuring ift he columns have appropriate data types
print(df.dtypes)

DateTime    object
Junction     int64
Vehicles     int64
ID           int64
dtype: object


In [15]:
#Type conversion
# Converting 'DateTime' column to datetime type
df['DateTime'] = pd.to_datetime(df['DateTime'])

In [20]:
# Handling outliers using IQR method

# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = df['Vehicles'].quantile(0.25)
Q3 = df['Vehicles'].quantile(0.75)
IQR = Q3 - Q1

# Define outlier thresholds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers
outliers = df[(df['Vehicles'] < lower_bound) | (df['Vehicles'] > upper_bound)]
print(f'Number of outliers: {outliers.shape[0]}')



Number of outliers: 3617


In [24]:
#Removing Outliers
df_clean = df[(df['Vehicles'] >= lower_bound) & (df['Vehicles'] <= upper_bound)]

In [25]:
# Verify if outliers are removed
print(f'Original data size: {df.shape[0]}')
print(f'Cleaned data size: {df_clean.shape[0]}')

Original data size: 44503
Cleaned data size: 44503


In [26]:
#No outliers detected

In [27]:
# Checking the range of dates
print(f'Start Date: {df["DateTime"].min()}')
print(f'End Date: {df["DateTime"].max()}')

# Check for consistency in 'Junction' and 'ID' columns
print(df['Junction'].unique())
print(df['ID'].unique())


Start Date: 2015-01-11 00:00:00
End Date: 2017-12-06 23:00:00
[1 2 3 4]
[20151101001 20151101011 20151101021 ... 20170630214 20170630224
 20170630234]


In [28]:
#Data is consistent and covers a time span from January 11, 2015, to December 6, 2017

### Data cleaning is done.
### The dataset was already of good qualitity
### There was little to be done

# 2. Aggregate traffic data
### - Compile traffic data into hourly intervals for each junction.
### - Ensure data includes relevant details such as vehicle counts.

In [30]:
#Resampling the data into hourly intervals with vehicle count 

In [29]:
# Seting 'DateTime' as the index for resampling
df.set_index('DateTime', inplace=True)

# Resampling the data to hourly intervals and aggregate vehicle counts
df_hourly = df.groupby('Junction').resample('H').agg({'Vehicles': 'sum'}).reset_index()

# Check the first few rows of the aggregated data
print(df_hourly.head())

   Junction            DateTime  Vehicles
0         1 2015-01-11 00:00:00        15
1         1 2015-01-11 01:00:00        13
2         1 2015-01-11 02:00:00        10
3         1 2015-01-11 03:00:00         7
4         1 2015-01-11 04:00:00         9


In [31]:
#Let's verify if there are any missing hours and data for a specific junction

In [32]:
# Check for missing hours
missing_hours = df_hourly[df_hourly['Vehicles'].isna()]
print(f'Missing hours: {missing_hours.shape[0]}')

Missing hours: 0


In [34]:
# Checking the data for a specific junction
junction_id = 4  # Change this to the junction of interest
df_junction_hourly = df_hourly[df_hourly['Junction'] == junction_id]
print(df_junction_hourly.head())

       Junction            DateTime  Vehicles
76377         4 2017-01-01 00:00:00         3
76378         4 2017-01-01 01:00:00         1
76379         4 2017-01-01 02:00:00         4
76380         4 2017-01-01 03:00:00         4
76381         4 2017-01-01 04:00:00         2


# 3. Normalization

In [40]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

#Extracting HourOfDay
df_hourly['HourOfDay'] = df_hourly['DateTime'].dt.hour

# Define features
features = ['Vehicles', 'Junction', 'HourOfDay']

# Separate numeric and categorical features
numeric_features = ['Vehicles', 'HourOfDay']
categorical_features = ['Junction']

# Normalize numeric features
scaler = MinMaxScaler()
df_hourly[numeric_features] = scaler.fit_transform(df_hourly[numeric_features])

# Encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_categorical = encoder.fit_transform(df_hourly[categorical_features])
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_features))

# Concatenate normalized numeric and encoded categorical features
df_processed = pd.concat([df_hourly[numeric_features], encoded_categorical_df], axis=1)

# Check the processed data
print(df_processed.head())

   Vehicles  HourOfDay  Junction_1  Junction_2  Junction_3  Junction_4
0  0.254237   0.000000         1.0         0.0         0.0         0.0
1  0.220339   0.043478         1.0         0.0         0.0         0.0
2  0.169492   0.086957         1.0         0.0         0.0         0.0
3  0.118644   0.130435         1.0         0.0         0.0         0.0
4  0.152542   0.173913         1.0         0.0         0.0         0.0


### Data is processed for comprisons accross different timeperiods and junctions

# 4. Feature Engineering and Selection
### i. TIme based features: hour of the day, day of the week, month
### ii. lag features: including traffic data from previous hours or days to capture temporal dependencies.
### iii. binary indicators for weekends and special events to account for their impact on traffic.

In [41]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
# Extract time-based features
df_hourly['HourOfDay'] = df_hourly['DateTime'].dt.hour
df_hourly['DayOfWeek'] = df_hourly['DateTime'].dt.dayofweek
df_hourly['Month'] = df_hourly['DateTime'].dt.month

# Lag features
df_hourly['PrevHourVehicles'] = df_hourly['Vehicles'].shift(1)
df_hourly['PrevDayVehicles'] = df_hourly['Vehicles'].shift(24)  # assuming hourly data

# Binary indicators
df_hourly['IsWeekend'] = df_hourly['DateTime'].dt.dayofweek >= 5

# Drop rows with NaN values generated by lag features
df_hourly.dropna(inplace=True)

# Feature and target columns
features = ['HourOfDay', 'DayOfWeek', 'Month', 'PrevHourVehicles', 'PrevDayVehicles', 'IsWeekend']
target = 'Vehicles'

X = df_hourly[features]
y = df_hourly[target]


# 5. Evaluation

In [42]:
# Evaluate feature importance using a RandomForest model
model = RandomForestRegressor()
model.fit(X, y)

# Feature importance
feature_importances = model.feature_importances_
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Display feature importance
print(importance_df)

            Feature  Importance
3  PrevHourVehicles    0.828275
0         HourOfDay    0.057449
4   PrevDayVehicles    0.051924
2             Month    0.034575
1         DayOfWeek    0.024438
5         IsWeekend    0.003340


### It's done. Exploratory Data Analysis is done