# Smart City Traffic Patterns Recognition Using Machine Learning

In [1]:
#library
import pandas as pd

## Data Understanding

In [2]:
# Reading the dataset
df = pd.read_csv("train.csv")


In [3]:
# Dataset preview
print(df.head())


              DateTime  Junction  Vehicles           ID
0  2015-11-01 00:00:00         1        15  20151101001
1  2015-11-01 01:00:00         1        13  20151101011
2  2015-11-01 02:00:00         1        10  20151101021
3  2015-11-01 03:00:00         1         7  20151101031
4  2015-11-01 04:00:00         1         9  20151101041


In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   DateTime  48120 non-null  object
 1   Junction  48120 non-null  int64 
 2   Vehicles  48120 non-null  int64 
 3   ID        48120 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB
None


## Data Preprocessing

### DateTime feature data type transformation

Changing the data type of the DateTime feature in the dataset from object to datetime will result in easier preprocessing of the data.

In [5]:
df["DateTime"]= pd.to_datetime(df["DateTime"])

### Feature Engineering

In [6]:
# Creating new features based on the DateTime feature
df["Year"] = df["DateTime"].dt.year
df["Month"] = df["DateTime"].dt.month
df['Day'] = df['DateTime'].dt.day
df['DayOfWeek'] = df['DateTime'].dt.strftime('%A')
df['Hour'] = df['DateTime'].dt.hour

# Number of vehicles during previous hour/ previous week

df['PrevHourVehicles'] = df['Vehicles'].shift(1)
df['PrevDayVehicles'] = df['Vehicles'].shift(24)
df['PrevWeekVehicles'] = df['Vehicles'].shift(24 * 7)

In [7]:
df

Unnamed: 0,DateTime,Junction,Vehicles,ID,Year,Month,Day,DayOfWeek,Hour,PrevHourVehicles,PrevDayVehicles,PrevWeekVehicles
0,2015-11-01 00:00:00,1,15,20151101001,2015,11,1,Sunday,0,,,
1,2015-11-01 01:00:00,1,13,20151101011,2015,11,1,Sunday,1,15.0,,
2,2015-11-01 02:00:00,1,10,20151101021,2015,11,1,Sunday,2,13.0,,
3,2015-11-01 03:00:00,1,7,20151101031,2015,11,1,Sunday,3,10.0,,
4,2015-11-01 04:00:00,1,9,20151101041,2015,11,1,Sunday,4,7.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
48115,2017-06-30 19:00:00,4,11,20170630194,2017,6,30,Friday,19,17.0,9.0,17.0
48116,2017-06-30 20:00:00,4,30,20170630204,2017,6,30,Friday,20,11.0,13.0,17.0
48117,2017-06-30 21:00:00,4,16,20170630214,2017,6,30,Friday,21,30.0,12.0,10.0
48118,2017-06-30 22:00:00,4,22,20170630224,2017,6,30,Friday,22,16.0,15.0,10.0


### Missing Values

In [8]:
missing_values = df.isna().sum()
missing_values

DateTime              0
Junction              0
Vehicles              0
ID                    0
Year                  0
Month                 0
Day                   0
DayOfWeek             0
Hour                  0
PrevHourVehicles      1
PrevDayVehicles      24
PrevWeekVehicles    168
dtype: int64

## Exploratory Data Analysis

In [9]:
df.describe()

Unnamed: 0,DateTime,Junction,Vehicles,ID,Year,Month,Day,Hour,PrevHourVehicles,PrevDayVehicles,PrevWeekVehicles
count,48120,48120.0,48120.0,48120.0,48120.0,48120.0,48120.0,48120.0,48119.0,48096.0,47952.0
mean,2016-09-19 06:03:56.109725696,2.180549,22.791334,20163300000.0,2016.269825,5.884289,15.700748,11.5,22.791558,22.797052,22.838297
min,2015-11-01 00:00:00,1.0,1.0,20151100000.0,2015.0,1.0,1.0,0.0,1.0,1.0,1.0
25%,2016-04-16 01:45:00,1.0,9.0,20160420000.0,2016.0,3.0,8.0,5.75,9.0,9.0,9.0
50%,2016-09-30 03:30:00,2.0,15.0,20160930000.0,2016.0,5.0,16.0,11.5,15.0,15.0,15.0
75%,2017-02-25 16:00:00,3.0,29.0,20170230000.0,2017.0,9.0,23.0,17.25,29.0,29.0,30.0
max,2017-06-30 23:00:00,4.0,180.0,20170630000.0,2017.0,12.0,31.0,23.0,180.0,180.0,180.0
std,,0.966955,20.750063,5944854.0,0.616093,3.569872,8.784073,6.922258,20.75022,20.753205,20.768866
