# Import data and libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the Metro Interstate traffic-volume CSV into the working DataFrame.
# `caminho_arquivo` ("file path") is relative to the notebook's working directory.
caminho_arquivo = '../data/Metro_Interstate_Traffic_Volume.csv'
df = pd.read_csv(filepath_or_buffer=caminho_arquivo)

# Basic information about the dataset

## Head (First Five Rows)

In [2]:
# Preview the first five rows (n=5 is also the pandas default).
df.head(n=5)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


## Tail (Last Five Rows)

In [3]:
# Preview the last five rows (n=5 is also the pandas default).
df.tail(n=5)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
48199,,283.45,0.0,0.0,75,Clouds,broken clouds,2018-09-30 19:00:00,3543
48200,,282.76,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 20:00:00,2781
48201,,282.73,0.0,0.0,90,Thunderstorm,proximity thunderstorm,2018-09-30 21:00:00,2159
48202,,282.09,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 22:00:00,1450
48203,,282.12,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 23:00:00,954


Please note that observations are made on an hourly basis, capturing real-time details such as temperature, precipitation, and weather conditions, with the primary metric being traffic volume.

## Shape (number of rows and columns)

In [4]:
# Print a label first, then end the cell on `df.shape` so the notebook
# echoes the (rows, columns) tuple as the cell result.
print("\nDataset dimensions (rows, columns):")
df.shape


Dataset dimensions (rows, columns):


(48204, 9)

## Technical Information

In [5]:
# Summarise every column: non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   holiday              61 non-null     object 
 1   temp                 48204 non-null  float64
 2   rain_1h              48204 non-null  float64
 3   snow_1h              48204 non-null  float64
 4   clouds_all           48204 non-null  int64  
 5   weather_main         48204 non-null  object 
 6   weather_description  48204 non-null  object 
 7   date_time            48204 non-null  object 
 8   traffic_volume       48204 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 3.3+ MB


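Note that the 'holiday' column contains only 61 non-null values. Most days are not holidays, and the holiday rows displayed in the next section are all stamped 00:00:00, which suggests that the label is attached to a single hourly row per holiday rather than to every hour of the day.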

## Holiday Column Analysis

In [None]:
# Distinct labels found in the holiday column (NaN marks rows without a label).
print("Unique values in holiday column:")
print(df['holiday'].unique())

# Number of rows carrying each holiday label.
print("\nFrequency of each holiday:")
print(df['holiday'].value_counts())

# Print the labelled rows holiday by holiday, to understand why the frequency
# differs between holidays.
# One groupby pass replaces a full boolean-mask scan of the frame per holiday:
# sort=False yields the groups in order of first appearance, and groupby drops
# NaN keys by default, so the printed output is the same as filtering on each
# value of dropna().unique() in turn.
print("\nRows with holiday values:")
for holiday, holiday_rows in df.groupby('holiday', sort=False):
    print(f"\nHoliday: {holiday}")
    print(holiday_rows)

# TODO: two analyses were parked here as disabled code and are not run yet:
#   - percentage of null values in the 'holiday' column
#   - mean traffic_volume per holiday, sorted in descending order
# NOTE(review): the holiday rows shown by this cell are all stamped 00:00:00,
# so a per-holiday mean over these rows alone would presumably reflect only
# that hour -- confirm before comparing it with ordinary days.

# Rich display of every row that has a holiday label.
df[df['holiday'].notnull()]


Unique values in holiday column:
[nan 'Columbus Day' 'Veterans Day' 'Thanksgiving Day' 'Christmas Day'
 'New Years Day' 'Washingtons Birthday' 'Memorial Day' 'Independence Day'
 'State Fair' 'Labor Day' 'Martin Luther King Jr Day']

Frequency of each holiday:
holiday
Labor Day                    7
Christmas Day                6
Thanksgiving Day             6
Martin Luther King Jr Day    6
New Years Day                6
Veterans Day                 5
Columbus Day                 5
Memorial Day                 5
Washingtons Birthday         5
State Fair                   5
Independence Day             5
Name: count, dtype: int64

Rows with holiday values:

Holiday: Columbus Day
            holiday     temp  rain_1h  snow_1h  clouds_all weather_main  \
126    Columbus Day  273.080      0.0      0.0          20       Clouds   
9455   Columbus Day  277.720      0.0      0.0           0        Clear   
18946  Columbus Day  293.020      0.0      0.0           1        Clear   
27224  Columbus

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
126,Columbus Day,273.080,0.00,0.0,20,Clouds,few clouds,2012-10-08 00:00:00,455
1123,Veterans Day,288.120,0.00,0.0,87,Clear,sky is clear,2012-11-12 00:00:00,1000
1370,Thanksgiving Day,278.540,0.00,0.0,20,Mist,mist,2012-11-22 00:00:00,919
2360,Christmas Day,264.400,0.00,0.0,90,Clouds,overcast clouds,2012-12-25 00:00:00,803
2559,New Years Day,263.490,0.00,0.0,58,Clouds,broken clouds,2013-01-01 00:00:00,1439
...,...,...,...,...,...,...,...,...,...
44441,Memorial Day,299.487,0.00,0.0,24,Clouds,few clouds,2018-05-28 00:00:00,1088
45547,Independence Day,297.550,0.00,0.0,1,Mist,mist,2018-07-04 00:00:00,1021
46936,State Fair,289.020,0.00,0.0,1,Clear,sky is clear,2018-08-23 00:00:00,596
47330,Labor Day,292.430,0.25,0.0,1,Rain,light rain,2018-09-03 00:00:00,962


Let's analyze what this tells us:
1. The holiday column contains specific holiday names for days that are holidays
2. Non-holiday days are represented as null values
3. This makes sense because most days in a year are not holidays
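4. The displayed rows include the traffic volume recorded on each holiday, which is a starting point for examining how different holidays affect traffic volume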