## Data Exploration 

In [1]:
%%capture
#Run the data_preparation_and_preprocessing notebook inside this kernel; %%capture keeps its output from being displayed here.
#NOTE(review): the cells below use traffic_data, line_plot, pd, px, go and make_subplots without
#defining them in this notebook -- presumably they all come from this %run; confirm.
%run data_preparation_and_preprocessing.ipynb

In [2]:
#Plot time series again
#NOTE(review): line_plot is not defined in this notebook -- presumably it is provided by the
#data_preparation_and_preprocessing notebook run in the first cell; confirm.
line_plot()

In [3]:
#Distribution of the Vehicles variable for a single junction
def histogram(junction, row, col):
    """Add a histogram of one junction's vehicle counts to the shared ``fig``.

    Reads the module-level ``traffic_data`` frame and draws on the module-level
    ``fig``, so ``fig`` must already exist when this is called.

    Args:
        junction: Value matched against ``traffic_data["Junction"]``.
        row: Subplot row that receives the trace.
        col: Subplot column that receives the trace.
    """
    #Keep only the rows recorded at the requested junction
    at_junction = traffic_data["Junction"] == junction
    vehicle_counts = traffic_data.loc[at_junction, "Vehicles"]

    trace = go.Histogram(x=vehicle_counts, name=f"Junction {junction}")
    fig.add_trace(trace, row=row, col=col)

    #Label the axes of this subplot only
    fig.update_xaxes(title_text="Vehicles", row=row, col=col)
    fig.update_yaxes(title_text="Count", row=row, col=col)

In [4]:
#Lay out a 2x2 grid of subplots and draw one vehicle histogram per junction
fig = make_subplots(rows=2, cols=2)
#Each tuple is (junction, row, col): junctions 1 and 2 on the top row, 3 and 4 on the bottom row
for junction_id, grid_row, grid_col in [(1, 1, 1), (2, 1, 2), (3, 2, 1), (4, 2, 2)]:
    histogram(junction_id, grid_row, grid_col)
fig.update_layout(title="Distribution of Vehicles by Junctions")
fig.show()

In [5]:
#Total vehicles passing all junctions, broken down by day of the week
#Derive the weekday name of every timestamp
traffic_data["DayOfWeek"] = traffic_data["DateTime"].dt.day_name()

#Add up the vehicle counts recorded on each weekday
grouped_df = traffic_data.groupby("DayOfWeek", as_index=False)["Vehicles"].sum()

#Draw the bar chart; category_orders lists the weekdays starting with Monday
fig = px.bar(
    grouped_df,
    x="DayOfWeek",
    y="Vehicles",
    title="Number of vehicles per each day of the week",
    category_orders={
        "DayOfWeek": ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    },
    width=900,
    height=450,
)
fig.show()

In [6]:
#Function for data by day for each junction
def bar_day(junction, row, col):
    """Add a bar trace of total vehicles per weekday for one junction to ``fig``.

    Reads the module-level ``traffic_data`` frame and draws on the module-level
    ``fig``, so ``fig`` must already exist when this is called.

    Side effect: ``traffic_data["DayOfWeek"]`` is (re)written as an ordered
    categorical running Monday to Sunday.

    Args:
        junction: Value matched against ``traffic_data["Junction"]``.
        row: Subplot row that receives the trace.
        col: Subplot column that receives the trace.
    """
    #Store the weekday as an ordered categorical so the groups come out Monday first
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    traffic_data["DayOfWeek"] = pd.Categorical(traffic_data["DateTime"].dt.day_name(),
                                               categories=weekdays, ordered=True)

    #Restrict to the requested junction before grouping
    junction_rows = traffic_data[traffic_data["Junction"] == junction]

    #Sum only the Vehicles column: summing every column fails on DateTime in recent pandas.
    #observed=False keeps all seven weekdays, with a total of 0 for days without data.
    grouped_df = (junction_rows.groupby("DayOfWeek", observed=False)["Vehicles"]
                  .sum().reset_index())

    #Plot an interactive bar chart
    fig.add_trace(go.Bar(x=grouped_df["DayOfWeek"], y=grouped_df["Vehicles"], name=f"Junction {junction}"),
                  row=row, col=col)
    fig.update_xaxes(title_text="Day of the week", row=row, col=col)
    fig.update_yaxes(title_text="Total", row=row, col=col)

In [7]:
#Create a figure with 4 subplots, one per junction
fig = make_subplots(rows=2, cols=2)
#Each tuple is (junction, row, col): junctions 1 and 2 on the top row, 3 and 4 on the bottom row
for junction_id, grid_row, grid_col in [(1, 1, 1), (2, 1, 2), (3, 2, 1), (4, 2, 2)]:
    bar_day(junction_id, grid_row, grid_col)
#Name the weekday breakdown in the title so this figure is not confused with the hourly one
fig.update_layout(title="Total vehicles by day of the week for each junction")
fig.show()

In [8]:
#Total vehicles passing all junctions per hour
#Extract the hour of the day from each timestamp
traffic_data["HourOfDay"] = traffic_data["DateTime"].dt.hour

#Group the data by hour of the day and sum the vehicle counts in each hour
grouped_df = traffic_data.groupby("HourOfDay")["Vehicles"].sum().reset_index()

#Create the bar chart
fig = px.bar(grouped_df, x="HourOfDay", y="Vehicles",
             title="Number of vehicles per each hour of the day",
             width=900, height=450)

#Show a tick for every hour, anchored at hour 0
fig.update_layout(xaxis=dict(tickmode="linear", tick0=0, dtick=1))
fig.show()

In [9]:
#Function for data by hour of the day for each junction
def bar_hour(junction, row, col):
    """Add a bar trace of total vehicles per hour of the day for one junction to ``fig``.

    Reads the module-level ``traffic_data`` frame and draws on the module-level
    ``fig``, so ``fig`` must already exist when this is called.

    Side effect: ``traffic_data["HourOfDay"]`` is (re)written from the
    ``DateTime`` column.

    Args:
        junction: Value matched against ``traffic_data["Junction"]``.
        row: Subplot row that receives the trace.
        col: Subplot column that receives the trace.
    """
    #Extract the hour of the day
    traffic_data["HourOfDay"] = traffic_data["DateTime"].dt.hour

    #Restrict to the requested junction before grouping
    junction_rows = traffic_data[traffic_data["Junction"] == junction]

    #Sum only the Vehicles column: summing every column fails on DateTime in recent pandas
    grouped_df = junction_rows.groupby("HourOfDay")["Vehicles"].sum().reset_index()

    #Plot an interactive bar chart
    fig.add_trace(go.Bar(x=grouped_df["HourOfDay"], y=grouped_df["Vehicles"], name=f"Junction {junction}"),
                  row=row, col=col)
    fig.update_xaxes(title_text="Hour of the day", row=row, col=col)
    fig.update_yaxes(title_text="Total", row=row, col=col)

In [10]:
#Create a figure with 4 subplots, one per junction
fig = make_subplots(rows=2, cols=2)
#Each tuple is (junction, row, col): junctions 1 and 2 on the top row, 3 and 4 on the bottom row
for junction_id, grid_row, grid_col in [(1, 1, 1), (2, 1, 2), (3, 2, 1), (4, 2, 2)]:
    bar_hour(junction_id, grid_row, grid_col)
#Name the hourly breakdown in the title so this figure is not confused with the weekday one
fig.update_layout(title="Total vehicles by hour of the day for each junction")
fig.show()

In [11]:
#Add up the vehicle counts recorded at each junction
grouped_df = traffic_data.groupby("Junction", as_index=False)["Vehicles"].sum()

#Draw the bar chart with one bar per junction
fig = px.bar(
    grouped_df,
    x="Junction",
    y="Vehicles",
    title="Total number of vehicles by junction",
    width=900,
    height=450,
)
#Show ticks only at the four junction numbers
fig.update_xaxes(tickvals=[1, 2, 3, 4])
fig.show()