In [1]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import plotly.graph_objs as go

In [2]:
# Location of the pipeline-incidents CSV (a relative path, so it is resolved
# against the notebook's working directory). The file itself is read in the
# next cell; this cell only records where it lives.
file_path = "Resources/pipeline_incidents.csv"

In [3]:
# Parse the incidents CSV into a DataFrame, decoding it as ISO-8859-1 (Latin-1)
# instead of pandas' default UTF-8.
# NOTE(review): the previewed frame contains an "Unnamed: 0" column, which looks
# like an index that was written out when the CSV was saved -- confirm, and
# consider index_col=0 if nothing downstream relies on that column.
incident_df = pd.read_csv(
    file_path,
    encoding="ISO-8859-1",
)

# Preview the first five rows as a quick sanity check of the load.
incident_df.head()

Unnamed: 0,Incident Number,Incident_Type,Date_Reported,Cause_Category,Cause_Agent,Population_Density,Province,Latitude,Longitude,Substance,...,Licensed_Max_Pressure,Actual_Operating_Pressure,Year_of_Manufacture,Year_of_Installation,Service_Year,Weld type,Seam_Type,Seam_Joining_Method,Coating_Location,Coating_Type
0,INC2008-115,Fire,12/02/2008,Equipment Failure,Maintenance,Low,British Columbia,56.63822,-121.65102,Not Applicable,...,,,,,,,,,,
1,INC2008-119,Release of Substance,12/08/2008,Defect and Deterioration,Maintenance,Low,British Columbia,55.032,-123.03098,Natural Gas - Sweet,...,,,,,,,,,,
2,INC2008-120,Release of Substance,12/09/2008,Equipment Failure,Maintenance,Low,Manitoba,49.75544,-97.2305,Natural Gas - Sweet,...,0.0,,,,,,,,,
3,INC2008-121,Adverse Environmental Effects,12/10/2008,Equipment Failure,Maintenance,Low,Nova Scotia,45.18253,-61.65236,Not Applicable,...,,,,,,,,,,
4,INC2008-123,Fire,12/17/2008,External Interference,Maintenance,Very High,Alberta,53.5474,-113.35571,Not Applicable,...,0.0,,,,,,,,,


In [4]:
# Forecast the yearly number of pipeline incidents with an ARIMA model and plot
# the history together with the forecast.
#
# Inputs : incident_df (raw incidents, one row per incident, with a
#          'Date_Reported' column).
# Outputs: incident_df gains/refreshes 'Date_Reported' (datetime) and
#          'Year_Reported'; historical_df_filtered, forecast_df_filtered and
#          combined_df_filtered hold the yearly series; fig is the Plotly figure.

# Tunable settings for this cell.
FIRST_EXCLUDED_YEAR = 2024   # years >= this are left out of training (filter is Year < 2024)
FORECAST_HORIZON = 5         # number of future years to predict
# NOTE(review): fitting this order emitted statsmodels' "Non-invertible starting
# MA parameters found" warning; with only one observation per year a smaller
# order may be more appropriate -- confirm against model diagnostics.
ARIMA_ORDER = (2, 1, 2)

# Convert Date_Reported to datetime and extract the year. Values that cannot be
# parsed become NaT (errors='coerce'), which makes Year_Reported NaN for them.
incident_df['Date_Reported'] = pd.to_datetime(incident_df['Date_Reported'], errors='coerce')
incident_df['Year_Reported'] = incident_df['Date_Reported'].dt.year

# Surface how many rows are silently lost to coercion instead of hiding it.
undated_rows = int(incident_df['Date_Reported'].isna().sum())
if undated_rows:
    print(f"Note: {undated_rows} row(s) have a missing or unparseable Date_Reported "
          "and are excluded from the yearly counts.")

# Count incidents per year. Dropping NaN and casting to int keeps the year index
# integral even when some dates failed to parse (NaN would otherwise force a
# float index and break the range() call used for the future years below).
incidents_per_year = (
    incident_df['Year_Reported']
    .dropna()
    .astype(int)
    .value_counts()
    .sort_index()
)

# Keep only the training years.
training_counts = incidents_per_year[incidents_per_year.index < FIRST_EXCLUDED_YEAR]
if training_counts.empty:
    raise ValueError(f"No incidents dated before {FIRST_EXCLUDED_YEAR}; nothing to fit the model on.")

# value_counts() omits years with zero incidents; ARIMA assumes evenly spaced
# observations, so re-insert any missing year with a count of 0.
training_years = range(int(training_counts.index.min()), int(training_counts.index.max()) + 1)
training_counts = training_counts.reindex(training_years, fill_value=0)

historical_df_filtered = pd.DataFrame({
    'Year': list(training_years),
    'Incidents': training_counts.to_numpy()
})

# Define and fit the ARIMA model on the yearly counts.
model_filtered = ARIMA(historical_df_filtered['Incidents'], order=ARIMA_ORDER)
model_fit_filtered = model_filtered.fit()

# Forecast the next FORECAST_HORIZON years. The model is unbounded, but an
# incident count cannot be negative, so clip at zero.
forecast_filtered = model_fit_filtered.forecast(steps=FORECAST_HORIZON).clip(lower=0)

# Years being forecast: the ones immediately after the last training year.
last_training_year = int(historical_df_filtered['Year'].max())
future_years_filtered = list(range(last_training_year + 1, last_training_year + 1 + FORECAST_HORIZON))

# Forecast table with whole-number incident counts. to_numpy() pairs the values
# with the years by position rather than relying on the forecast's own index.
forecast_df_filtered = pd.DataFrame({
    'Year': future_years_filtered,
    'Incidents': forecast_filtered.round().astype(int).to_numpy()
})

# Historical and forecast rows in one frame with a clean 0..n-1 index.
combined_df_filtered = pd.concat([historical_df_filtered, forecast_df_filtered], ignore_index=True)

# Create traces for historical and forecasted data. The forecast trace plots the
# same rounded values that forecast_df_filtered reports, so chart and table agree.
trace1 = go.Scatter(x=historical_df_filtered['Year'], y=historical_df_filtered['Incidents'],
                    mode='lines+markers', name='Historical Data')

trace2 = go.Scatter(x=forecast_df_filtered['Year'], y=forecast_df_filtered['Incidents'],
                    mode='lines+markers', name='Forecasted Incidents', line=dict(dash='dash', color='orange'))

# Create layout for the plot (dtick=1 puts a tick on every year).
layout = go.Layout(
    title=f'Pipeline Incidents Forecast for the Next {FORECAST_HORIZON} Years',
    xaxis=dict(title='Year', dtick=1),
    yaxis=dict(title='Number of Incidents'),
    showlegend=True
)

# Combine traces and layout into a figure
fig = go.Figure(data=[trace1, trace2], layout=layout)

# Show the plot
fig.show()



  warn('Non-invertible starting MA parameters found.'
