In [1]:
import io
import pickle as pkl

import pandas as pd
# Load the pickled NYC 311 calls object from the working directory.
# NOTE(review): pickle.load can execute arbitrary code -- only open files from a trusted source.
# NOTE(review): the name `object` shadows the Python builtin; it is kept because the
# next cell reads it.
with open('Project-3_NYC_311_Calls.pkl', mode="rb") as pickle_file:
    object = pkl.load(pickle_file)

In [2]:
# Wrap the unpickled object in a DataFrame; the later cells in this notebook read from `data`.
data = pd.DataFrame(object)

In [28]:
# Parse the creation timestamps; values that cannot be parsed become NaT
data['Created Date'] = pd.to_datetime(data['Created Date'], errors='coerce')

# Keep only the rows created in 2022
created_in_2022 = data['Created Date'].dt.year == 2022
data_2022 = data.loc[created_in_2022]

# Count complaints per calendar day
daily_complaints_2022 = (
    data_2022
    .resample('D', on='Created Date')['Unique Key']
    .count()
)

# Mean number of complaints per day in 2022
average_daily_complaints_2022 = daily_complaints_2022.mean()
average_daily_complaints_2022


8684.320547945206

In [29]:
# Daily complaint counts over the whole dataset. Built here so the cell does not
# depend on a `daily_complaints` variable that no visible cell defines (it would
# raise NameError after Restart & Run All).
# Assumes 'Created Date' has already been parsed to datetime by the cell above.
daily_complaints = data.resample('D', on='Created Date')['Unique Key'].count()

# Find the date with the maximum number of complaints, and that day's count
max_complaints_date = daily_complaints.idxmax()
max_complaints_count = daily_complaints.max()

max_complaints_date, max_complaints_count


(Timestamp('2020-08-04 00:00:00'), 24415)

In [30]:
# Select the rows created on the busiest day using a half-open
# [midnight, next midnight) window. This stays vectorized; comparing `.dt.date`
# would build a Python date object for every row of the dataset.
busiest_day_start = max_complaints_date.normalize()
busiest_day_end = busiest_day_start + pd.Timedelta(days=1)
on_busiest_day = (
    (data['Created Date'] >= busiest_day_start)
    & (data['Created Date'] < busiest_day_end)
)
data_max_complaints = data[on_busiest_day]

# Tally complaint types once and reuse the tally for both the label and its count
complaint_type_counts = data_max_complaints['Complaint Type'].value_counts()
most_common_complaint = complaint_type_counts.idxmax()
most_common_complaint_count = complaint_type_counts.max()

most_common_complaint, most_common_complaint_count


('Damaged Tree', 14863)

In [32]:
# Total complaints per calendar month (1-12), pooled across every year in the data.
# NOTE(review): the date range reported in this notebook is 2010-01-01 to 2023-08-04,
# so the later months are covered by one fewer year than the earlier ones. Pooled
# totals are therefore not directly comparable between months -- confirm whether a
# per-year average (or complete years only) is what is intended.
created_month = data['Created Date'].dt.month
monthly_complaints_by_month = data['Unique Key'].groupby(created_month).count()

# Month number with the smallest pooled total, and that total
quietest_month_by_month, fewest_complaints_by_month = (
    monthly_complaints_by_month.idxmin(),
    monthly_complaints_by_month.min(),
)

quietest_month_by_month, fewest_complaints_by_month


(12, 2596986)

In [3]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Daily complaint counts across the entire dataset.
# count() already reports 0 for days with no rows; fillna(0) is kept as a safeguard.
daily_complaints_full = (
    data
    .resample('D', on='Created Date')['Unique Key']
    .count()
    .fillna(0)
)

# Additive seasonal decomposition with a 7-day (weekly) period
ets_decomposition = seasonal_decompose(
    daily_complaints_full,
    model='additive',
    period=7,
)

# Seasonal component on 2020-12-25, shown raw and rounded to the nearest integer
seasonal_component = ets_decomposition.seasonal
seasonal_value_2020_12_25 = seasonal_component.loc['2020-12-25']
seasonal_value_2020_12_25, round(seasonal_value_2020_12_25)


(182.69763790386224, 183)

In [39]:
# Correlation between each day's complaint count and the previous day's count
lag_days = 1
autocorrelation_lag_1 = daily_complaints_full.autocorr(lag=lag_days)
autocorrelation_lag_1

0.7517059728398578

In [8]:
from prophet import Prophet
from sklearn.metrics import mean_squared_error
import numpy as np

# Number of trailing days held out for evaluation; also used as the forecast horizon.
HOLDOUT_DAYS = 90

# Prophet expects a frame with a `ds` (timestamp) column and a `y` (value) column.
prophet_df = daily_complaints_full.reset_index().rename(columns={'Created Date': 'ds', 'Unique Key': 'y'})

# Chronological split: everything except the last HOLDOUT_DAYS rows trains the model.
# NOTE(review): the summary cell in this notebook reports the latest timestamp as
# 2023-08-04 12:00, so the final held-out day looks like a partial day -- confirm
# whether it should be dropped before scoring.
train_df = prophet_df.iloc[:-HOLDOUT_DAYS]
test_df = prophet_df.iloc[-HOLDOUT_DAYS:]

# Fit Prophet model on the training window only
model = Prophet()
model.fit(train_df)

# Forecast over the training history plus the next HOLDOUT_DAYS days
future = model.make_future_dataframe(periods=HOLDOUT_DAYS)
forecast = model.predict(future)

# The last HOLDOUT_DAYS forecast rows are paired with the held-out rows by position,
# so verify that the dates really line up before scoring.
holdout_forecast = forecast.iloc[-HOLDOUT_DAYS:]
assert (holdout_forecast['ds'].to_numpy() == test_df['ds'].to_numpy()).all(), \
    "forecast dates do not line up with the held-out test dates"

# RMSE between the held-out actuals and the matching forecasts
rmse = np.sqrt(mean_squared_error(test_df['y'].to_numpy(), holdout_forecast['yhat'].to_numpy()))

rmse

00:17:09 - cmdstanpy - INFO - Chain [1] start processing
00:17:09 - cmdstanpy - INFO - Chain [1] done processing


1231.3215094072907

In [9]:
# Summary of the dataset: earliest and latest creation timestamps
date_range = (data['Created Date'].min(), data['Created Date'].max())

# Complaint types ordered from most to least frequent
complaint_types = data['Complaint Type'].value_counts()

# A reproducible sample of resolution descriptions
sample_resolutions = data['Resolution Description'].sample(5, random_state=1)

# General information including dtypes and memory usage.
# DataFrame.info() writes its report to a stream and returns None, so capture the
# text through `buf` to make `data_info` hold the report instead of None.
info_buffer = io.StringIO()
data.info(buf=info_buffer)
data_info = info_buffer.getvalue()
print(data_info)

data_description = data.describe(include='all')

# `data_info` is printed above, so it is left out of the displayed tuple
date_range, complaint_types.head(), sample_resolutions, data_description


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33780977 entries, 0 to 33780976
Data columns (total 12 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   Unique Key              int64         
 1   Created Date            datetime64[ns]
 2   Agency                  object        
 3   Agency Name             object        
 4   Complaint Type          object        
 5   Descriptor              object        
 6   Location Type           object        
 7   Incident Zip            object        
 8   City                    object        
 9   Resolution Description  object        
 10  Borough                 object        
 11  Open Data Channel Type  object        
dtypes: datetime64[ns](1), int64(1), object(10)
memory usage: 3.0+ GB


((Timestamp('2010-01-01 00:00:00'), Timestamp('2023-08-04 12:00:00')),
 Complaint Type
 Noise - Residential    3131834
 Illegal Parking        2110646
 HEAT/HOT WATER         1983520
 Blocked Driveway       1439795
 Street Condition       1212154
 Name: count, dtype: int64,
 14137788      The Department of Sanitation removed the items.
 21252101    The Department of Housing Preservation and Dev...
 12431738    The Department of Sanitation collected the E-w...
 16032665    The Police Department responded to the complai...
 15422914    The Police Department responded to the complai...
 Name: Resolution Description, dtype: object,
 None,
           Unique Key                   Created Date    Agency  \
 count   3.378098e+07                       33780977  33780977   
 unique           NaN                            NaN        36   
 top              NaN                            NaN      NYPD   
 freq             NaN                            NaN  10038478   
 mean    3.802665e+07  2017