In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

# Load the dataset. The file has no header row, so pandas labels the columns
# with their integer positions (0, 1, 2, ...).
df = pd.read_csv("nyc_311.csv", header=None)

# Keep only the two columns the analysis uses: 3 (complaint type) and
# 1 (timestamp). The explicit .copy() makes the result an independent frame,
# so the column assignment below is unambiguous and does not raise
# SettingWithCopyWarning.
df = df[[3, 1]].copy()

# Convert the timestamp to a datetime object (raises on unparseable values,
# which is preferable to silently mis-bucketing rows later).
df[1] = pd.to_datetime(df[1])

# Define a function to categorize seasons
def get_season(month):
    """Map a month number to a meteorological season name.

    Months 3-5 are "Spring", 6-8 "Summer", 9-11 "Fall". Every other value
    (including 12, 1, 2 and anything that matches none of the ranges)
    is reported as "Winter".
    """
    season_ranges = (
        (3, 5, "Spring"),
        (6, 8, "Summer"),
        (9, 11, "Fall"),
    )
    for first_month, last_month, season_name in season_ranges:
        if first_month <= month <= last_month:
            return season_name
    # Anything not matched above falls through to Winter.
    return "Winter"

# Create a new column holding the season derived from each timestamp's month
df['Season'] = df[1].dt.month.apply(get_season)

# Build the complaint-type x season contingency table.
# The frame was reduced to columns 3 (complaint type) and 1 (timestamp) above,
# so the grouping key must be 3 -- column 0 no longer exists and grouping on
# it raises KeyError.
complaints_by_season = df.groupby([3, 'Season']).size().unstack(fill_value=0)

# Perform a chi-squared test of independence between complaint type and season
chi2, p, _, _ = chi2_contingency(complaints_by_season)

# Print the p-value
print(f"P-value from chi-squared test: {p}")

# Create a heatmap to visualize the differences
# NOTE(review): the title says "Noise Complaints" but no noise-only filter is
# applied in this cell -- confirm whether the input file is pre-filtered.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(complaints_by_season, annot=True, fmt='d', cmap='YlGnBu', ax=ax)
ax.set_title("Noise Complaints by Type and Season")
ax.set_xlabel("Season")
ax.set_ylabel("Complaint Type")
plt.show()


ModuleNotFoundError: No module named 'scipy'