In [4]:
!pip install astral

Collecting astral
  Downloading astral-3.2-py3-none-any.whl.metadata (1.7 kB)
Downloading astral-3.2-py3-none-any.whl (38 kB)
Installing collected packages: astral
Successfully installed astral-3.2


In [7]:
# 📦 Required libraries
import pandas as pd
import numpy as np
import datetime as dt
from astral import LocationInfo
from astral.sun import daylight
from pandas.tseries.holiday import USFederalHolidayCalendar
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score

# 📁 Unzip data
!unzip -q "MelihDurhasanDSA210SprintermProject-main (1).zip"
!unzip -q "/content/project/MelihDurhasanDSA210SprintermProject-main/DSA210_datas.zip"

# 🔹 Crime data
crime = pd.read_csv("crime-data.csv", parse_dates=["Date"], low_memory=False)
crime = crime[(crime["Date"].dt.year >= 2020) & (crime["Date"].dt.year <= 2024)]
crime = crime.dropna(subset=["Community Area"])
crime["date_only"] = crime["Date"].dt.date

# 🔹 Daily crime counts per area
daily = crime.groupby(["date_only", "Community Area"]).size().reset_index(name="crime_cnt")

# 🔹 Weather data
weather = pd.read_csv("weather-data.csv", parse_dates=["datetime"])
weather["date_only"] = weather["datetime"].dt.date
wx = (weather.groupby("date_only")
      .agg(temp=("temp", "mean"),
           humidity=("humidity", "mean"),
           precip=("precip", "sum"),
           windspeed=("windspeed", "mean"),
           visibility=("visibility", "mean"),
           uvindex=("uvindex", "mean"))
      .reset_index())

# 🔹 Population data
pop = pd.read_csv("chicago_population_named.csv")

# 🔹 Merge datasets
df = (daily.merge(wx, on="date_only", how="left")
          .merge(pop[["Community Area", "population"]], on="Community Area", how="left"))

# 🔹 Feature engineering
chi = LocationInfo(41.88, -87.63)
df["crime_rate"] = df["crime_cnt"] / (df["population"] / 100_000)
df["day_name"] = pd.to_datetime(df["date_only"]).dt.day_name()
df["month"] = pd.to_datetime(df["date_only"]).dt.month
holidays = USFederalHolidayCalendar().holidays(start=df["date_only"].min(), end=df["date_only"].max())
df["is_holiday"] = df["date_only"].isin(holidays)
df["daylight_hrs"] = df["date_only"].apply(
    lambda d: (daylight(chi.observer, date=d)[1] - daylight(chi.observer, date=d)[0]).seconds / 3600
)

# 🔹 Lag features
df = df.sort_values(["Community Area", "date_only"])
df["lag_1"] = df.groupby("Community Area")["crime_rate"].shift(1).fillna(0)
df["lag_7"] = df.groupby("Community Area")["crime_rate"].shift(7).fillna(0)

# 🔹 Encoding & model inputs
le_day = LabelEncoder()
df["day_enc"] = le_day.fit_transform(df["day_name"])
feat_cols = [
    "temp", "humidity", "precip", "windspeed", "visibility", "uvindex",
    "daylight_hrs", "is_holiday", "month", "day_enc", "lag_1", "lag_7", "Community Area"
]
X = df[feat_cols]
y = df["crime_rate"]
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# 🔹 Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# ▶️ Gradient Boosting (baseline)
gb = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=4, random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
print(f"GB R²: {r2_score(y_test, y_pred_gb):.3f}, MAE: {mean_absolute_error(y_test, y_pred_gb):.2f}")

# ▶️ Regression modelleri karşılaştırma
reg_models = {
    "Linear Regression": LinearRegression(),
    "KNN Regressor": KNeighborsRegressor(n_neighbors=5),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
}

print("\n=== Regression Model Performance ===")
for name, model in reg_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name:16s} → R²: {r2_score(y_test, y_pred):.3f}, MAE: {mean_absolute_error(y_test, y_pred):.2f}")

# ▶️ Logistic Regression sınıflandırma (crime_rate medyan üstü/altı)
y_binary = (y > y.median()).astype(int)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_scaled, y_binary, test_size=0.2, random_state=42
)
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_train_cls, y_train_cls)
acc = accuracy_score(y_test_cls, log_model.predict(X_test_cls))
print(f"\nLogistic Regression → Accuracy: {acc:.3f}")

# ▶️ KMeans kümeleme
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(X_scaled)
print("\nKMeans → Cluster dağılımı:")
print(df["cluster"].value_counts())


unzip:  cannot find or open MelihDurhasanDSA210SprintermProject-main (1).zip, MelihDurhasanDSA210SprintermProject-main (1).zip.zip or MelihDurhasanDSA210SprintermProject-main (1).zip.ZIP.
unzip:  cannot find or open /content/project/MelihDurhasanDSA210SprintermProject-main/DSA210_datas.zip, /content/project/MelihDurhasanDSA210SprintermProject-main/DSA210_datas.zip.zip or /content/project/MelihDurhasanDSA210SprintermProject-main/DSA210_datas.zip.ZIP.
GB R²: 0.574, MAE: 2.76

=== Regression Model Performance ===
Linear Regression → R²: 0.495, MAE: 3.01
KNN Regressor    → R²: 0.416, MAE: 3.27
Random Forest    → R²: 0.547, MAE: 2.88
Decision Tree    → R²: 0.110, MAE: 3.59

Logistic Regression → Accuracy: 0.723

KMeans → Cluster dağılımı:
cluster
3    29220
1    16146
0    15852
4     6635
2     3246
Name: count, dtype: int64
