In [47]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.impute import SimpleImputer
import joblib

In [48]:

# LOAD DATA
file_path = "/Users/clairelee/DS-4002-Project-2/DATA/Final/cleaned_parking_tickets.csv"
df = pd.read_csv(file_path)

print("Initial shape:", df.shape)

# Split IssuedDate column 
df[['Date', 'Time']] = df['IssuedDate'].str.split(',', expand=True)

# Clean whitespace
df['Date'] = df['Date'].str.strip()
df['Time'] = df['Time'].str.strip()

# Parse datetime and extract components
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.dropna(subset=['Date'])  # remove rows with invalid dates

df['Hour'] = pd.to_datetime(df['Time'], format='%I:%M %p', errors='coerce').dt.hour
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# Drop invalid times
df = df.dropna(subset=['Hour'])


# Create binary target variable: 1 = ticket issued, 0 = void # FIX THIS!!!!!!!!!!!
df['TicketIssued'] = np.where(df['ViolationDescription'].str.lower().str.contains('void'), 0, 1)

# Drop unused columns
df = df.drop(columns=['IssuedDate', 'ViolationDescription', 'Date', 'Time'])

# Verify result
print("\nAfter cleaning and feature extraction:")
print(df.head())

# Time Based Splits
def get_time_splits(df):
    splits = {}
    splits['all_time'] = df[df['Year'] <= 2023]
    splits['recent_time'] = df[df['Year'] == 2023]
    for end_year in range(2023, 1999, -5):
        start_year = end_year - 4 # 5 years
        label = f"{start_year}-{end_year}"
        splits[label] = df[(df['Year'] >= start_year) & (df['Year'] <= end_year)]
    return splits

splits = get_time_splits(df)

# Use 2024 as test data if available, otherwise 20% random split
test_df = df[df['Year'] == 2024]
if test_df.empty:
    print("\n⚠️ No 2024 data found — using 20% of the dataset as test data.")
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
else:
    train_df = df[df['Year'] <= 2023]

X_test = test_df.drop(columns=['TicketIssued'])
y_test = test_df['TicketIssued']

# Model
categorical_features = ['StreetName', 'DayOfWeek', 'TimePeriod', 'Quarter']
numerical_features = ['Hour', 'Month', 'Day']

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numerical_features)
])

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])


# Train

results = []

for label, data in splits.items():
    if data.shape[0] < 200:
        continue
    X_train = data.drop(columns=['TicketIssued'])
    y_train = data['TicketIssued']

    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)

    results.append({
        'TimeFrame': label,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec
    })

results_df = pd.DataFrame(results).sort_values(by='Accuracy', ascending=False)

print("\n--- Model Performance by Time Frame ---")
print(results_df)

# EXAMPLE????
sample_input = pd.DataFrame({
    'StreetName': ['14TH ST NW'],
    'DayOfWeek': ['Thursday'],
    'Quarter': [1],
    'TimePeriod': ['Morning'],
    'Hour': [9],
    'Month': [1],
    'Day': [27],
    'Year': [2022]  # not used directly but for structure
})

ticket_pred = model_pipeline.predict(sample_input)
prob = model_pipeline.predict_proba(sample_input)[0][1]

print("\n--- Example Prediction ---")
print(f"Prediction: {'Ticket likely' if ticket_pred[0]==1 else 'No ticket likely'} (Probability: {prob:.2f})")



Initial shape: (160454, 7)

After cleaning and feature extraction:
   Year  DayOfWeek  Quarter          StreetName TimePeriod  Hour  Month  Day  \
0  2015     Friday        4          W WATER ST    Morning     9     10   30   
1  2022   Thursday        1          14TH ST NW    Morning     9      1   27   
2  2022  Wednesday        3           5TH ST SW  Afternoon    12      7   27   
3  2022   Thursday        4  JEFFERSON PARK AVE    Morning    11     10   13   
4  2021    Tuesday        2            1ST ST S    Morning     8      6   29   

   TicketIssued  
0             0  
1             1  
2             0  
3             1  
4             0  

--- Model Performance by Time Frame ---
     TimeFrame  Accuracy  Precision  Recall
0     all_time  0.941725   0.941725     1.0
1  recent_time  0.941725   0.941725     1.0
2    2019-2023  0.941725   0.941725     1.0
3    2014-2018  0.941725   0.941725     1.0
4    2009-2013  0.941725   0.941725     1.0

--- Example Prediction ---
Prediction: