In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

In [None]:
# Load the cleaned ticket data; the path is relative to the notebook's working directory.
all_data = pd.read_csv("cleaned_parking_tickets.csv")

# NOTE(review): the original cell ended in an unfinished `train_df = ` (a syntax
# error) and never defined `test_df`, although the next cell uses both frames.
# A seeded 80/20 random split is used here as a placeholder -- TODO confirm the
# intended split (for example a split by Year) and replace this if needed.
# `.copy()` gives each split its own data so later column assignments do not
# trigger SettingWithCopyWarning.
train_df = all_data.sample(frac=0.8, random_state=42).copy()
# Everything not sampled into the training set becomes the test set
# (read_csv's default index is unique, so dropping by label is exact).
test_df = all_data.drop(train_df.index).copy()



In [None]:

# -------------------------------
# 2. Feature extraction
# -------------------------------

def _add_issue_time_features(tickets):
    """Return a new frame with Hour, Month and Day derived from 'IssuedDate'.

    'IssuedDate' is split on ',' into a date part and a time part. The date
    part is parsed with pandas' default parser and the time part with the
    '%I:%M %p' format; values that fail to parse become NaN (errors='coerce').
    The source columns ('IssuedDate', 'ViolationDescription') and the
    intermediate 'Date'/'Time' columns are removed from the result.

    The input frame is not modified: working on a copy avoids
    SettingWithCopyWarning when `tickets` is itself a slice of another frame.
    """
    out = tickets.copy()
    out[['Date', 'Time']] = out['IssuedDate'].str.split(',', expand=True)
    out['Date'] = pd.to_datetime(out['Date'].str.strip(), errors='coerce')
    out['Hour'] = pd.to_datetime(
        out['Time'].str.strip(), format='%I:%M %p', errors='coerce'
    ).dt.hour
    out['Month'] = out['Date'].dt.month
    out['Day'] = out['Date'].dt.day
    # Drop original date/time columns
    return out.drop(columns=['IssuedDate', 'ViolationDescription', 'Date', 'Time'])


# NOTE: this consumes 'IssuedDate', so re-running this cell requires re-running
# the cell that creates train_df / test_df first.
train_df = _add_issue_time_features(train_df)
test_df = _add_issue_time_features(test_df)

# -------------------------------
# 3. Frequency encoding for high-cardinality categorical features
# -------------------------------
categorical_features = ['StreetName', 'DayOfWeek', 'TimePeriod']

for col in categorical_features:
    # Frequencies are learned from the training set only, then applied to both sets.
    freq = train_df[col].value_counts() / len(train_df)
    # Missing categories in train map to NaN as well; fill them the same way as
    # in test so no NaN reaches the model.
    train_df[col + '_freq'] = train_df[col].map(freq).fillna(0)
    test_df[col + '_freq']  = test_df[col].map(freq).fillna(0)  # unseen categories → 0

# -------------------------------
# 4. Select final features
# -------------------------------
numeric_features = ['Year', 'Quarter', 'Hour', 'Month', 'Day']
encoded_features = [col + '_freq' for col in categorical_features]

features = numeric_features + encoded_features

# Explicit copies: the matrices are modified below (imputation and scaling), and
# assigning into a bare column selection raises SettingWithCopyWarning.
X_train = train_df[features].copy()
X_test  = test_df[features].copy()

# Dates/times that failed to parse were coerced to NaN above. Impute them with
# the training-set medians so the model input contains no NaN.
# NOTE(review): median imputation is a modelling choice -- confirm it is
# acceptable, or drop the affected rows instead.
train_medians = X_train[numeric_features].median()
X_train[numeric_features] = X_train[numeric_features].fillna(train_medians)
X_test[numeric_features]  = X_test[numeric_features].fillna(train_medians)

# -------------------------------
# 5. Scale numerical features
# -------------------------------
# The scaler is fitted on the training data only and reused for the test data.
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features]  = scaler.transform(X_test[numeric_features])

# -------------------------------
# 6. Train Isolation Forest
# -------------------------------
model = IsolationForest(
    n_estimators=100,
    contamination=0.05,  # assume 5% anomalies (adjust as needed)
    random_state=42
)

model.fit(X_train)

# -------------------------------
# 7. Predict anomalies on test set
# -------------------------------
# -1 = anomaly (unlikely ticket), 1 = normal (likely ticket)
test_preds = model.predict(X_test)

# Anomaly scores (higher = more normal)
test_scores = model.decision_function(X_test)

test_df['anomaly'] = test_preds
test_df['anomaly_score'] = test_scores

# -------------------------------
# 8. Inspect results
# -------------------------------
print(test_df[['StreetName', 'DayOfWeek', 'TimePeriod', 'Hour', 'anomaly', 'anomaly_score']].head())

# Example: count how many anomalies detected
print("\nNumber of anomalies detected in test set:", (test_df['anomaly'] == -1).sum())
