# **Predicting Seattle Residents' Customer Requests**

In [88]:
!pip install scikit-learn
!pip install lightgbm xgboost

In [134]:
# General
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.preprocessing import LabelEncoder

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Cross-validation
from sklearn.model_selection import KFold, cross_validate
import time

# Ignore Unhelpful Warnings
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import RandomizedSearchCV, KFold
from scipy.stats import randint, uniform


In [90]:
# Load the raw customer-service request export. low_memory=False makes pandas
# read the file in one pass so column dtypes are inferred from the whole column.
df = pd.read_csv(
    filepath_or_buffer="Customer_Service_Requests_20250426.csv",
    low_memory=False,
)

# Preview a handful of random rows.
df.sample(n=5)

In [91]:
# Largest number of missing values found in any single feature (worst column).
max_na = df.isna().sum().max()
total_count = df.shape[0]

# Rows with a missing value in at least one feature. This is exactly the set of
# rows df.dropna() would discard, which is what the messages below describe.
# The per-column maximum above can under-count it when different rows are
# missing different features.
rows_with_na = int(df.isna().any(axis=1).sum())

# Guard against an empty frame so the percentage stays a finite number.
percent_missing = (rows_with_na / total_count) * 100 if total_count else 0.0

print(f"There are a total of {total_count} observations with {rows_with_na} observations that has at least one feature with missing data.")
print(f"{percent_missing:.2f}% of the data would be removed if we were to account for all features with missing data.")

In [92]:
# Number of missing values in each column (isnull is pandas' alias of isna).
df.isnull().sum()

In [None]:
# df = df.dropna()

### Predict the Total Service Requests in the next 3 Months

In [94]:
# Parse the creation timestamp so calendar parts can be read through `.dt`.
df['Created Date'] = pd.to_datetime(df['Created Date'])
created = df['Created Date'].dt

# Calendar features derived from the creation timestamp.
df['year'] = created.year
df['month'] = created.month
df['day'] = created.day
df['day_of_week'] = created.dayofweek  # 0 = Monday, 6 = Sunday
df['week_of_year'] = created.isocalendar().week

# Saturday (5) and Sunday (6) are flagged with 1, every other day with 0.
df['is_weekend'] = df['day_of_week'].ge(5).astype('int64')

df.sample(5)

In [105]:
# Count requests per service type and calendar day. The extra calendar columns
# are constant within a day, so grouping by them simply carries them along.
Service_Type = (
    df.groupby([
        'Service Request Type',
        'year', 'month', 'day',
        'day_of_week', 'week_of_year', 'is_weekend',
    ])['Service Request Number']
    .count()
    .rename('Request Count')
    .reset_index()
)

Service_Type.sample(30)

In [113]:
# 1. Recreate a proper datetime column from year, month, day
Service_Type['Created Date'] = pd.to_datetime(Service_Type[['year', 'month', 'day']])

# 2. Order chronologically within each service type and give the frame a clean
#    0..n-1 index, so the group-wise results below align row for row.
Service_Type = (
    Service_Type
    .sort_values(['Service Request Type', 'Created Date'])
    .reset_index(drop=True)
)

# 3. Lag features, computed separately for every service type. A plain
#    frame-wide shift() would hand the first rows of one service type the last
#    counts of the previous type.
# NOTE(review): the lags count rows, not calendar days. Days with no requests
# have no row, so shift(7) is "7 recorded days ago", not necessarily a week ago.
counts_by_type = Service_Type.groupby('Service Request Type', sort=False)['Request Count']
Service_Type['Daily_lag'] = counts_by_type.shift(1)
Service_Type['Weekly_lag'] = counts_by_type.shift(7)

# 4. Rolling statistics over the previous 7 rows of the same service type.
#    The shift(1) keeps the current row's Request Count (the prediction target)
#    out of its own features; without it the rolling mean leaks the label.
Service_Type['rolling_mean_7'] = counts_by_type.transform(
    lambda s: s.shift(1).rolling(window=7, min_periods=1).mean()
)
Service_Type['rolling_std_7'] = counts_by_type.transform(
    lambda s: s.shift(1).rolling(window=7, min_periods=1).std()
)

# Rows without enough history get 0 for the missing lag / rolling values.
Service_Type = Service_Type.fillna(0)
Service_Type[['Daily_lag', 'Weekly_lag']] = Service_Type[['Daily_lag', 'Weekly_lag']].astype(int)


Service_Type.sample(20)

# Remove the helper datetime column; the year / month / day parts stay in the frame.
Service_Type = Service_Type.drop('Created Date', axis=1)

Service_Type.sample(10)

# Pull out a single service type for a quick visual check.
safe_routes_df = Service_Type.loc[
    Service_Type['Service Request Type'].eq('Safe Routes to School')
]

safe_routes_df.head()

In [127]:
# Distinct service types; each one gets its own model further down.
service_types = Service_Type['Service Request Type'].unique()

# Result table: one row per service type, with the outcome columns created
# empty (None) until the modelling loop fills them in.
model_df = pd.DataFrame({'Service Request Type': service_types}).assign(
    **dict.fromkeys([
        'Model',
        'Mean Absolute Error',
        'Mean Absolute Percentage Error',
        'Latency',
    ])
)

model_df.sample(10)

In [None]:
# Per-service model selection and tuning.
#
# For every service type this cell:
#   a) splits the rows chronologically into 80% train / 20% hold-out test,
#   b) cross-validates four default regressors on the training part,
#   c) records the model with the lowest CV MAPE in `model_df`,
#   d) tunes that model with RandomizedSearchCV and scores it on the test part.
#
# It reads `Service_Type`, `service_types` and `model_df` from earlier cells.

# 1) Search spaces sampled by RandomizedSearchCV, keyed by the names in `models`.
param_dist = {
    'Linear Regression': {
        # `normalize` was removed from LinearRegression in scikit-learn 1.2,
        # so only the intercept is searched. The space has 2 candidates, which
        # is fewer than n_iter; scikit-learn then simply evaluates both.
        'fit_intercept': [True, False]
    },
    'Random Forest': {
        'n_estimators':      randint(50, 500),
        'max_depth':         randint(3, 20),
        'min_samples_split': randint(2, 10),
        'min_samples_leaf':  randint(1, 10),
        # None (= use all features) replaces 'auto', which was removed from
        # RandomForestRegressor in scikit-learn 1.3 and meant the same thing.
        'max_features':      [None, 'sqrt', 'log2']
    },
    'LightGBM': {
        'n_estimators':      randint(50, 500),
        'num_leaves':        randint(10, 200),
        'learning_rate':     uniform(0.01, 0.3),   # samples from [0.01, 0.31]
        'min_child_samples': randint(5, 50),
        'subsample':         uniform(0.5, 0.5)     # samples from [0.5, 1.0]
    },
    'XGBoost': {
        'n_estimators':      randint(50, 500),
        'max_depth':         randint(3, 20),
        'learning_rate':     uniform(0.01, 0.3),
        'subsample':         uniform(0.5, 0.5),
        'colsample_bytree':  uniform(0.5, 0.5)
    }
}

# 2) CV splitter and the keyword arguments shared by every random search.
# NOTE(review): shuffled folds mix earlier and later rows of a time-ordered
# series that carries lag features; consider TimeSeriesSplit if that matters.
cv_split = KFold(n_splits=5, shuffle=True, random_state=42)
search_kwargs = dict(
    scoring='neg_mean_absolute_error',
    cv=cv_split,          # was the undefined name `cv`
    n_iter=20,            # number of random draws
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# Fitted RandomizedSearchCV of the selected model, keyed by service type.
best_searches = {}

for service in service_types:
    # Fresh, unfitted estimators for every service type.
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(),
        'LightGBM': LGBMRegressor(verbosity=-1),
        'XGBoost': XGBRegressor(verbosity=0)
    }

    # Averaged CV metrics per model (named to avoid shadowing the builtin `eval`).
    cv_scores = {name: {'mae': None, 'mape': None, 'latency': None} for name in models}

    # Filter the data for the current service type.
    service_data = Service_Type[Service_Type['Service Request Type'] == service].drop(columns=['Service Request Type'])
    X = service_data.drop(columns=['Request Count'])
    y = service_data['Request Count']

    # Chronological split: the frame is ordered by date within a service type,
    # so the last 20% of rows form the hold-out test set.
    split_point = int(0.8 * len(X))
    X_train, X_test = X.iloc[:split_point], X.iloc[split_point:]
    y_train, y_test = y.iloc[:split_point], y.iloc[split_point:]

    # K-fold CV needs at least one sample per fold.
    if len(X_train) < cv_split.get_n_splits():
        print(f"Skipping {service}: only {len(X_train)} samples.")
        continue

    # b) Cross-validate every default model on the training part.
    for model_name, model in models.items():
        fold_mae, fold_mape, fold_latency = [], [], []

        for idx_train, idx_val in cv_split.split(X_train, y_train):
            X_train_f, X_val = X_train.iloc[idx_train], X_train.iloc[idx_val]
            y_train_f, y_val = y_train.iloc[idx_train], y_train.iloc[idx_val]

            # Latency covers one fit plus one predict on this fold.
            start_time = time.time()
            model.fit(X_train_f, y_train_f)
            y_pred = model.predict(X_val)
            latency = time.time() - start_time

            fold_mae.append(np.mean(np.abs(y_val - y_pred)))
            fold_mape.append(np.mean(np.abs(100.0 * (y_val - y_pred) / y_val)))
            fold_latency.append(latency)

        cv_scores[model_name]['mae'] = np.mean(fold_mae)
        cv_scores[model_name]['mape'] = np.mean(fold_mape)
        cv_scores[model_name]['latency'] = np.mean(fold_latency)

    # c) Keep the model with the lowest cross-validated MAPE.
    best_model = min(cv_scores, key=lambda name: cv_scores[name]['mape'])
    best_scores = cv_scores[best_model]

    service_row = model_df['Service Request Type'] == service
    model_df.loc[service_row, 'Model'] = best_model
    model_df.loc[service_row, 'Mean Absolute Error'] = best_scores['mae']
    model_df.loc[service_row, 'Mean Absolute Percentage Error'] = best_scores['mape']
    model_df.loc[service_row, 'Latency'] = best_scores['latency']

    # Report only the service just processed. Rows of services that are still
    # pending (or were skipped) hold None, which cannot be formatted with
    # ':.2f'. The whitespace after each backslash is part of the printed text.
    print(f"Service: {service}, Model: {best_model}, \
              MAPE: {best_scores['mape']:.2f}%, \
              Latency: {best_scores['latency']:.2f} seconds")

    # d) Tune the selected model on the training part (regression scoring).
    print(f"\n▶️ Tuning {best_model}...")
    rs = RandomizedSearchCV(
        estimator=models[best_model],   # cloned by the search, so reuse is safe
        param_distributions=param_dist[best_model],
        **search_kwargs
    )
    rs.fit(X_train, y_train)
    best_searches[service] = rs
    print(f"→ {best_model} best MAE = {-rs.best_score_:.3f}")
    print(f"  best params = {rs.best_params_}")

    # Score the refit best estimator on the untouched hold-out rows.
    print(f"\n{best_model} on TEST set:")
    start_time = time.time()
    y_pred = rs.best_estimator_.predict(X_test)
    latency = time.time() - start_time
    mae = np.mean(np.abs(y_test - y_pred))
    mape = np.mean(np.abs(100.0 * (y_test - y_pred) / y_test))
    print(f"  Test MAE = {mae:.3f}")
    print(f"  Test MAPE = {mape:.2f}%")
    print(f"  Predict latency = {latency:.2f} seconds")

In [135]:
# 1) Base models and their search spaces, defined once outside the service loop.
#    Each entry maps a display name to (unfitted estimator, parameter space).
base_models = {
    'Linear Regression': (
        LinearRegression(),
        {
            # `normalize` was removed from LinearRegression in scikit-learn 1.2.
            # With only 2 candidates the search evaluates both instead of n_iter.
            'fit_intercept': [True, False]
        }
    ),
    'Random Forest': (
        RandomForestRegressor(random_state=42),
        {
            'n_estimators':       randint(50, 300),
            'max_depth':          randint(3, 20),
            'min_samples_split':  randint(2, 10),
            'min_samples_leaf':   randint(1, 10),
            # None (= all features) replaces 'auto', removed in scikit-learn 1.3.
            'max_features':       [None, 'sqrt', 'log2']
        }
    ),
    'LightGBM': (
        LGBMRegressor(verbosity=-1, random_state=42),
        {
            'n_estimators':       randint(50, 300),
            'num_leaves':         randint(10, 200),
            'learning_rate':      uniform(0.01, 0.3),
            'min_child_samples':  randint(5, 50),
            'subsample':          uniform(0.5, 0.5)
        }
    ),
    'XGBoost': (
        XGBRegressor(verbosity=0, random_state=42),
        {
            'n_estimators':       randint(50, 300),
            'max_depth':          randint(3, 20),
            'learning_rate':      uniform(0.01, 0.3),
            'subsample':          uniform(0.5, 0.5),
            'colsample_bytree':   uniform(0.5, 0.5)
        }
    ),
}

# 2) Tune every base model on each service type's own data.
for service in service_types:
    # Rows of this service type only; the type label itself is not a feature.
    service_data = Service_Type[Service_Type['Service Request Type'] == service].drop(columns=['Service Request Type'])
    X_service = service_data.drop(columns=['Request Count'])
    y_service = service_data['Request Count']

    # Chronological 80/20 split: the last 20% of rows are the hold-out set.
    split_point = int(0.8 * len(X_service))
    X_train_service, X_test_service = X_service.iloc[:split_point], X_service.iloc[split_point:]
    y_train_service, y_test_service = y_service.iloc[:split_point], y_service.iloc[split_point:]

    # 5-fold CV inside the search needs at least 5 training rows.
    if len(X_train_service) < 5:
        print(f"Skipping {service}: only {len(X_train_service)} samples.")
        continue

    for model_name, (estimator, param_grid) in base_models.items():
        # Randomized search over this one model's parameter space.
        rs = RandomizedSearchCV(
            estimator=estimator,
            param_distributions=param_grid,
            n_iter=20,
            cv=5,
            scoring='neg_mean_absolute_error',
            random_state=42,
            n_jobs=-1,
            verbose=0
        )

        # Fit the search on the training part of this service type's data.
        rs.fit(X_train_service, y_train_service)

        # Evaluate the refit best estimator on the hold-out rows.
        best = rs.best_estimator_
        y_pred = best.predict(X_test_service)
        mae  = np.mean(np.abs(y_test_service - y_pred))
        mape = np.mean(np.abs(100*(y_test_service - y_pred)/y_test_service))
        lat  = rs.refit_time_  # seconds spent refitting the best estimator

        # Store the tuned metrics in model_df.
        # NOTE(review): the mask matches only the row whose 'Model' already
        # equals model_name, so the metrics of the other tuned models are
        # discarded. Confirm that this is the intended behaviour.
        model_df.loc[
            (model_df['Service Request Type']==service) &
            (model_df['Model']==model_name),
            ['Mean Absolute Error','Mean Absolute Percentage Error','Latency']
        ] = [mae, mape, lat]

    print(f"Finished tuning all models for {service}")

In [133]:
# Keep only the service types for which a model was actually selected.
model_df = model_df.dropna(subset=['Model'])
model_df