In [2]:
import pandas as pd # type: ignore
import numpy as np#type: ignore
from sklearn.model_selection import train_test_split#type: ignore
from sklearn.preprocessing import LabelEncoder#type: ignore
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier#type: ignore
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, classification_report#type: ignore



In [4]:
file_path = 'data.csv'
restaurant_data = pd.read_csv(file_path)

columns_to_drop = [
    'business_phone_number', 'business_location', 'Neighborhoods',
    'SF Find Neighborhoods', 'Current Police Districts', 
    'Current Supervisor Districts', 'Analysis Neighborhoods'
]
restaurant_data = restaurant_data.drop(columns=columns_to_drop)

restaurant_data = restaurant_data.dropna()

for col in ['inspection_type', 'business_city', 'business_state']:
    le = LabelEncoder()
    restaurant_data[col] = le.fit_transform(restaurant_data[col].astype(str))



In [6]:
restaurant_data['inspection_date'] = pd.to_datetime(restaurant_data['inspection_date'], errors='coerce')
restaurant_data['inspection_year'] = restaurant_data['inspection_date'].dt.year
restaurant_data['inspection_month'] = restaurant_data['inspection_date'].dt.month
restaurant_data['inspection_day'] = restaurant_data['inspection_date'].dt.day

# Filter score and risk data, dropping unnecessary columns
score_data = restaurant_data.dropna(subset=['inspection_score'])
features_score = score_data.drop(columns=['inspection_score', 'inspection_date', 'risk_category', 'violation_id', 'violation_description', 'business_name', 'business_id'])

risk_data = restaurant_data.dropna(subset=['risk_category'])
features_risk = risk_data.drop(columns=['risk_category', 'inspection_date', 'inspection_score', 'violation_id', 'violation_description', 'business_name', 'business_id'])

# Apply get_dummies to convert categorical data to numeric
features_score = pd.get_dummies(features_score)
features_risk = pd.get_dummies(features_risk)

# Align test data columns to training data for `features_score`
X_train_score, X_test_score, y_train_score, y_test_score = train_test_split(
    features_score, score_data['inspection_score'], test_size=0.9, random_state=42
)
X_test_score = X_test_score.reindex(columns=X_train_score.columns, fill_value=0)

# Align test data columns to training data for `features_risk`
X_train_risk, X_test_risk, y_train_risk, y_test_risk = train_test_split(
    features_risk, risk_data['risk_category'], test_size=0.9, random_state=42
)
X_test_risk = X_test_risk.reindex(columns=X_train_risk.columns, fill_value=0)

# Prepare data for good quality prediction
GOOD_QUALITY_THRESHOLD = 90
restaurant_data['is_good_quality'] = np.where(restaurant_data['inspection_score'] >= GOOD_QUALITY_THRESHOLD, 1, 0)
restaurant_data['good_quality_next_year'] = restaurant_data.sort_values(by=['business_id', 'inspection_date']) \
    .groupby('business_id')['is_good_quality'].shift(-1)
restaurant_data = restaurant_data.dropna(subset=['good_quality_next_year'])

features_quality_prediction = restaurant_data.drop(columns=[
    'inspection_score', 'inspection_date', 'is_good_quality', 'violation_id', 
    'violation_description', 'good_quality_next_year', 'business_name', 'business_id'
])
features_quality_prediction = pd.get_dummies(features_quality_prediction)
target_quality_prediction = restaurant_data['good_quality_next_year']

# Align test data columns to training data for `features_quality_prediction`
X_train_quality, X_test_quality, y_train_quality, y_test_quality = train_test_split(
    features_quality_prediction, target_quality_prediction, test_size=0.9, random_state=42
)
X_test_quality = X_test_quality.reindex(columns=X_train_quality.columns, fill_value=0)

  restaurant_data['inspection_date'] = pd.to_datetime(restaurant_data['inspection_date'], errors='coerce')


In [8]:
# Step 6: Train and Evaluate Models

# Model 1: Inspection Score Prediction (Regression)
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train_score, y_train_score)
y_pred_score = regressor.predict(X_test_score)
print("Inspection Score Prediction - MAE:", mean_absolute_error(y_test_score, y_pred_score))
print("Inspection Score Prediction - RMSE:", np.sqrt(mean_squared_error(y_test_score, y_pred_score)))


Inspection Score Prediction - MAE: 4.659522872602065
Inspection Score Prediction - RMSE: 6.344301205015721


In [10]:
# Model 2: Violation Risk Classification
classifier_risk = RandomForestClassifier(random_state=42)
classifier_risk.fit(X_train_risk, y_train_risk)
y_pred_risk = classifier_risk.predict(X_test_risk)
print("Violation Risk Classification - Accuracy:", accuracy_score(y_test_risk, y_pred_risk))
print("Violation Risk Classification - Report:\n", classification_report(y_test_risk, y_pred_risk))


Violation Risk Classification - Accuracy: 0.460949882494398
Violation Risk Classification - Report:
                precision    recall  f1-score   support

    High Risk       0.18      0.05      0.08      2543
     Low Risk       0.50      0.76      0.60      8919
Moderate Risk       0.39      0.23      0.28      6835

     accuracy                           0.46     18297
    macro avg       0.36      0.35      0.32     18297
 weighted avg       0.41      0.46      0.41     18297



In [12]:
# Model 3: Good Quality Prediction Next Year
classifier_quality = RandomForestClassifier(random_state=42)
classifier_quality.fit(X_train_quality, y_train_quality)
y_pred_quality = classifier_quality.predict(X_test_quality)
print("Good Quality Prediction Next Year - Accuracy:", accuracy_score(y_test_quality, y_pred_quality))
print("Good Quality Prediction Next Year - Report:\n", classification_report(y_test_quality, y_pred_quality))

Good Quality Prediction Next Year - Accuracy: 0.6951433220678432
Good Quality Prediction Next Year - Report:
               precision    recall  f1-score   support

         0.0       0.69      0.97      0.81     10501
         1.0       0.73      0.18      0.28      5477

    accuracy                           0.70     15978
   macro avg       0.71      0.57      0.54     15978
weighted avg       0.70      0.70      0.63     15978

