In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, classification_report

ModuleNotFoundError: No module named 'pandas'

In [None]:

# Load data
# NOTE: 'path_to_your_file.csv' is a placeholder; point it at the real CSV before running.
file_path = 'path_to_your_file.csv'
restaurant_data = pd.read_csv(filepath_or_buffer=file_path)

# Step 1: Data Cleaning and Feature Engineering
# Columns removed up front because they are considered irrelevant for prediction
# or are mostly missing.
columns_to_drop = [
    'business_phone_number',
    'business_location',
    'Neighborhoods',
    'SF Find Neighborhoods',
    'Current Police Districts',
    'Current Supervisor Districts',
    'Analysis Neighborhoods',
]
# drop() returns a new frame, so restaurant_data itself keeps every column.
cleaned_data = restaurant_data.drop(columns_to_drop, axis='columns')

# Filter datasets based on non-missing target columns.
# .copy() gives each task its own independent frame: the column assignments below
# then modify exactly these frames (never cleaned_data) and do not trigger
# pandas' SettingWithCopyWarning.
score_data = cleaned_data.dropna(subset=['inspection_score']).copy()
risk_data = cleaned_data.dropna(subset=['risk_category']).copy()

# Separate LabelEncoders for each dataset to handle unique values.
# Both dicts map column name -> fitted LabelEncoder so the integer codes can be
# mapped back to the original labels later (encoder.inverse_transform).
label_encoders_score = {}
label_encoders_risk = {}

for col in ['inspection_type', 'business_city', 'business_state']:
    # Missing values are replaced by the explicit label 'Unknown' and everything is
    # cast to str so the encoder always receives uniformly-typed input; a column that
    # mixes strings with float NaN can make LabelEncoder raise a TypeError.

    # Encoder for inspection score dataset
    le_score = LabelEncoder()
    score_data[col] = le_score.fit_transform(score_data[col].fillna('Unknown').astype(str))
    label_encoders_score[col] = le_score

    # Encoder for risk classification dataset
    le_risk = LabelEncoder()
    risk_data[col] = le_risk.fit_transform(risk_data[col].fillna('Unknown').astype(str))
    label_encoders_risk[col] = le_risk

# Convert 'inspection_date' to datetime and extract year, month, and day.
# errors='coerce' turns unparseable dates into NaT, which in turn yields NaN
# for the three derived columns on those rows.
for dataset in [score_data, risk_data]:
    dataset['inspection_date'] = pd.to_datetime(dataset['inspection_date'], errors='coerce')
    dataset['inspection_year'] = dataset['inspection_date'].dt.year
    dataset['inspection_month'] = dataset['inspection_date'].dt.month
    dataset['inspection_day'] = dataset['inspection_date'].dt.day

# Step 2: Define Feature Sets and Split Data for Each Task

# Columns excluded from both feature matrices: the two target columns, the raw
# date (its year/month/day parts are kept as separate columns) and the violation
# id / description columns.
non_feature_columns = [
    'inspection_score',
    'risk_category',
    'inspection_date',
    'violation_id',
    'violation_description',
]

# Task 1: Inspection Score Prediction (Regression)
features_score = score_data.drop(non_feature_columns, axis='columns')
score_split = train_test_split(
    features_score,
    score_data['inspection_score'],
    test_size=0.2,
    random_state=42,
)
X_train_score, X_test_score, y_train_score, y_test_score = score_split

# Task 2: Violation Risk Classification (Classification)
features_risk = risk_data.drop(non_feature_columns, axis='columns')
risk_split = train_test_split(
    features_risk,
    risk_data['risk_category'],
    test_size=0.2,
    random_state=42,
)
X_train_risk, X_test_risk, y_train_risk, y_test_risk = risk_split

# Step 3: New Feature - Good Quality Prediction Next Year
# Minimum inspection score for a restaurant to count as "good quality".
GOOD_QUALITY_THRESHOLD = 90

# Build a dedicated frame for this task instead of overwriting restaurant_data,
# so the raw data stays intact and this step can be re-run safely.
#  * rows without a score or a parseable date are removed first: a missing score
#    must not be labelled "not good" (NaN >= threshold evaluates to False);
#  * the frame is then reduced to ONE row per business and calendar year (the
#    chronologically last scored row of that year), so that moving one row
#    forward within a business really means moving to a later year rather than
#    to another row of the same inspection/year.
quality_data = (
    restaurant_data
    .assign(inspection_date=lambda df: pd.to_datetime(df['inspection_date'], errors='coerce'))
    .dropna(subset=['inspection_score', 'inspection_date'])
    .assign(inspection_year=lambda df: df['inspection_date'].dt.year)
    .sort_values(by=['business_id', 'inspection_date'])
    .drop_duplicates(subset=['business_id', 'inspection_year'], keep='last')
    .assign(is_good_quality=lambda df: (df['inspection_score'] >= GOOD_QUALITY_THRESHOLD).astype(int))
)

# Target: quality status of the same business in the following calendar year.
# The shifted value is kept only when the next row is exactly year + 1; gaps
# (no inspection in the following year) and each business' final year get NaN
# and are dropped.
by_business = quality_data.groupby('business_id')
following_year = by_business['inspection_year'].shift(-1)
following_quality = by_business['is_good_quality'].shift(-1)
has_next_year = following_year == (quality_data['inspection_year'] + 1)
quality_data = (
    quality_data
    .assign(good_quality_next_year=following_quality.where(has_next_year))
    .dropna(subset=['good_quality_next_year'])
)

# Encode the same three categorical columns that Step 1 encodes for the other
# tasks; the estimator cannot be fitted on raw strings.
label_encoders_quality = {}
for quality_col in ['inspection_type', 'business_city', 'business_state']:
    le_quality = LabelEncoder()
    quality_data[quality_col] = le_quality.fit_transform(
        quality_data[quality_col].fillna('Unknown').astype(str)
    )
    label_encoders_quality[quality_col] = le_quality

# Select relevant features and the new target.
# Dropped: the current score/label and target (as in the original feature list),
# helper columns created above, per-violation columns (arbitrary after collapsing
# to one row per year), and the columns Step 1 already discards as irrelevant.
# select_dtypes then removes any remaining non-numeric column (free text etc.)
# that would make the estimator's fit() fail.
features_quality_prediction = (
    quality_data
    .drop(columns=[
        'inspection_score', 'inspection_date', 'inspection_year', 'is_good_quality',
        'violation_id', 'violation_description', 'risk_category',
        'good_quality_next_year',
    ])
    .drop(columns=columns_to_drop, errors='ignore')
    .select_dtypes(include=['number', 'bool'])
)
target_quality_prediction = quality_data['good_quality_next_year']

# Split the data
X_train_quality, X_test_quality, y_train_quality, y_test_quality = train_test_split(
    features_quality_prediction, target_quality_prediction, test_size=0.2, random_state=42
)

# Step 4: Train and Evaluate Models
# Each model is a random forest with a fixed seed (random_state=42), fitted on the
# training split and scored on the held-out test split prepared in earlier steps.

# Model 1: Inspection Score Prediction (Regression)
regressor = RandomForestRegressor(random_state=42).fit(X_train_score, y_train_score)
y_pred_score = regressor.predict(X_test_score)
score_mae = mean_absolute_error(y_test_score, y_pred_score)
score_rmse = np.sqrt(mean_squared_error(y_test_score, y_pred_score))  # RMSE = sqrt(MSE)
print("Inspection Score Prediction - MAE:", score_mae)
print("Inspection Score Prediction - RMSE:", score_rmse)

# Model 2: Violation Risk Classification
classifier_risk = RandomForestClassifier(random_state=42).fit(X_train_risk, y_train_risk)
y_pred_risk = classifier_risk.predict(X_test_risk)
risk_accuracy = accuracy_score(y_test_risk, y_pred_risk)
risk_report = classification_report(y_test_risk, y_pred_risk)
print("Violation Risk Classification - Accuracy:", risk_accuracy)
print("Violation Risk Classification - Report:\n", risk_report)

# Model 3: Good Quality Prediction Next Year
classifier_quality = RandomForestClassifier(random_state=42).fit(X_train_quality, y_train_quality)
y_pred_quality = classifier_quality.predict(X_test_quality)
quality_accuracy = accuracy_score(y_test_quality, y_pred_quality)
quality_report = classification_report(y_test_quality, y_pred_quality)
print("Good Quality Prediction Next Year - Accuracy:", quality_accuracy)
print("Good Quality Prediction Next Year - Report:\n", quality_report)