# **Predictive Modeling**

# **Imports**

In [1]:
#Numpy
import numpy as np

#Pandas
import pandas as pd

#Seaborn
import seaborn as sns

#matplotlib
import matplotlib.pyplot as plt
import plotly
%matplotlib inline

# Standard imports
import re


#sklearn
from sklearn import preprocessing
from sklearn import set_config
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer 
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, classification_report
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

# warnings
import warnings
warnings.filterwarnings("ignore")
set_config(display = 'diagram')

# **Loading Data**

In [2]:
# Loading NYC Restaraunt Data
rdf = pd.read_csv('rdf')

In [3]:
# Glance of inspection data
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209141 entries, 0 to 209140
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CAMIS                  209141 non-null  int64  
 1   DBA                    208544 non-null  object 
 2   BORO                   209141 non-null  object 
 3   BUILDING               208824 non-null  object 
 4   STREET                 209135 non-null  object 
 5   ZIPCODE                206455 non-null  float64
 6   CUISINE DESCRIPTION    206702 non-null  object 
 7   INSPECTION DATE        209141 non-null  object 
 8   ACTION                 206702 non-null  object 
 9   VIOLATION CODE         205553 non-null  object 
 10  VIOLATION DESCRIPTION  205553 non-null  object 
 11  CRITICAL FLAG          209141 non-null  object 
 12  SCORE                  199209 non-null  float64
 13  GRADE                  102462 non-null  object 
 14  GRADE DATE             93826 non-nul

# **Data Cleaning**

In [4]:
# Dropping irrelevant columns that do not affect predicting the classification question of will a restaurant pass an inspection
rdf.drop(['Latitude', 'Longitude', 'Council District', 'ZIPCODE', 'BUILDING', 'STREET', 'VIOLATION DESCRIPTION', 'VIOLATION CODE', 'CRITICAL FLAG', 'ACTION', 'GRADE'], axis=1, inplace=True)

In [5]:
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209141 entries, 0 to 209140
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   CAMIS                209141 non-null  int64  
 1   DBA                  208544 non-null  object 
 2   BORO                 209141 non-null  object 
 3   CUISINE DESCRIPTION  206702 non-null  object 
 4   INSPECTION DATE      209141 non-null  object 
 5   SCORE                199209 non-null  float64
 6   GRADE DATE           93826 non-null   object 
 7   INSPECTION TYPE      206702 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 12.8+ MB


In [6]:
# Convert the 'GRADE DATE' and ' 'INSPECTION DATE' columns to datetime type
#rdf['INSPECTION DATE'] = pd.to_datetime(rdf['INSPECTION DATE'])
#rdf['GRADE DATE'] = pd.to_datetime(rdf['GRADE DATE'])

In [7]:
# Filter to eliminate restaurants that have not yet recieved their inspection
def filter_inspected_restaurants(rdf):
    # Filter out rows with '1/1/1900' date
    filtered_dataframe = rdf[rdf['INSPECTION DATE'] != '1900-01-01']

    return filtered_dataframe
# Applying the filter function to the column Inpection date
rdf = filter_inspected_restaurants(rdf)

In [8]:
# Display the number of duplicate rows in the dataset
print(f'There are {rdf.duplicated().sum()} duplicate rows.')

There are 139254 duplicate rows.


In [9]:
#Dropping duplicated Rows
rdf = rdf.drop_duplicates()

In [10]:
# Comfirming there are no more duplicates 
print(f'There are {rdf.duplicated().sum()} duplicate rows.')

There are 0 duplicate rows.


In [11]:
#Checking for missing values
rdf[rdf.isna().any(axis=1)].shape

(31379, 8)

In [12]:
#Identifying the columns containing missing values
rdf.isna().sum()

CAMIS                      0
DBA                      597
BORO                       0
CUISINE DESCRIPTION     2439
INSPECTION DATE            0
SCORE                   8546
GRADE DATE             31376
INSPECTION TYPE         2439
dtype: int64

In [13]:
# checking missing data in data 
def check_missing_data(rdf):
    total = rdf.isnull().sum().sort_values(ascending = False)
    percent = (rdf.isnull().sum()/rdf.isnull().count()*100).sort_values(ascending = False)
    percent = percent.round(2).astype(str) + '%'
    missing_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data
missing_data_result = check_missing_data(rdf)
missing_data_result.head()

Unnamed: 0,Total,Percent
GRADE DATE,31376,44.9%
SCORE,8546,12.23%
CUISINE DESCRIPTION,2439,3.49%
INSPECTION TYPE,2439,3.49%
DBA,597,0.85%


In [14]:
# Function to clean restaurant names
def clean_restaurant_name(name):
    # Check if the input is a string
    if isinstance(name, str):
        # Remove store numbers using regular expression
        cleaned_name = re.sub(r'#\d+', '', name)
        
        # Remove all numeric characters
        cleaned_name = re.sub(r'\d+', '', cleaned_name)
    
        # Remove leading and trailing whitespace
        cleaned_name = cleaned_name.strip()
    
        return cleaned_name
    else:
        return name  # Return the input unchanged for non-string values

# Apply the clean_restaurant_name function to the 'DBA' column
rdf['Cleaned Restaurant Name'] = rdf['DBA'].apply(clean_restaurant_name)

# Drop the original 'DBA' column if you want
rdf.drop(columns=['DBA'], inplace=True)

In [15]:
def eliminate_missing_grade_date_rows(rdf):
    """
    Eliminate rows with missing data in the "GRADE DATE" column.

    Returns:
    - Cleaned DataFrame without rows with missing "GRADE DATE"
    """
    # Drop rows with missing values in the "GRADE DATE" column
    cleaned_dataframe = rdf.dropna(subset=["GRADE DATE"])

    return cleaned_dataframe

rdf = eliminate_missing_grade_date_rows(rdf)

In [16]:
# Filtering out only the initial inspections and re-inspections
def filter_inspection_type(rdf):
    """
    Filter rows based on the values in the "INSPECTION TYPE" column.

    Returns:
    - DataFrame containing only rows with specified inspection types
    """
    inspection_types_to_keep = ['Cycle Inspection / Initial Inspection', 'Cycle Inspection / Re-inspection']
    filtered_rdf = rdf.loc[rdf['INSPECTION TYPE'].isin(inspection_types_to_keep)]

    return filtered_rdf

rdf = filter_inspection_type(rdf)

In [17]:
# Function for value counts of the columns 
for col in rdf.columns:
    print(rdf[col].value_counts())

50001249    5
40879245    5
50044250    5
50035292    5
40717234    5
           ..
50032876    1
50079992    1
50110649    1
50085970    1
50070120    1
Name: CAMIS, Length: 18859, dtype: int64
Manhattan        11620
Brooklyn          8002
Queens            7201
Bronx             2705
Staten Island     1163
Name: BORO, dtype: int64
American                    6631
Coffee/Tea                  2515
Chinese                     2289
Pizza                       1855
Bakery Products/Desserts    1184
                            ... 
Californian                    2
Basque                         2
Chilean                        2
Iranian                        1
Lebanese                       1
Name: CUISINE DESCRIPTION, Length: 87, dtype: int64
03/03/2022    105
02/28/2023    104
12/01/2022    103
02/27/2023    103
01/24/2022    103
             ... 
05/27/2023      1
06/29/2016      1
05/28/2016      1
09/20/2021      1
02/27/2018      1
Name: INSPECTION DATE, Length: 1387, dtype: int64
12

In [18]:
# Selecting subset of only restaurants from Brooklyn
rdf = rdf[rdf['BORO'].str.lower() == 'brooklyn']

In [19]:
# Selecting subset of only restaurants in Brooklyn that have pizza in the cuisine description
rdf = rdf[(rdf['CUISINE DESCRIPTION'].str.lower() == 'pizza')]

In [20]:
# Example: Handling missing values and date columns
rdf['INSPECTION DATE'] = pd.to_datetime(rdf['INSPECTION DATE'], errors='coerce')
rdf['year'] = rdf['INSPECTION DATE'].dt.year
rdf['month'] = rdf['INSPECTION DATE'].dt.month
rdf['day'] = rdf['INSPECTION DATE'].dt.day

In [21]:
# Dropping 'BORO' and 'CUISINE DESCRIPTION' now that only the pizza restaurants in Brooklyn remain
rdf.drop(['BORO', 'CUISINE DESCRIPTION', 'INSPECTION DATE', 'GRADE DATE', 'INSPECTION TYPE'], axis=1, inplace=True)

In [22]:
#rdf = pd.get_dummies(rdf.drop(['date_column'], axis=1), columns=['Cleaned Restaurant Name'])

In [23]:
# Looking at the remaining data after cleaning
rdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 528 entries, 143 to 208403
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   CAMIS                    528 non-null    int64  
 1   SCORE                    528 non-null    float64
 2   Cleaned Restaurant Name  528 non-null    object 
 3   year                     528 non-null    int64  
 4   month                    528 non-null    int64  
 5   day                      528 non-null    int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 28.9+ KB


# **Validation Split**

In [24]:
# Defining the target variable
X= rdf.drop('SCORE', axis=1)
y= rdf['SCORE']

In [25]:
# Split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **Data Preprocessing**

In [26]:
# OneHotEncoder
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Standard Scaler
scaler = StandardScaler()
# Imputer
imputer = SimpleImputer(strategy='mean')
# Instantiate the selectors
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')
# Forming pipelines
num_pipe = make_pipeline(imputer, scaler)
cat_pipe = make_pipeline(ohe)
# Column Transformer to apply different transformers to different subsets of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_selector),
        ('cat', cat_pipe, cat_selector)
    ]
)

# Random Forest Classifier as a transformer
rf_transformer = RandomForestClassifier()

# Final pipeline combining preprocessing and modeling
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf_transformer', rf_transformer)
])

# Now you can use the pipeline for fitting and predicting
# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)

In [27]:
#Create Tuples
num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)

In [28]:
#Instantiate preprocessor Transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, make_column_selector(dtype_include='number')),
        ('cat', ohe, make_column_selector(dtype_include='object'))
    ]
)

In [29]:
#Creating the pipeline
pipe = Pipeline([('preprocessor', preprocessor), ('rf_transformer', rf_transformer)])
pipe

In [30]:
#Fit on the Train
pipe.fit(X_train, y_train)

In [31]:
#Predictions
y_pred = pipe.predict(X_test)
y_pred

array([12., 12.,  4., 12.,  9., 11., 12., 13., 12., 10., 12., 12., 12.,
       10., 12., 12., 13.,  7., 10., 13., 13., 12., 27., 12., 11.,  7.,
       13.,  9., 12., 13., 13., 10.,  7., 11., 12., 10., 13., 12., 12.,
       12.,  9., 13., 12., 12.,  9., 12.,  7., 13., 12., 13., 12., 12.,
       12., 13., 12., 12., 13., 13., 10.,  3., 13., 12., 12., 12., 12.,
       12., 12., 10., 12., 13., 12., 12.,  0., 12.,  3., 12., 12., 12.,
        2., 12., 13.,  4.,  9., 17., 12.,  9.,  4., 10., 12., 12., 12.,
       12., 12., 12.,  0., 12.,  7., 12., 13., 23., 13.,  3., 10., 13.,
       13., 12.])

In [32]:
#Accuracy score
accuracy_score(y_test,y_pred)

0.11320754716981132

In [33]:
# Step 3: Hyperparameter Tuning using Grid Search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_classifier = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Step 4: Evaluate on Test Set
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

# Display the accuracy and classification report
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 5: Feature Importance
# Display feature importance
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': best_rf_model.feature_importances_})
print("\nFeature Importance:\n", feature_importance.sort_values(by='Importance', ascending=False))

ValueError: 
All the 540 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
108 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\davyd\anaconda3\envs\dojo-env\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\davyd\anaconda3\envs\dojo-env\lib\site-packages\sklearn\ensemble\_forest.py", line 331, in fit
    X, y = self._validate_data(
  File "C:\Users\davyd\anaconda3\envs\dojo-env\lib\site-packages\sklearn\base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\davyd\anaconda3\envs\dojo-env\lib\site-packages\sklearn\utils\validation.py", line 1074, in check_X_y
    X = check_array(
  File "C:\Users\davyd\anaconda3\envs\dojo-env\lib\site-packages\sklearn\utils\validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "C:\Users\davyd\anaconda3\envs\dojo-env\lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: "DOMINO'S"

--------------------------------------------------------------------------------
432 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\davyd\anaconda3\envs\dojo-env\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\davyd\anaconda3\envs\dojo-env\lib\site-packages\sklearn\ensemble\_forest.py", line 331, in fit
    X, y = self._validate_data(
  File "C:\Users\davyd\anaconda3\envs\dojo-env\lib\site-packages\sklearn\base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\davyd\anaconda3\envs\dojo-env\lib\site-packages\sklearn\utils\validation.py", line 1074, in check_X_y
    X = check_array(
  File "C:\Users\davyd\anaconda3\envs\dojo-env\lib\site-packages\sklearn\utils\validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "C:\Users\davyd\anaconda3\envs\dojo-env\lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: "JOE'S PIZZA"



- Load your dataset, replace 'your_data.csv' with your data file. data = pd.read_csv('your_data.csv') 
- Assuming your dataset has various features and 'GRADE' as the target variable. 
- Replace these with your actual feature and target column names. 
- Preprocess the data X = data.drop(columns=['GRADE'])  
- Features y = data['GRADE']  
- Target variable  
- Handle missing data (you may need more advanced methods) X.fillna(0, inplace=True)  
- Encode categorical variables (if needed) categorical_columns = X.select_dtypes(include=['object']).columns for col in categorical_columns:     le = LabelEncoder()     X[col] = le.fit_transform(X[col])  
- Split the data into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
- Feature scaling (normalize numerical features) scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) 
- Choose a classification model (Random Forest in this example) model = RandomForestClassifier(random_state=42) 
- Hyperparameter tuning using GridSearchCV (you may need more parameters) param_grid = {     'n_estimators': [100, 200],     'max_depth': [None, 10, 20],     'min_samples_split': [2, 5],     'min_samples_leaf': [1, 2] }  grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1) grid_search.fit(X_train, y_train)
- Get the best model from hyperparameter tuning best_model = grid_search.best_estimator_ 
- Train the best model best_model.fit(X_train, y_train)  # Make predictions on the test set y_pred = best_model.predict(X_test) 
- Evaluate the model accuracy = accuracy_score(y_test, y_pred) print(f'Accuracy: {accuracy:.2f}')  
- Print classification report for more detailed evaluation print(classification_report(y_test, y_pred)) 
- Use the trained model to make predictions on new data new_data = pd.DataFrame({     'Feature1': [value1],     'Feature2': [value2],   
- Add more features as needed })  
- Preprocess new data similarly to training data (handle missing values, encoding, scaling) new_data.fillna(0, inplace=True) new_data[categorical_columns] = le.transform(new_data[categorical_columns]) new_data = scaler.transform(new_data)  predicted_grade = best_model.predict(new_data) print(f'Predicted Grade: {predicted_grade[0]}')