# **Predictive Modeling**

Joseph Lardie

November 2023

# **Imports**

In [1]:
#Numpy
import numpy as np

#Pandas
import pandas as pd

#Seaborn
import seaborn as sns

#matplotlib
import matplotlib.pyplot as plt
import plotly
%matplotlib inline

# Standard imports
import re

#sklearn
from sklearn import preprocessing
from sklearn import set_config
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer 
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, classification_report
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

# warnings
import warnings
warnings.filterwarnings("ignore")
set_config(display = 'diagram')

# **Loading Data**

In [2]:
# Loading NYC Restaraunt Inspections Data
rdf = pd.read_csv('rdf')

In [3]:
# Glance of Inspections Data
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210611 entries, 0 to 210610
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CAMIS                  210611 non-null  int64  
 1   DBA                    209970 non-null  object 
 2   BORO                   210611 non-null  object 
 3   BUILDING               210308 non-null  object 
 4   STREET                 210607 non-null  object 
 5   ZIPCODE                207890 non-null  float64
 6   CUISINE DESCRIPTION    208186 non-null  object 
 7   INSPECTION DATE        210611 non-null  object 
 8   ACTION                 208186 non-null  object 
 9   VIOLATION CODE         207055 non-null  object 
 10  VIOLATION DESCRIPTION  207055 non-null  object 
 11  CRITICAL FLAG          210611 non-null  object 
 12  SCORE                  200542 non-null  float64
 13  GRADE                  103135 non-null  object 
 14  GRADE DATE             94324 non-nul

# **Data Cleaning**

In [4]:
# Dropping irrelevant columns that do not affect predicting the classification question of will a restaurant pass an inspection
rdf.drop(['Latitude', 'Longitude', 'Council District', 'ZIPCODE', 'BUILDING', 'STREET', 'VIOLATION DESCRIPTION', 'VIOLATION CODE', 'CRITICAL FLAG', 'ACTION', 'GRADE'], axis=1, inplace=True)

In [5]:
# Inspection data after dropping irrelevant columns
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210611 entries, 0 to 210610
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   CAMIS                210611 non-null  int64  
 1   DBA                  209970 non-null  object 
 2   BORO                 210611 non-null  object 
 3   CUISINE DESCRIPTION  208186 non-null  object 
 4   INSPECTION DATE      210611 non-null  object 
 5   SCORE                200542 non-null  float64
 6   GRADE DATE           94324 non-null   object 
 7   INSPECTION TYPE      208186 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 12.9+ MB


In [6]:
# Filter to eliminate restaurants that have not yet recieved their inspection
def filter_inspected_restaurants(rdf):
    # Filter out rows with '1/1/1900' date
    filtered_dataframe = rdf[rdf['INSPECTION DATE'] != '1900-01-01']

    return filtered_dataframe
# Applying the filter function to the column Inpection date
rdf = filter_inspected_restaurants(rdf)

In [7]:
# Display the number of duplicate rows in the dataset
print(f'There are {rdf.duplicated().sum()} duplicate rows.')

There are 140333 duplicate rows.


In [8]:
#Dropping duplicated Rows
rdf = rdf.drop_duplicates()

In [9]:
# Comfirming there are no more duplicates 
print(f'There are {rdf.duplicated().sum()} duplicate rows.')

There are 0 duplicate rows.


In [10]:
#Checking for missing values
rdf[rdf.isna().any(axis=1)].shape

(31613, 8)

In [11]:
#Identifying the columns containing missing values
rdf.isna().sum()

CAMIS                      0
DBA                      641
BORO                       0
CUISINE DESCRIPTION     2425
INSPECTION DATE            0
SCORE                   8636
GRADE DATE             31610
INSPECTION TYPE         2425
dtype: int64

In [12]:
# Function to clean restaurant names
def clean_restaurant_name(name):
    # Check if the input is a string
    if isinstance(name, str):
        # Remove store numbers using regular expression
        cleaned_name = re.sub(r'#\d+', '', name)
        
        # Remove all numeric characters
        cleaned_name = re.sub(r'\d+', '', cleaned_name)
    
        # Remove leading and trailing whitespace
        cleaned_name = cleaned_name.strip()
    
        return cleaned_name
    else:
        return name  # Return the input unchanged for non-string values

# Apply the clean_restaurant_name function to the 'DBA' column
rdf['Cleaned Restaurant Name'] = rdf['DBA'].apply(clean_restaurant_name)

# Drop the original 'DBA' column if you want
rdf.drop(columns=['DBA'], inplace=True)

In [13]:
rdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70278 entries, 0 to 210588
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   CAMIS                    70278 non-null  int64  
 1   BORO                     70278 non-null  object 
 2   CUISINE DESCRIPTION      67853 non-null  object 
 3   INSPECTION DATE          70278 non-null  object 
 4   SCORE                    61642 non-null  float64
 5   GRADE DATE               38668 non-null  object 
 6   INSPECTION TYPE          67853 non-null  object 
 7   Cleaned Restaurant Name  69637 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 4.8+ MB


In [14]:
# Filtering out only the initial inspections and re-inspections
def filter_inspection_type(rdf):
    """
    Filter rows based on the values in the "INSPECTION TYPE" column.

    Returns:
    - DataFrame containing only rows with specified inspection types
    """
    inspection_types_to_keep = ['Cycle Inspection / Initial Inspection', 'Cycle Inspection / Re-inspection']
    filtered_rdf = rdf.loc[rdf['INSPECTION TYPE'].isin(inspection_types_to_keep)]

    return filtered_rdf

rdf = filter_inspection_type(rdf)

In [15]:
# Function for value counts of the columns 
for col in rdf.columns:
    print(rdf[col].value_counts())

40398688    13
50042800    11
50044250    11
40879245    10
41087273     9
            ..
41718160     1
50115103     1
50105346     1
41362553     1
40799593     1
Name: CAMIS, Length: 20406, dtype: int64
Manhattan        17621
Brooklyn         12461
Queens           10897
Bronx             4228
Staten Island     1781
Name: BORO, dtype: int64
American                    9392
Chinese                     3958
Coffee/Tea                  3479
Pizza                       2940
Bakery Products/Desserts    1814
                            ... 
Chilean                        4
Fruits/Vegetables              4
Basque                         3
Iranian                        2
Haute Cuisine                  2
Name: CUISINE DESCRIPTION, Length: 87, dtype: int64
03/16/2023    144
04/13/2023    137
02/16/2023    136
02/15/2023    136
06/06/2023    132
             ... 
06/01/2017      1
06/15/2018      1
12/17/2016      1
05/29/2016      1
02/21/2017      1
Name: INSPECTION DATE, Length: 1498, dtyp

## **Selecting Brooklyn Restaurants**

In [16]:
# Selecting subset of only restaurants from Brooklyn
rdf = rdf[rdf['BORO'].str.lower() == 'brooklyn']

In [17]:
# Handling missing values and date columns
rdf['INSPECTION DATE'] = pd.to_datetime(rdf['INSPECTION DATE'], errors='coerce')
rdf['year'] = rdf['INSPECTION DATE'].dt.year
rdf['month'] = rdf['INSPECTION DATE'].dt.month
rdf['day'] = rdf['INSPECTION DATE'].dt.day

In [18]:
# Dropping 'BORO' and 'CUISINE DESCRIPTION' now that only restaurants in Brooklyn remain
rdf.drop(['BORO', 'CUISINE DESCRIPTION', 'INSPECTION DATE', 'GRADE DATE', 'INSPECTION TYPE'], axis=1, inplace=True)

In [19]:
# Looking at the remaining data after cleaning
rdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12461 entries, 41 to 210374
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   CAMIS                    12461 non-null  int64  
 1   SCORE                    12461 non-null  float64
 2   Cleaned Restaurant Name  12461 non-null  object 
 3   year                     12461 non-null  int64  
 4   month                    12461 non-null  int64  
 5   day                      12461 non-null  int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 681.5+ KB


# **Validation Split**

In [20]:
# Defining the target variable
X= rdf.drop('SCORE', axis=1)
y= rdf['SCORE']

# **Data Preprocessing**

In [21]:
# Split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# OneHotEncoder
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Imputer
imputer = SimpleImputer(strategy='mean')
# Standard Scaler
scaler = StandardScaler()
# Instantiate the selectors
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')
# Forming pipelines
num_pipe = make_pipeline(imputer, scaler)
cat_pipe = make_pipeline(ohe)

#Create Tuples
num_tuple = ('num', num_pipe, num_selector)
cat_tuple = ('cat', cat_pipe, cat_selector)

# Column Transformer to apply different transformers to different subsets of columns
preprocessor = ColumnTransformer(
    transformers=[num_tuple, cat_tuple]
)

# LogisticRegression as a transformer
lr_transformer = LogisticRegression()

# Final pipeline combining preprocessing and modeling
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('lr_transformer', lr_transformer)
])

In [23]:
#Creating the pipeline
pipe = Pipeline([('preprocessor', preprocessor), ('lr_transformer', lr_transformer)])
pipe

In [24]:
#Fit on the Train
pipe.fit(X_train, y_train)

In [25]:
#Predictions
y_pred = pipe.predict(X_test)
y_pred

array([13., 12.,  2., ...,  9.,  2.,  2.])

In [26]:
#Accuracy score
accuracy_score(y_test,y_pred)

0.09787404733253109

In [27]:
# Check for missing values
print("Missing values in X_train:", X_train.isnull().sum().sum())
print("Missing values in y_train:", y_train.isnull().sum().sum())

# Check data types
print("Data types in X_train:\n", X_train.dtypes)
print("Data type in y_train:", y_train.dtypes)


Missing values in X_train: 0
Missing values in y_train: 0
Data types in X_train:
 CAMIS                       int64
Cleaned Restaurant Name    object
year                        int64
month                       int64
day                         int64
dtype: object
Data type in y_train: float64


In [28]:
# Hyperparameter Tuning using Grid Search
param_grid = {
    'rf_transformer__n_estimators': [50, 100, 200],
    'rf_transformer__max_depth': [2, 5, 15, 20],
    'rf_transformer__min_samples_split': [2, 5, 10],
    'rf_transformer__min_samples_leaf': [1, 2, 4]
}

lr_transformer = LogisticRegression(random_state=42)

grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', error_score='raise')
grid_search.fit(X_train, y_train)

# Display the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

best_lg_model = grid_search.best_estimator_
y_pred = best_lg_model.predict(X_test)

# Evaluate on Test Set
# Extract feature importances from the RandomForestClassifier within the pipeline
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_lr_model.named_steps['rf_transformer'].feature_importances_
})
print("\nFeature Importance:\n", feature_importance.sort_values(by='Importance', ascending=False))

# Display the accuracy and classification report
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Display feature importance
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': best_lr_model.feature_importances_})
print("\nFeature Importance:\n", feature_importance.sort_values(by='Importance', ascending=False))

ValueError: Invalid parameter 'rf_transformer' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000281E271CD60>),
                                                 ('cat',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000281E271C3A0>)])),
                ('lr_transformer', LogisticRegression())]). Valid parameters are: ['memory', 'steps', 'verbose'].

- Hyperparameter tuning using GridSearchCV (you may need more parameters) param_grid = {     'n_estimators': [100, 200],     'max_depth': [None, 10, 20],     'min_samples_split': [2, 5],     'min_samples_leaf': [1, 2] }  grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1) grid_search.fit(X_train, y_train)
- Get the best model from hyperparameter tuning best_model = grid_search.best_estimator_ 
- Train the best model best_model.fit(X_train, y_train)  # Make predictions on the test set y_pred = best_model.predict(X_test) 
- Evaluate the model accuracy = accuracy_score(y_test, y_pred) print(f'Accuracy: {accuracy:.2f}')  
- Print classification report for more detailed evaluation print(classification_report(y_test, y_pred)) 
- Use the trained model to make predictions on new data new_data = pd.DataFrame({     'Feature1': [value1],     'Feature2': [value2],   
- Add more features as needed })  
- Preprocess new data similarly to training data (handle missing values, encoding, scaling) new_data.fillna(0, inplace=True) new_data[categorical_columns] = le.transform(new_data[categorical_columns]) new_data = scaler.transform(new_data)  predicted_grade = best_model.predict(new_data) print(f'Predicted Grade: {predicted_grade[0]}')