# **Predictive Modeling**

# **Imports**

In [1]:
#Numpy
import numpy as np

#Pandas
import pandas as pd

#Seaborn
import seaborn as sns

#matplotlib
import matplotlib.pyplot as plt
import plotly
%matplotlib inline

# Standard imports
import re


#sklearn
from sklearn import preprocessing
from sklearn import set_config
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer 
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, classification_report
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# warnings
import warnings
from sklearn.decomposition import PCA
warnings.filterwarnings("ignore")
set_config(display = 'diagram')

# **Loading Data**

In [2]:
# Loading NYC Restaraunt Data
rdf = pd.read_csv('rdf')

In [3]:
# Glance of inspection data
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208616 entries, 0 to 208615
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CAMIS                  208616 non-null  int64  
 1   DBA                    208042 non-null  object 
 2   BORO                   208616 non-null  object 
 3   BUILDING               208270 non-null  object 
 4   STREET                 208609 non-null  object 
 5   ZIPCODE                205933 non-null  float64
 6   CUISINE DESCRIPTION    206203 non-null  object 
 7   INSPECTION DATE        208616 non-null  object 
 8   ACTION                 206203 non-null  object 
 9   VIOLATION CODE         205056 non-null  object 
 10  VIOLATION DESCRIPTION  205056 non-null  object 
 11  CRITICAL FLAG          208616 non-null  object 
 12  SCORE                  198750 non-null  float64
 13  GRADE                  102151 non-null  object 
 14  GRADE DATE             93552 non-nul

# **Data Cleaning**

In [4]:
# Dropping irrelevant columns that do not affect to predicting the classification question of will a restaurant pass an inspection
rdf.drop(['Latitude', 'Longitude', 'Council District', 'ZIPCODE', 'BUILDING', 'STREET'], axis=1, inplace=True)

In [5]:
# Convert the 'GRADE DATE' column to datetime type
rdf['INSPECTION DATE'] = pd.to_datetime(rdf['INSPECTION DATE'])
rdf['GRADE DATE'] = pd.to_datetime(rdf['GRADE DATE'])

In [6]:
# Filter to eliminate restaurants that have not yet recieved their inspection
def filter_inspected_restaurants(rdf):
    # Filter out rows with '1/1/1900' date
    filtered_dataframe = rdf[rdf['INSPECTION DATE'] != '1900-01-01']

    return filtered_dataframe
# Applying the filter function to the column Inpection date
rdf = filter_inspected_restaurants(rdf)

In [7]:
# Display the number of duplicate rows in the dataset
print(f'There are {rdf.duplicated().sum()} duplicate rows.')

There are 6 duplicate rows.


In [8]:
#Dropping duplicated Rows
rdf = rdf.drop_duplicates()

In [9]:
# Comfirming there are no more duplicates 
print(f'There are {rdf.duplicated().sum()} duplicate rows.')

There are 0 duplicate rows.


In [10]:
#Checking for missing values
rdf[rdf.isna().any(axis=1)].shape

(113078, 13)

In [11]:
#Identifying the columns containing missing values
rdf.isna().sum()

CAMIS                         0
DBA                           4
BORO                          0
CUISINE DESCRIPTION           0
INSPECTION DATE               0
ACTION                        0
VIOLATION CODE             1147
VIOLATION DESCRIPTION      1147
CRITICAL FLAG                 0
SCORE                      7453
GRADE                    104046
GRADE DATE               112645
INSPECTION TYPE               0
dtype: int64

In [12]:
# checking missing data in data 
def check_missing_data(rdf):
    total = rdf.isnull().sum().sort_values(ascending = False)
    percent = (rdf.isnull().sum()/rdf.isnull().count()*100).sort_values(ascending = False)
    percent = percent.round(2).astype(str) + '%'
    missing_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data
missing_data_result = check_missing_data(rdf)
missing_data_result.head()

Unnamed: 0,Total,Percent
GRADE DATE,112645,54.63%
GRADE,104046,50.46%
SCORE,7453,3.61%
VIOLATION CODE,1147,0.56%
VIOLATION DESCRIPTION,1147,0.56%


In [13]:
# Filtering out rows that are missing grade
def filter_missing_grade(rdf):
    filtered_dataframe = rdf.dropna(subset=['GRADE'])

    return filtered_dataframe
# Applying the filter function to the column Grade
rdf = filter_missing_grade(rdf)

In [14]:
# Function to clean restaurant names
def clean_restaurant_name(name):
    # Check if the input is a string
    if isinstance(name, str):
        # Remove store numbers using regular expression
        cleaned_name = re.sub(r'#\d+', '', name)
        
        # Remove all numeric characters
        cleaned_name = re.sub(r'\d+', '', cleaned_name)
    
        # Remove leading and trailing whitespace
        cleaned_name = cleaned_name.strip()
    
        return cleaned_name
    else:
        return name  # Return the input unchanged for non-string values

# Apply the clean_restaurant_name function to the 'DBA' column
rdf['Cleaned Restaurant Name'] = rdf['DBA'].apply(clean_restaurant_name)

# Drop the original 'DBA' column if you want
rdf.drop(columns=['DBA'], inplace=True)

In [15]:
# Function for value counts of the columns 
for col in rdf.columns:
    print(rdf[col].value_counts())

40365904    25
50045647    24
50111296    23
40398688    22
41658324    22
            ..
50035269     1
50044601     1
50127802     1
41416592     1
50044631     1
Name: CAMIS, Length: 25925, dtype: int64
Manhattan        38107
Brooklyn         26302
Queens           24834
Bronx             9007
Staten Island     3901
Name: BORO, dtype: int64
American                    18873
Chinese                      9026
Coffee/Tea                   7358
Pizza                        6078
Bakery Products/Desserts     3790
                            ...  
Czech                           9
Chilean                         8
Haute Cuisine                   5
Basque                          4
Chimichurri                     2
Name: CUISINE DESCRIPTION, Length: 89, dtype: int64
2022-12-01    367
2023-03-01    361
2022-11-28    356
2023-01-25    355
2023-02-28    352
             ... 
2019-01-09      1
2016-12-18      1
2017-02-02      1
2018-01-31      1
2018-12-12      1
Name: INSPECTION DATE, Length:

# **Validation Split**

In [16]:
# Defining the target variable
X= rdf.drop('SCORE', axis=1)
y= rdf['SCORE']

In [17]:
# Split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **Data Preprocessing**

In [18]:
#OneHotEncoder
ohe = OneHotEncoder(sparse= False, handle_unknown='ignore')
#Standard Scaler
scaler = StandardScaler()
#Imputer
imputer= SimpleImputer(strategy='mean')
imputer
#Instantiate the selectors 
num_selector = make_column_selector(dtype_include = 'number')
cat_selector = make_column_selector(dtype_include = 'object')
#Forming pipelines
num_pipe = make_pipeline(scaler, imputer)
cat_pipe = make_pipeline(ohe)
#Instantiating the random forest classifier as transformer 1
transformer1 = RandomForestClassifier()
transformer1

In [19]:
#Create Tuples
num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)

In [20]:
#Instantiate preprocessor Transformers
preprocessor = make_column_transformer(num_tuple, cat_tuple, remainder='passthrough')
preprocessor

In [21]:
#Creating the pipeline
pipe = Pipeline([('preprocessor', preprocessor), ('transformer1', transformer1)])
pipe

In [22]:
#Fit on the Train
pipe.fit(X_train, y_train)

TypeError: The DType <class 'numpy.dtype[datetime64]'> could not be promoted by <class 'numpy.dtype[float64]'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtype[float64]'>, <class 'numpy.dtype[float64]'>, <class 'numpy.dtype[datetime64]'>)

In [None]:
#Predictions
y_pred = pipe.predict(X_test)
y_pred

In [None]:
#Accuracy score
accuracy_score(y_test,y_pred)


- Load your dataset, replace 'your_data.csv' with your data file. data = pd.read_csv('your_data.csv') 
- Assuming your dataset has various features and 'GRADE' as the target variable. 
- Replace these with your actual feature and target column names. 
- Preprocess the data X = data.drop(columns=['GRADE'])  
- Features y = data['GRADE']  
- Target variable  
- Handle missing data (you may need more advanced methods) X.fillna(0, inplace=True)  
- Encode categorical variables (if needed) categorical_columns = X.select_dtypes(include=['object']).columns for col in categorical_columns:     le = LabelEncoder()     X[col] = le.fit_transform(X[col])  
- Split the data into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
- Feature scaling (normalize numerical features) scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) 
- Choose a classification model (Random Forest in this example) model = RandomForestClassifier(random_state=42) 
- Hyperparameter tuning using GridSearchCV (you may need more parameters) param_grid = {     'n_estimators': [100, 200],     'max_depth': [None, 10, 20],     'min_samples_split': [2, 5],     'min_samples_leaf': [1, 2] }  grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1) grid_search.fit(X_train, y_train)
- Get the best model from hyperparameter tuning best_model = grid_search.best_estimator_ 
- Train the best model best_model.fit(X_train, y_train)  # Make predictions on the test set y_pred = best_model.predict(X_test) 
- Evaluate the model accuracy = accuracy_score(y_test, y_pred) print(f'Accuracy: {accuracy:.2f}')  
- Print classification report for more detailed evaluation print(classification_report(y_test, y_pred)) 
- Use the trained model to make predictions on new data new_data = pd.DataFrame({     'Feature1': [value1],     'Feature2': [value2],   
- Add more features as needed })  
- Preprocess new data similarly to training data (handle missing values, encoding, scaling) new_data.fillna(0, inplace=True) new_data[categorical_columns] = le.transform(new_data[categorical_columns]) new_data = scaler.transform(new_data)  predicted_grade = best_model.predict(new_data) print(f'Predicted Grade: {predicted_grade[0]}')