# DATA SCIENCE TECHNOLOGY AND SYSTEM
# Assignment – 1
### Predictive Modelling of Eating-Out problem
Student name: Kay Huynh
Student ID: u3245926

### Part B – Predictive Modelling

#### I. Feature Engineering:

In [None]:
# Import packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

##### 1. Perform data cleaning to remove/impute any records that are useless in the predictive task (such as NA, NaN, etc.)

In [None]:
# Read data
# The path is built with os.path.join: the original literal 'data\zomato...'
# depended on the invalid "\z" escape sequence and on the Windows separator.
data = pd.read_csv(os.path.join('data', 'zomato_df_final_data.csv'))
data.head()

In [None]:
# Drop the columns that are treated as unnecessary for the modelling task
unused_columns = ['link', 'color', 'phone', 'cuisine_color', 'address', 'lat', 'lng']
data = data.drop(columns=unused_columns)

In [None]:
# Cuisine column processing
# Split the comma-separated text and explode it so every row holds one cuisine.
data['cuisine'] = data['cuisine'].str.split(', ')
restaurant_df = data.explode("cuisine").reset_index(drop=True)
# Strip the leftover list punctuation from each value.
# - regex=False: '[' and ']' are regex metacharacters and must be matched literally.
# - No astype(str) beforehand: it would turn missing values into the literal
#   string 'nan' and hide them from the later missing-value checks/imputation.
for token in ('[', "'", ']'):
    restaurant_df['cuisine'] = restaurant_df['cuisine'].str.replace(token, '', regex=False)

In [None]:
# type column processing
# Split the comma-separated text and explode it so every row holds one type.
restaurant_df['type'] = restaurant_df['type'].str.split(', ')
restaurant_df = restaurant_df.explode("type").reset_index(drop=True)
# Strip the leftover list punctuation from each value.
# - regex=False: '[' and ']' are regex metacharacters and must be matched literally.
# - No astype(str) beforehand: it would turn missing values into the literal
#   string 'nan' and hide them from the later missing-value checks/imputation.
for token in ('[', "'", ']'):
    restaurant_df['type'] = restaurant_df['type'].str.replace(token, '', regex=False)

##### Exploratory data analysis

In [None]:
# data shape
print(restaurant_df.shape)
# column names
print(restaurant_df.columns)

In [None]:
# summary of dataset
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
restaurant_df.info()

In [None]:
# summary of missing value in dataset
count_missing_df = restaurant_df.isna().sum()
count_missing_df

In [None]:
# Keep only the rows whose target variable (rating_number) is present.
# .copy() makes the filtered result an independent frame, so the column
# assignments done later do not operate on a slice of the previous frame
# (which triggers pandas' SettingWithCopyWarning).
restaurant_df = restaurant_df[restaurant_df['rating_number'].notna()].copy()
print(restaurant_df.shape)

In [None]:
# Explore categorical variables
# types of variables
# categorical variables
categorical = [var for var in restaurant_df.columns if restaurant_df[var].dtype=='O']
print("There are {} categorical variables\n".format(len(categorical)))
print("The categorical variables are: ", categorical)
restaurant_df[categorical].head()

In [None]:
# get the frequency counts of the categorical variables
for var in categorical:
    print(restaurant_df[var].value_counts())

In [None]:
# get the percentages in each of the categorical variables
for var in categorical:
    print(restaurant_df[var].value_counts() / restaurant_df.shape[0])

In [None]:
# a function to explore each of the categorical variables
def explore_categorical(df, var):
    """Print a short profile of the column ``var`` of ``df``.

    Three sections are printed in order, each under its own banner:
    the number of missing values, the distinct labels, and the
    frequency of each label. Nothing is returned.
    """
    column = df[var]
    sections = (
        ('********** missing values **********', column.isnull().sum()),
        ('********** Labels **********', column.unique()),
        ('********** frequency **********', column.value_counts()),
    )
    for banner, summary in sections:
        print(banner)
        print(summary)

In [None]:
explore_categorical(restaurant_df, 'cuisine')

In [None]:
explore_categorical(restaurant_df, 'rating_text')

In [None]:
explore_categorical(restaurant_df, 'subzone')

In [None]:
explore_categorical(restaurant_df, 'type')

In [None]:
explore_categorical(restaurant_df, 'title')

##### Explore Numerical Variables

In [None]:
# Find numerical variables
numericals = [var for var in restaurant_df.columns if restaurant_df[var].dtype != 'O']
print('There are {} numerical variables\n'.format(len(numericals)))
print('The numerical variables are :', numericals)
restaurant_df[numericals].head()

In [None]:
# Explore problems within numerical variables
# Check missing values in numerical variables
restaurant_df[numericals].isnull().sum()

In [None]:
# view summary statistics in numerical variables, rounded to 2 decimal places
# (the 2 belongs inside round(); outside it, the table was rounded to whole
# numbers and a literal "2" was printed after it)
print(round(restaurant_df[numericals].describe(), 2))

In [None]:
# Box plots of the numeric columns, one panel each on a 2x2 grid, to visualise outliers
plt.figure(figsize=(15, 10))
boxplot_panels = [('cost', 'Cost'), ('rating_number', 'Rating'), ('votes', 'Votes')]
for panel_position, (column_name, axis_label) in enumerate(boxplot_panels, start=1):
    plt.subplot(2, 2, panel_position)
    fig = restaurant_df.boxplot(column=column_name)
    fig.set_title('')
    fig.set_ylabel(axis_label)


In [None]:
# plot histograms to check the distribution of cost and votes
y = restaurant_df['rating_number']
plt.figure(figsize=(15,10))


plt.subplot(2, 1, 1)
fig = restaurant_df['cost'].hist(bins=50)
fig.set_xlabel('Cost')
# The y-axis of a histogram is the number of rows per bin, not the rating
fig.set_ylabel('Frequency')


plt.subplot(2, 1, 2)
fig = restaurant_df["votes"].hist(bins=50)
fig.set_xlabel('Votes')
fig.set_ylabel('Frequency')


In [None]:
# Find outliers in these variables
def find_outliers(variable, factor=3, print_summary=True, df=None):
    """Locate values outside ``[Q1 - factor*IQR, Q3 + factor*IQR]``.

    Parameters
    ----------
    variable : str
        Name of the column to inspect.
    factor : number, default 3
        Multiplier applied to the inter-quartile range.
    print_summary : bool, default True
        When true, print the two boundaries.
    df : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``restaurant_df``.

    Returns
    -------
    tuple
        ``(Lower_boundary, Upper_boundary, outliers)`` where ``outliers`` is
        a list of 0-based row positions (not index labels) of the values
        outside the boundaries. Missing values are never reported.
    """
    frame = restaurant_df if df is None else df
    values = frame[variable]

    # Each quartile is computed once and reused for the IQR and both boundaries
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    IQR = third_quartile - first_quartile
    Lower_boundary = first_quartile - (IQR * factor)
    Upper_boundary = third_quartile + (IQR * factor)

    # Vectorised scan; NaN compares False on both sides, so it is not flagged
    outside = (values < Lower_boundary) | (values > Upper_boundary)
    outliers = np.flatnonzero(outside.to_numpy()).tolist()

    if print_summary:
        print('{variable} outliers are values < {lowerboundary} or > {upperboundary}'.format(variable= variable, lowerboundary=Lower_boundary, upperboundary=Upper_boundary))
    return Lower_boundary, Upper_boundary, outliers

In [None]:

_,_,_ = find_outliers('cost')

In [None]:

_,_,_ = find_outliers('rating_number')

In [None]:

_,_,_ = find_outliers('votes')

#### Feature engineering

In [None]:
restaurant_df.dtypes

In [None]:
# display categorical variables
categorical = [var for var in restaurant_df.columns if restaurant_df[var].dtypes == 'O']
categorical

In [None]:
# display numerical variables
numericals = [var for var in restaurant_df.columns if restaurant_df[var].dtypes != 'O']
numericals

#### Engineering missing values in numerical variables

In [None]:
# display missing values
restaurant_df[numericals].isnull().sum()

In [None]:
# percentage of missing values in each variable
round(restaurant_df[numericals].isnull().mean(), 2)

In [None]:
# Impute the missing values with the median -- the median is robust to outliers.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on restaurant_df[col]: that chained in-place call works
# on an intermediate object and is not guaranteed to update the frame
# (it has no effect under pandas Copy-on-Write).
for col in numericals:
    col_median = restaurant_df[col].median()
    restaurant_df[col] = restaurant_df[col].fillna(col_median)

In [None]:
# check again for missing values in the numerical variables of restaurant_df
restaurant_df[numericals].isnull().sum()

#### Engineering missing values in categorical variables

In [None]:
round(restaurant_df[categorical].isnull().mean(), 2)

In [None]:
# impute missing categorical variables with the most frequent value (i.e., mode)
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on restaurant_df[col]: that chained in-place call works
# on an intermediate object and is not guaranteed to update the frame
# (it has no effect under pandas Copy-on-Write).
for col in categorical:
    col_modes = restaurant_df[col].mode()
    if col_modes.empty:
        # A column with no observed value has no mode to impute with
        continue
    restaurant_df[col] = restaurant_df[col].fillna(col_modes[0])

In [None]:
# check missing values in categorical variables in data
restaurant_df[categorical].isnull().sum()

#### Engineering outliers in numerical variables

In [None]:
# Cap a column at a predefined maximum value
def max_value(df_temp, variable, top):
    """Return the values of ``df_temp[variable]`` with everything above ``top`` replaced by ``top``.

    The result is a NumPy array; ``df_temp`` itself is not modified.
    """
    column = df_temp[variable]
    exceeds_cap = column > top
    return np.where(exceeds_cap, top, column)

# Upper cap applied to each column that showed outliers
cols_with_outliers = {'cost': 160, 'votes': 377}
for col, cap in cols_with_outliers.items():
    restaurant_df[col] = max_value(restaurant_df, col, cap)

In [None]:
restaurant_df.cost.max()

In [None]:
restaurant_df.votes.max()

In [None]:
# we can also use seaborn library to plot elegant ones
df_custom = restaurant_df[['cost', 'votes']]
plt.figure(figsize=(15,10))
ax = sns.boxplot(data=df_custom, orient="h", palette="Set2")


##### 2. Use proper label/feature encoding for each feature/column you consider making the data ready for the modelling step

#### Encoding the categorical variables

In [None]:
restaurant_df[categorical].head()

In [None]:
#Encode the input Variables
def Encode(restaurant_df, exclude=('cost', 'votes')):
    """Integer-encode every column of ``restaurant_df`` except those in ``exclude``.

    Each encoded column is overwritten, in place, with the codes returned by
    ``Series.factorize()``: distinct values are numbered in order of first
    appearance and missing values receive ``-1``.

    Parameters
    ----------
    restaurant_df : pandas.DataFrame
        Frame to encode. It is modified in place; pass a copy to keep the
        original untouched.
    exclude : iterable of str, default ('cost', 'votes')
        Column names that are left as they are.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.

    NOTE(review): with the default ``exclude`` every other column is
    factorized, including numeric ones such as ``rating_number``; their
    values become arbitrary category codes. List such columns in ``exclude``
    to keep their numeric values.
    """
    columns_to_encode = restaurant_df.columns[~restaurant_df.columns.isin(exclude)]
    for column in columns_to_encode:
        restaurant_df[column] = restaurant_df[column].factorize()[0]
    return restaurant_df

df_en = Encode(restaurant_df.copy())

In [None]:
df_en

In [None]:
#Get Correlation between different variables
corr = df_en.corr(method='kendall')
plt.figure(figsize=(15,8))
sns.heatmap(corr, annot=True)
df_en.columns

#### II. Regression:

##### 3. Build a linear regression model (model_regression_1) to predict the restaurants rating (numeric rating) from other features (columns) in the dataset. Please consider splitting the data into train (80%) and test (20%) sets.
[Hint: please use sklearn.model_selection.train_test_split and set random_state=0
while splitting]

##### Declare source and target variables

In [None]:
X = df_en.drop(['rating_number'], axis=1)
y = df_en['rating_number']

In [None]:
X

In [None]:
y

#### Split data into separate training and test set

In [None]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split

test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state=0)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

##### Model training using logistic regression

In [None]:
# train a logistic regression model on the training set
from sklearn.linear_model import LogisticRegression

# instantiate the model
model_LR = LogisticRegression(solver='liblinear', random_state=0)


# fit the model
model_LR.fit(X_train, y_train)

In [None]:
# Predict results
y_pred_LR = model_LR.predict(X_test)

y_pred_LR

In [None]:
# Check accuracy score
from sklearn.metrics import accuracy_score

print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_LR)))

##### 4. Build another linear regression model (model_regression_2) using Gradient Descent as the optimisation function

In [None]:
# Data for the linear regression model that uses Gradient Descent as the optimisation function
# The target is read from restaurant_df rather than df_en: Encode() factorized
# rating_number into arbitrary category codes. df_en was created from a copy of
# restaurant_df, so the rows of both frames are in the same order.
X = df_en[['rating_text','votes','cost','subzone','type']].values
y = restaurant_df['rating_number'].values

In [None]:
test_size = 0.2
X_train_GD, X_test_GD, y_train_GD, y_test_GD = train_test_split(X, y, test_size = test_size, random_state=0)

In [None]:
# Add a column of ones to the feature matrix for the bias term
X_train_GD_b = np.c_[np.ones((X_train_GD.shape[0], 1)), X_train_GD]

In [None]:
class LinearRegressionGD:
    """Linear regression fitted by batch gradient descent on the squared error.

    No intercept is added by the model: callers supply a column of ones in
    ``X`` if they want a bias term.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size of each gradient update.
    n_iterations : int, default 1000
        Number of gradient updates performed by ``fit``.
    standardize : bool, default True
        Standardize every non-constant column of ``X`` (zero mean, unit
        variance) using statistics learned in ``fit``, and apply the same
        transform in ``predict``. Without it, features on large scales make
        the fixed-step updates overflow instead of converging. Constant
        columns, such as a bias column of ones, are passed through unchanged.
    random_state : int or None, default None
        Seed for the random weight initialisation.

    Attributes
    ----------
    weights : numpy.ndarray or None
        Fitted coefficients, expressed on the standardized feature scale when
        ``standardize`` is true. ``None`` before ``fit`` is called.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, standardize=True,
                 random_state=None):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.standardize = standardize
        self.random_state = random_state
        self.weights = None
        # Per-column shift/scale learned in fit(); None means "no scaling"
        self._mean = None
        self._scale = None

    def _transform(self, X):
        """Return ``X`` as a float array with the learned scaling applied."""
        X = np.asarray(X, dtype=float)
        if self._mean is None:
            return X
        return (X - self._mean) / self._scale

    def fit(self, X, y):
        """Learn ``weights`` from the feature matrix ``X`` and the target ``y``."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        if self.standardize:
            # A column whose max equals its min is constant: leave it as is
            # (shift 0, scale 1) so a bias column of ones stays a bias column.
            constant = X.max(axis=0) == X.min(axis=0)
            self._mean = np.where(constant, 0.0, X.mean(axis=0))
            self._scale = np.where(constant, 1.0, X.std(axis=0))
        else:
            self._mean = None
            self._scale = None
        X = self._transform(X)

        # Initialize weights
        rng = np.random.default_rng(self.random_state)
        self.weights = rng.standard_normal(X.shape[1])

        # Perform gradient descent
        for _ in range(self.n_iterations):
            # Compute predictions
            predictions = np.dot(X, self.weights)

            # Compute errors
            errors = predictions - y

            # Update weights using gradients
            gradient_weights = (1 / len(X)) * np.dot(X.T, errors)

            self.weights -= self.learning_rate * gradient_weights

    def predict(self, X):
        """Return the predicted target for every row of ``X``."""
        return np.dot(self._transform(X), self.weights)


In [None]:
# Create a LinearRegressionGD instance
model = LinearRegressionGD(learning_rate=0.01, n_iterations=1000)

# Fit the model
model.fit(X_train_GD_b, y_train_GD)


In [None]:
# Generate predictions on the test data
# The bias column is sized from, and prediction is run on, the gradient-descent
# test split (X_test_GD); the previous code referenced an undefined X_test_b.
X_test_GD_b = np.c_[np.ones((X_test_GD.shape[0], 1)), X_test_GD]
y_pred_GDLR = model.predict(X_test_GD_b)

In [None]:
y_pred_GDLR

In [None]:
X_train_GD_b

In [None]:
y_train_GD

In [None]:
X_test_GD_b

In [None]:
y_pred_GDLR

In [None]:
# Visualize actual vs predicted values against feature column 0 (rating_text)
# Predictions depend on all five features, so they are drawn as points: joining
# them with a line in row order only produces a zig-zag across the plot.
plt.scatter(X_test_GD[:, 0], y_test_GD, label="Original Data")
plt.scatter(X_test_GD[:, 0], y_pred_GDLR, color='r', s=10, label="Predicted")
plt.xlabel("rating_text (encoded)")
plt.ylabel("rating_number")
plt.legend()
plt.show()

In [None]:
# Visualize actual vs predicted values against feature column 1 (votes)
# Predictions depend on all five features, so they are drawn as points: joining
# them with a line in row order only produces a zig-zag across the plot.
plt.scatter(X_test_GD[:, 1], y_test_GD, label="Original Data")
plt.scatter(X_test_GD[:, 1], y_pred_GDLR, color='r', s=10, label="Predicted")
plt.xlabel("votes")
plt.ylabel("rating_number")
plt.legend()
plt.show()

In [None]:
# Visualize actual vs predicted values against feature column 2 (cost)
# Predictions depend on all five features, so they are drawn as points: joining
# them with a line in row order only produces a zig-zag across the plot.
plt.scatter(X_test_GD[:, 2], y_test_GD, label="Original Data")
plt.scatter(X_test_GD[:, 2], y_pred_GDLR, color='r', s=10, label="Predicted")
plt.xlabel("cost")
plt.ylabel("rating_number")
plt.legend()
plt.show()

In [None]:
# Visualize actual vs predicted values against feature column 3 (subzone)
# Predictions depend on all five features, so they are drawn as points: joining
# them with a line in row order only produces a zig-zag across the plot.
plt.scatter(X_test_GD[:, 3], y_test_GD, label="Original Data")
plt.scatter(X_test_GD[:, 3], y_pred_GDLR, color='r', s=10, label="Predicted")
plt.xlabel("subzone (encoded)")
plt.ylabel("rating_number")
plt.legend()
plt.show()

##### 5. Report the mean square error (MSE) on the test data for both models.

In [None]:
from sklearn.metrics import mean_squared_error
# Calculate the Mean Squared Error (MSE) between predicted and actual values for Regression model
mse_LR = mean_squared_error(y_test, y_pred_LR)
mse_LR


In [None]:
mse_GDLR = mean_squared_error(y_test_GD, y_pred_GDLR)
mse_GDLR

#### III. Classification:

##### 6. Simplify the problem into binary classifications where class 1 contains ‘Poor’ and ‘Average’ records while class 2 contains ‘Good’, ‘Very Good’ and ‘Excellent’ records

In [None]:
# Define a mapping for the ratings
rating_mapping = {
    'Poor': 1,
    'Average': 1,
    'Good': 2,
    'Very Good': 2,
    'Excellent': 2
}

In [None]:
# Create a new column 'binary_rating' based on the mapping
restaurant_df['binary_rating'] = restaurant_df['rating_text'].map(rating_mapping)

In [None]:
restaurant_df

##### 7. Build a logistic regression model (model_classification_3) for the simplified data, where training data is 80% and the test data is 20%.
[Hint: please use sklearn.model_selection.train_test_split and set random_state=0
while splitting]

In [None]:
# Encode the input variables again, now that restaurant_df contains binary_rating.
# Encode() is already defined in the "Encoding the categorical variables"
# section; the identical second definition that used to sit here was redundant
# and would let the two copies drift apart.
df_en = Encode(restaurant_df.copy())

In [None]:
# binary_rating is produced directly from rating_text (see rating_mapping), so
# leaving rating_text among the features would hand the label to the model.
# rating_number is dropped for the same reason.
# NOTE(review): this assumes rating_text is a banding of rating_number -- confirm.
X = df_en.drop(['binary_rating', 'rating_text', 'rating_number'], axis=1)
y = df_en['binary_rating']

In [None]:
X 

In [None]:
y

In [None]:
# split X and y into training and testing sets
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state=0)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
# instantiate the model
model_classification_3 = LogisticRegression(solver='liblinear', random_state=0)


# fit the model
model_classification_3.fit(X_train, y_train)

In [None]:
# Make predictions on the test data
y_pred_3 = model_classification_3.predict(X_test)

In [None]:
# Evaluate the model's performance
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred_3)
classification_rep = classification_report(y_test, y_pred_3)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

##### 8. Use the confusion matrix to report the results of using the classification model on the test data.

In [None]:
# Print the Confusion Matrix and slice it into four pieces

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred_3)

print('Confusion matrix\n\n', cm)

# scikit-learn puts the actual classes on the rows and the predicted classes on
# the columns, with labels in sorted order. For two classes, ravel() therefore
# yields tn, fp, fn, tp, with the second (larger) label as the positive class.
tn, fp, fn, tp = cm.ravel()

print('\nTrue Positives(TP) = ', tp)

print('\nTrue Negatives(TN) = ', tn)

print('\nFalse Positives(FP) = ', fp)

print('\nFalse Negatives(FN) = ', fn)

##### 9. Draw your conclusions and observations about the performance of the model relevant to the classes’ distributions.

In [None]:
# visualize confusion matrix with seaborn heatmap
# Rows of cm are the actual classes and columns the predicted classes, both in
# sorted label order (0 first, then 1), so the axis names follow that layout.
cm_matrix = pd.DataFrame(data=cm, columns=['Predicted Negative:0', 'Predicted Positive:1'],
                                 index=['Actual Negative:0', 'Actual Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
# Normalise each row by its total, i.e. by the number of samples of that actual class
cm_normalised = cm.astype('float32') / cm.sum(axis=1)[:, np.newaxis]

# visualize confusion matrix with seaborn heatmap
# Rows of cm are the actual classes and columns the predicted classes, both in
# sorted label order (0 first, then 1), so the axis names follow that layout.
cm_matrix = pd.DataFrame(data=cm_normalised, columns=['Predicted Negative:0', 'Predicted Positive:1'],
                                 index=['Actual Negative:0', 'Actual Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='.2f', cmap='YlGnBu')

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_3))

##### Bonus: Repeat the previous classification task using three other models of your choice and report the performance.

##### Decision Tree

In [None]:
# Decision Tree classification model
# A classifier is required here: a regressor returns continuous values, which
# accuracy_score and classification_report cannot compare with class labels.
# Import package
from sklearn.tree import DecisionTreeClassifier

# Build model (random_state fixed so the run is reproducible)
model_DecisionTree = DecisionTreeClassifier(min_samples_leaf=.0001, random_state=0)

# Fit model on train data
model_DecisionTree.fit(X_train,y_train)

# Predict test data
y_pred_DT = model_DecisionTree.predict(X_test)

# Print accuracy score and report for the decision tree's own predictions
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_DT)))
print("Classification Report:\n", classification_report(y_test, y_pred_DT))

##### Gaussian Naive Bayes (GaussianNB)

In [None]:
# Gaussian Naive Bayes classifier trained and scored on the current train/test split
from sklearn.naive_bayes import GaussianNB

# Build the model and fit it on the training data
model_GNB = GaussianNB()
model_GNB.fit(X_train, y_train)

# Predict the test data
y_pred_GNB = model_GNB.predict(X_test)

# Report accuracy and the per-class metrics
accuracy_GNB = accuracy_score(y_test, y_pred_GNB)
report_GNB = classification_report(y_test, y_pred_GNB)
print('Model accuracy score: {0:0.4f}'. format(accuracy_GNB))
print("Classification Report:\n", report_GNB)

##### MLPClassifier

In [None]:
# Multi-layer perceptron classifier trained and scored on the current train/test split
from sklearn.neural_network import MLPClassifier

# Build the model and fit it on the training data
model_MLP = MLPClassifier(max_iter=500, random_state=1)
model_MLP.fit(X_train, y_train)

# Predict the test data
y_pred_MLP = model_MLP.predict(X_test)

# Report accuracy and the per-class metrics
accuracy_MLP = accuracy_score(y_test, y_pred_MLP)
report_MLP = classification_report(y_test, y_pred_MLP)
print('Model accuracy score: {0:0.4f}'. format(accuracy_MLP))
print("Classification Report:\n", report_MLP)

In [None]:
import joblib

# Save the trained models, one file per model, in the working directory.
# The gradient-descent model is not saved.
models_to_save = {
    'model_LR.pkl': model_LR,
    'model_DecisionTree.pkl': model_DecisionTree,
    'model_GNB.pkl': model_GNB,
    'model_MLP.pkl': model_MLP,
    'model_classification_3.pkl': model_classification_3,
}
for file_name, trained_model in models_to_save.items():
    joblib.dump(trained_model, file_name)