# Part1: Feature Engineering 
Data Cleaning and label/feature encoding

In [1]:
# Loading necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Loading the dataset
df_initial = pd.read_csv('zomato_df_final_data.csv')

# Looking into the shape of the dataset
print("The shape of the dataset:")
df_initial.shape

# Removing rows with NA/NaN values
df_clean = df_initial.dropna()

# Looking into the cleaned dataset
print("The shape of the cleaned dataset")
df_clean.shape


# Saving the cleaned dataset
df = df_clean

# Creating a mapping for rating_text to classes
rating_mapping = {
    'Poor': 1,
    'Average': 1,
    'Good': 2,
    'Very Good': 2,
    'Excellent': 2
}

# Mapping the rating_text to a new column 'rating_class'
df['rating_class'] = df['rating_text'].map(rating_mapping)

# Display of the unique values in the new column
print("Unique values in rating_class:", df['rating_class'].unique())

# Encoding categorical features(one-hot encoding)
df = df.join(df['cuisine'].str.join('|').str.get_dummies(), rsuffix='_cuisine')
df = df.join(df['type'].str.join('|').str.get_dummies(), rsuffix='_type')

# Dropping the original 'cuisine' and 'type' columns as binary encoding is performed
df.drop(['cuisine', 'type'], axis=1, inplace=True)

# Display of the shape of the prepared dataset
print("Shape of the prepared dataset:", df.shape)

# Display the first few rows of the prepared dataset
print("Prepared dataset:\n", df.head())

# Save the prepared dataset if needed
df.to_csv('zomato_df_prepared.csv', index=False)



The shape of the dataset:
The shape of the cleaned dataset
Unique values in rating_class: [2 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rating_class'] = df['rating_text'].map(rating_mapping)


Shape of the prepared dataset: (6949, 104)
Prepared dataset:
                                              address   cost        lat  \
0                      371A Pitt Street, CBD, Sydney   50.0 -33.876059   
1      Shop 7A, 2 Huntley Street, Alexandria, Sydney   80.0 -33.910999   
2   Level G, The Darling at the Star, 80 Pyrmont ...  120.0 -33.867971   
3   Sydney Opera House, Bennelong Point, Circular...  270.0 -33.856784   
4              20 Campbell Street, Chinatown, Sydney   55.0 -33.879035   

                                                link         lng  \
0    https://www.zomato.com/sydney/sydney-madang-cbd  151.207605   
1  https://www.zomato.com/sydney/the-grounds-of-a...  151.193793   
2        https://www.zomato.com/sydney/sokyo-pyrmont  151.195210   
3  https://www.zomato.com/sydney/bennelong-restau...  151.215297   
4  https://www.zomato.com/sydney/chat-thai-chinatown  151.206409   

          phone  rating_number rating_text  \
0  02 8318 0406            4.0   Very 

# Part 2: Linear Regression Model

model_regression_1: prediction of restaurant rating (rating number) from other features (variables) in the dataset.

In [2]:
# Loading required libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Removing non-numeric columns that can't be used in regression
df_1 = df.select_dtypes(include=['float64', 'int64', 'uint8'])

# Defining the feature columns and target variable
# Features: all columns except rating_number
X = df_1.drop(columns=['rating_number']) 
# Target Variable: rating_number
y = df_1['rating_number']  

# Splitting the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initializing the Linear Regression model
model_regression_1 = LinearRegression()

# Training the model on the training set
model_regression_1.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model_regression_1.predict(X_test)

# Evaluating & printing the model(MSE) 
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 0.07149712726488254


The Mean Squared Error (MSE): 0.07149712726488254 suggest low squared error. 
It means there are relatively small number of error and the model performance is good.

model_regression_2: Gradient Descent as the optimisation function

In [3]:
# Loading required libraries
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

# Initializing the SGDRegressor 
model_regression_2 = SGDRegressor(max_iter=1000, tol=1e-3, random_state=0)

# Training the model on the training set
model_regression_2.fit(X_train, y_train)

# Predicting on the test set
y_pred_gd = model_regression_2.predict(X_test)

# Calculating and printing the Mean Squared Error (MSE) for the test set
mse_gd = mean_squared_error(y_test, y_pred_gd)
print(f"Mean Squared Error (MSE) using Gradient Descent: {mse_gd}")


Mean Squared Error (MSE) using Gradient Descent: 2.4429505722639595e+27


High MSE suggest that convergance didn't happen properly, possibly because scales of the features or outliers in features affecting optimisation.

# Part 3: Binary Classification by Building a Logistic Regression Model

Binary Classification of Text Rating

In [4]:
# Loading required libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# The binary Classification is already made and saved in rating_class column
print(df_1['rating_class'].head())

# Defining features (X) and target variable (y)
X = df_1.drop(columns=['rating_class'])  
y = df_1['rating_class']  

# Splitting the data into 80% training and 20% test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialising and training of the logistic regression model
model_classification_3 = LogisticRegression()

# Training the model
model_classification_3.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model_classification_3.predict(X_test)

# Model Evaluation
# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report (precision, recall, F1-score)
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)




0    2
1    2
2    2
3    2
4    2
Name: rating_class, dtype: int64
Accuracy: 0.95
Confusion Matrix:
[[883  29]
 [ 37 441]]
Classification Report:
              precision    recall  f1-score   support

           1       0.96      0.97      0.96       912
           2       0.94      0.92      0.93       478

    accuracy                           0.95      1390
   macro avg       0.95      0.95      0.95      1390
weighted avg       0.95      0.95      0.95      1390



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Evaluation Report:
The accuracy of 0.95 tells that the model predicted the rating_class accurate 95% of time.

# Confusion Matrix:
[[883  29]
[ 37 441]]
Confusion matrix indicated that out of 912 obs for class 1, 883 prediction were correctly classified and 29 samples were misclassified.
Confusion matrix indicated that out of 478 obs for class 1, 441 prediction were correctly classified and 37 samples were classified incorrectly.

# Classification Report:

*Class 1 (912 samples):
Precision (0.96): Out of all the instances predicted as class 1, 96% were actually class 1.
Recall (0.97): The model correctly identified 97% of all actual class 1 instances.
F1-Score (0.96): Strong balance between precision and recall.

*Class 2 (478 samples):
Precision (0.94): Out of all the instances predicted as class 2, 94% were actually class 2.
Recall (0.92): The model correctly identified 92% of all actual class 2 instances.
F1-Score (0.93): Strong balance between precision and recall.

Overall, The model is performing very well with high accuracy (0.95) and strong precision, recall, and F1-scores  for both instances. Also the performance of class 1 is just a touch better than class 2. 



# Binary Classification with Decision Tree

In [5]:
# Loading Necessary Libraries
from sklearn.tree import DecisionTreeClassifier

# Splitting the data into feature and target
X = df_1.drop(columns=['rating_class'])
y = df_1["rating_class"]

# Splitting the data into training(80%) and test(20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 40)

# Building and Training the Decision Tree Classifier
# Initialising the Decision Tree Classifier
model_classification_4 = DecisionTreeClassifier(random_state=40)

# Training the model
model_classification_4.fit(X_train, y_train)

# Predicting on the test data
y_pred = model_classification_4.predict(X_test)

# Model Evaluation
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report (Precision, Recall, F1-Score)
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 1.00
Confusion Matrix:
[[923   0]
 [  0 467]]
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       923
           2       1.00      1.00      1.00       467

    accuracy                           1.00      1390
   macro avg       1.00      1.00      1.00      1390
weighted avg       1.00      1.00      1.00      1390



The decsion tree classifier shows 100% accuracy