In [None]:
# pandas is used for data handling
import pandas as pd

# numpy is used for numerical operations
import numpy as np

# sklearn tools for machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Read the restaurant spreadsheet into a DataFrame.
# NOTE(review): the file name has a space before ".xlsx" -- presumably this
# matches the file on disk; confirm before "fixing" it.
DATASET_FILE = "Dataset .xlsx"
df = pd.read_excel(DATASET_FILE)

# Preview the first 5 rows
df.head(5)


Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [None]:
# Rename columns for easier access and consistency.
# rename() ignores keys that are not present, so re-running this cell after the
# columns were already renamed is a harmless no-op.
df = df.rename(columns={'Has Table booking': 'Has_Table_Booking', 'Has Online delivery': 'Has_Online_Delivery'})

# Create new features for length of restaurant name and address.
# Cast to str first so non-string entries still yield a length, then use the
# vectorised .str.len() accessor instead of a per-row Python len() call.
df['Restaurant_Name_Length'] = df['Restaurant Name'].astype(str).str.len()
df['Address_Length'] = df['Address'].astype(str).str.len()

# Convert the Yes/No columns to 1/0.
# This cell overwrites the columns in `df`, so it must be safe to run twice:
# matching on both 'Yes' (raw value) and 1 (already-encoded value) keeps the
# result stable. A plain `1 if x == 'Yes' else 0` would turn every value into 0
# on the second run because 1 != 'Yes'.
for flag_column in ('Has_Table_Booking', 'Has_Online_Delivery'):
    df[flag_column] = df[flag_column].isin(['Yes', 1]).astype(int)

# Select input features (X).
# .copy() makes X an independent frame so later edits to X (or its train/test
# splits) never write back into, or warn about, `df`.
X = df[
    [
        'Price range',
        'Average Cost for two',
        'Votes',
        'Has_Table_Booking',
        'Has_Online_Delivery',
        'Restaurant_Name_Length',
        'Address_Length'
    ]
].copy()

# Select target variable (y)
y = df['Aggregate rating']

In [None]:
# Replace every missing feature value with 0 (returns a new frame)
X = X.fillna(value=0)


In [None]:
# Preview the first 5 rows of the feature matrix
X.head(n=5)


Unnamed: 0,Price range,Average Cost for two,Votes,Has_Table_Booking,Has_Online_Delivery,Restaurant_Name_Length,Address_Length
0,3,1100,314,Yes,No,16,71
1,3,1200,591,Yes,No,16,67
2,4,4000,270,Yes,No,22,56
3,4,1500,365,No,No,4,70
4,4,1500,229,Yes,No,11,64


In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Create Linear Regression model
lr_model = LinearRegression()

# 'Has_Table_Booking' and 'Has_Online_Delivery' are encoded as 0/1 in the
# feature-engineering cell, before the train/test split. They must NOT be
# re-encoded here: applying `1 if x == 'Yes' else 0` to values that are already
# 0/1 maps everything to 0 (1 != 'Yes'), which silently erases both features
# for this model and for every later model that reuses X_train / X_test.
# Instead, verify the expected encoding so stale kernel state fails loudly.
flag_columns = ['Has_Table_Booking', 'Has_Online_Delivery']
for split_frame in (X_train, X_test):
    assert split_frame[flag_columns].isin([0, 1]).all().all(), (
        "Flag columns must already be 0/1 encoded; re-run the feature-engineering cell"
    )

# Train the model
lr_model.fit(X_train, y_train)

# Predict ratings
lr_predictions = lr_model.predict(X_test)

In [None]:
# Score the Linear Regression predictions against the held-out targets.
# The metrics are computed in the order MAE, MSE, R2.
lr_mae, lr_mse, lr_r2 = (
    metric(y_test, lr_predictions)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report the three scores, one per line
print("Linear Regression Performance:")
for label, score in (("MAE:", lr_mae), ("MSE:", lr_mse), ("R2 Score:", lr_r2)):
    print(label, score)


Linear Regression Performance:
MAE: 1.0754548295330513
MSE: 1.6777850197076845
R2 Score: 0.2628714060482863


In [None]:
# Build the Decision Tree regressor (seeded for repeatable results) and train it
# in one step; fit() returns the fitted estimator itself.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict ratings for the held-out rows
dt_predictions = dt_model.predict(X_test)


In [None]:
# Score the Decision Tree predictions against the held-out targets.
# The metrics are computed in the order MAE, MSE, R2.
dt_mae, dt_mse, dt_r2 = (
    metric(y_test, dt_predictions)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report the three scores, one per line (leading newline separates the section)
print("\nDecision Tree Performance:")
for label, score in (("MAE:", dt_mae), ("MSE:", dt_mse), ("R2 Score:", dt_r2)):
    print(label, score)



Decision Tree Performance:
MAE: 0.3075876504447933
MSE: 0.2251098901098901
R2 Score: 0.9010988089461912


In [None]:
# Build a 100-tree Random Forest regressor (seeded for repeatable results) and
# train it in one step; fit() returns the fitted estimator itself.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict ratings for the held-out rows
rf_predictions = rf_model.predict(X_test)


In [None]:
# Score the Random Forest predictions against the held-out targets.
# The metrics are computed in the order MAE, MSE, R2.
rf_mae, rf_mse, rf_r2 = (
    metric(y_test, rf_predictions)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report the three scores, one per line (leading newline separates the section)
print("\nRandom Forest Performance:")
for label, score in (("MAE:", rf_mae), ("MSE:", rf_mse), ("R2 Score:", rf_r2)):
    print(label, score)



Random Forest Performance:
MAE: 0.2171758198151055
MSE: 0.11111923792317134
R2 Score: 0.9511801770494029


In [None]:
# Summarise the three models side by side: one row per model, one column per metric
model_comparison = pd.DataFrame(
    [
        ('Linear Regression', lr_mae, lr_mse, lr_r2),
        ('Decision Tree', dt_mae, dt_mse, dt_r2),
        ('Random Forest', rf_mae, rf_mse, rf_r2),
    ],
    columns=['Model', 'MAE', 'MSE', 'R2 Score'],
)

# Display the table as the cell's output
model_comparison


Unnamed: 0,Model,MAE,MSE,R2 Score
0,Linear Regression,1.075455,1.677785,0.262871
1,Decision Tree,0.307588,0.22511,0.901099
2,Random Forest,0.217176,0.111119,0.95118
