In [1]:
import pandas as pd

In [2]:
# Load the dataset
data = pd.read_csv('/content/Dataset .csv')

In [3]:
data.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


**Data Preprocessing:**

In [4]:
# Check for missing values
missing_values = data.isnull().sum()
print(missing_values)

Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64


In [5]:
# One-hot encoding for categorical variables
data_encoded = pd.get_dummies(data, columns=['Restaurant Name', 'City', 'Cuisines'])

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler
scaler = MinMaxScaler()

# Scale numerical features
numerical_features = ['Longitude', 'Latitude', 'Average Cost for two', 'Votes']
data_encoded[numerical_features] = scaler.fit_transform(data_encoded[numerical_features])

**Splitting the data:**

In [7]:
from sklearn.model_selection import train_test_split

# Splitting the dataset into features and target variable
X = data_encoded.drop(['Aggregate rating'], axis=1)  # Features
y = data_encoded['Aggregate rating']  # Target variable

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
print("Training set shape (X, y):", X_train.shape, y_train.shape)
print("Testing set shape (X, y):", X_test.shape, y_test.shape)

Training set shape (X, y): (7640, 9429) (7640,)
Testing set shape (X, y): (1911, 9429) (1911,)


**Model Selection and Training:**

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [9]:
# Initialize the regression models
linear_reg_model = LinearRegression()
decision_tree_model = DecisionTreeRegressor(random_state=42)
random_forest_model = RandomForestRegressor(random_state=42)

In [10]:
# Concatenate X_train and y_train along the columns (axis=1)
train_data = pd.concat([X_train, y_train], axis=1)

# Perform one-hot encoding for categorical variables based on available columns in train_data
train_data_encoded = pd.get_dummies(train_data)

# Separate the features and target variable from train_data_encoded
X_train_encoded = train_data_encoded.drop('Aggregate rating', axis=1)
y_train_encoded = train_data_encoded['Aggregate rating']

In [11]:
nan_indices = y_train_encoded.index[y_train_encoded.isna()]
print("Indices with NaN values in y_train_encoded:", nan_indices)

Indices with NaN values in y_train_encoded: Int64Index([], dtype='int64')


In [12]:
# Impute NaN values with the mean of y_train_encoded
mean_rating = y_train_encoded.mean()
y_train_encoded.fillna(mean_rating, inplace=True)

In [13]:
# Train the regression model
linear_reg_model.fit(X_train_encoded, y_train_encoded)

In [14]:
# Train the decision tree model
decision_tree_model.fit(X_train_encoded, y_train_encoded)

In [15]:
# Train the random forest model
random_forest_model.fit(X_train_encoded, y_train_encoded)

In [16]:
# Perform one-hot encoding on the testing set X_test
X_test_encoded = pd.get_dummies(X_test)

In [17]:
# Ensure that the columns in X_test_encoded match the columns in X_train_encoded
missing_cols = set(X_train_encoded.columns) - set(X_test_encoded.columns)
for col in missing_cols:
    X_test_encoded[col] = 0

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0
  X_test_encoded[col] = 0

In [18]:
# Reorder the columns in X_test_encoded to match the order in X_train_encoded
X_test_encoded = X_test_encoded[X_train_encoded.columns]

**Linear Regression Model Evaluation:**

In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [20]:
# Predictions on the training set
y_train_pred_linear = linear_reg_model.predict(X_train_encoded)

# Predictions on the testing set
y_test_pred_linear = linear_reg_model.predict(X_test_encoded)

In [21]:
# Evaluation metrics on the training set
mae_train_linear = mean_absolute_error(y_train_encoded, y_train_pred_linear)
mse_train_linear = mean_squared_error(y_train_encoded, y_train_pred_linear)
rmse_train_linear = mean_squared_error(y_train_encoded, y_train_pred_linear, squared=False)
r2_train_linear = r2_score(y_train_encoded, y_train_pred_linear)

In [22]:
# Evaluation metrics on the testing set
mae_test_linear = mean_absolute_error(y_test, y_test_pred_linear)
mse_test_linear = mean_squared_error(y_test, y_test_pred_linear)
rmse_test_linear = mean_squared_error(y_test, y_test_pred_linear, squared=False)
r2_test_linear = r2_score(y_test, y_test_pred_linear)

In [23]:
# Print evaluation metrics for linear regression model
print("Linear Regression Model - Training Set:")
print("MAE:", mae_train_linear)
print("MSE:", mse_train_linear)
print("RMSE:", rmse_train_linear)
print("R-squared:", r2_train_linear)

print("\nLinear Regression Model - Testing Set:")
print("MAE:", mae_test_linear)
print("MSE:", mse_test_linear)
print("RMSE:", rmse_test_linear)
print("R-squared:", r2_test_linear)

Linear Regression Model - Training Set:
MAE: 2.213132699014126e-05
MSE: 4.304150579741051e-07
RMSE: 0.0006560602548349542
R-squared: 0.9999998132464518

Linear Regression Model - Testing Set:
MAE: 458.0127278318252
MSE: 3254426.40963301
RMSE: 1804.0028851509662
R-squared: -1429819.1112023657


**Decision Tree Model Evaluation:**

In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [25]:
# Predictions on the training set
y_train_pred_dt = decision_tree_model.predict(X_train_encoded)

# Predictions on the testing set
y_test_pred_dt = decision_tree_model.predict(X_test_encoded)

In [26]:
# Evaluation metrics on the training set
mae_train_dt = mean_absolute_error(y_train_encoded, y_train_pred_dt)
mse_train_dt = mean_squared_error(y_train_encoded, y_train_pred_dt)
rmse_train_dt = mean_squared_error(y_train_encoded, y_train_pred_dt, squared=False)
r2_train_dt = r2_score(y_train_encoded, y_train_pred_dt)

In [27]:
# Evaluation metrics on the testing set
mae_test_dt = mean_absolute_error(y_test, y_test_pred_dt)
mse_test_dt = mean_squared_error(y_test, y_test_pred_dt)
rmse_test_dt = mean_squared_error(y_test, y_test_pred_dt, squared=False)
r2_test_dt = r2_score(y_test, y_test_pred_dt)


In [28]:
# Print evaluation metrics for decision tree regression model
print("Decision Tree Regression Model - Training Set:")
print("MAE:", mae_train_dt)
print("MSE:", mse_train_dt)
print("RMSE:", rmse_train_dt)
print("R-squared:", r2_train_dt)

print("\nDecision Tree Regression Model - Testing Set:")
print("MAE:", mae_test_dt)
print("MSE:", mse_test_dt)
print("RMSE:", rmse_test_dt)
print("R-squared:", r2_test_dt)

Decision Tree Regression Model - Training Set:
MAE: 2.9586571703361504e-17
MSE: 1.572042837956794e-32
RMSE: 1.253811324704317e-16
R-squared: 1.0

Decision Tree Regression Model - Testing Set:
MAE: 0.13799058084772373
MSE: 0.04783359497645211
RMSE: 0.21870892751886492
R-squared: 0.9789844883614522


**Random Forest Model Evaluation:**

In [29]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [30]:
# Predictions on the training set
y_train_pred_rf = random_forest_model.predict(X_train_encoded)

# Predictions on the testing set
y_test_pred_rf = random_forest_model.predict(X_test_encoded)

In [31]:
# Evaluation metrics on the training set
mae_train_rf = mean_absolute_error(y_train_encoded, y_train_pred_rf)
mse_train_rf = mean_squared_error(y_train_encoded, y_train_pred_rf)
rmse_train_rf = mean_squared_error(y_train_encoded, y_train_pred_rf, squared=False)
r2_train_rf = r2_score(y_train_encoded, y_train_pred_rf)

In [32]:
# Evaluation metrics on the testing set
mae_test_rf = mean_absolute_error(y_test, y_test_pred_rf)
mse_test_rf = mean_squared_error(y_test, y_test_pred_rf)
rmse_test_rf = mean_squared_error(y_test, y_test_pred_rf, squared=False)
r2_test_rf = r2_score(y_test, y_test_pred_rf)

In [33]:
# Print evaluation metrics for Random Forest regression model
print("Random Forest Regression Model - Training Set:")
print("MAE:", mae_train_rf)
print("MSE:", mse_train_rf)
print("RMSE:", rmse_train_rf)
print("R-squared:", r2_train_rf)

print("\nRandom Forest Regression Model - Testing Set:")
print("MAE:", mae_test_rf)
print("MSE:", mse_test_rf)
print("RMSE:", rmse_test_rf)
print("R-squared:", r2_test_rf)

Random Forest Regression Model - Training Set:
MAE: 0.040123821989529004
MSE: 0.003847287434554986
RMSE: 0.062026505903161964
R-squared: 0.9983306936735409

Random Forest Regression Model - Testing Set:
MAE: 0.10853584510727349
MSE: 0.027028985871271515
RMSE: 0.16440494478960027
R-squared: 0.9881249158162693


**Model Selection:**

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [35]:
# Create instances of the models
linear_reg_model = LinearRegression()
decision_tree_model = DecisionTreeRegressor(random_state=42)
random_forest_model = RandomForestRegressor(random_state=42)

In [36]:
# Train the linear regression model
linear_reg_model.fit(X_train_encoded, y_train_encoded)

In [37]:
#Train the decision tree model
decision_tree_model.fit(X_train_encoded, y_train_encoded)

In [38]:
#Train the random forest model
random_forest_model.fit(X_train_encoded, y_train_encoded)

In [39]:
# Predictions on the testing set
y_test_pred_linear = linear_reg_model.predict(X_test_encoded)
y_test_pred_dt = decision_tree_model.predict(X_test_encoded)
y_test_pred_rf = random_forest_model.predict(X_test_encoded)

In [40]:
# Evaluate models using Mean Squared Error (MSE)
mse_linear = mean_squared_error(y_test, y_test_pred_linear)
mse_dt = mean_squared_error(y_test, y_test_pred_dt)
mse_rf = mean_squared_error(y_test, y_test_pred_rf)

In [41]:
# Print MSE for each model
print("Linear Regression MSE:", mse_linear)
print("Decision Tree MSE:", mse_dt)
print("Random Forest MSE:", mse_rf)

Linear Regression MSE: 3254426.40963301
Decision Tree MSE: 0.04783359497645211
Random Forest MSE: 0.027028985871271515


In [42]:
# Model selection based on MSE
best_model = None
if mse_linear < mse_dt and mse_linear < mse_rf:
    best_model = linear_reg_model
    best_model_name = "Linear Regression"
elif mse_dt < mse_linear and mse_dt < mse_rf:
    best_model = decision_tree_model
    best_model_name = "Decision Tree"
else:
    best_model = random_forest_model
    best_model_name = "Random Forest"

print("Best Model:", best_model_name)

Best Model: Random Forest
