In [71]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the datasets
zomato_df = pd.read_csv('https://raw.githubusercontent.com/dsrscientist/dataset4/main/zomato.csv', encoding='latin1')
country_code_df = pd.read_excel('https://github.com/dsrscientist/dataset4/raw/main/Country-Code.xlsx')

# Merge datasets based on 'Country Code'
zomato_df_merged = pd.merge(zomato_df, country_code_df, on='Country Code', how='left')

# Check for missing values
print("Missing Values:")
print(zomato_df_merged.isnull().sum())

# Fill missing values in 'Cuisines' column with a placeholder value
zomato_df_merged['Cuisines'].fillna('Not Available', inplace=True)

# Convert categorical variables into numerical format using one-hot encoding
zomato_df_encoded = pd.get_dummies(zomato_df_merged, columns=['Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Rating color', 'Rating text'], drop_first=True)

# Split the 'Cuisines' column into separate binary columns using one-hot encoding
cuisines_encoded = zomato_df_encoded['Cuisines'].str.get_dummies(sep=', ')

# Concatenate the one-hot encoded features with the original dataset
zomato_df_encoded = pd.concat([zomato_df_encoded, cuisines_encoded], axis=1)

# Drop the original 'Cuisines' column
zomato_df_encoded.drop('Cuisines', axis=1, inplace=True)

# Verify the changes
print(zomato_df_encoded.head())

Missing Values:
Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
Country                 0
dtype: int64
   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302   

In [73]:
# Print column names of the DataFrame
print(zomato_df_encoded.columns)

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude',
       'Average Cost for two',
       ...
       'Teriyaki', 'Tex-Mex', 'Thai', 'Tibetan', 'Turkish', 'Turkish Pizza',
       'Vegetarian', 'Vietnamese', 'Western', 'World Cuisine'],
      dtype='object', length=174)


In [74]:
# Define features (X) and target variable (y)
X = zomato_df_encoded.drop(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Currency'], axis=1)
y = zomato_df_encoded['Aggregate rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [75]:
# Initialize Linear Regression model
linear_reg_model = LinearRegression()

In [78]:
# Print column names of the DataFrame
print(zomato_df_encoded.columns)

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude',
       'Average Cost for two',
       ...
       'Country_New Zealand', 'Country_Phillipines', 'Country_Qatar',
       'Country_Singapore', 'Country_South Africa', 'Country_Sri Lanka',
       'Country_Turkey', 'Country_UAE', 'Country_United Kingdom',
       'Country_United States'],
      dtype='object', length=187)


In [79]:
# Define features (X) and target variable (y)
X = zomato_df_encoded.drop(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Currency'], axis=1)
y = zomato_df_encoded['Aggregate rating']

In [80]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Linear Regression model
linear_reg_model = LinearRegression()

# Fit the model on the training data
linear_reg_model.fit(X_train, y_train)

In [81]:
# Predict the target variable on the testing data
y_pred = linear_reg_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

Mean Squared Error: 6.3354541835111265e-24
R-squared Score: 1.0


In [82]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeCV, LassoCV

In [83]:
k = 5

# Perform k-fold cross-validation for Linear Regression
cv_scores_linear = cross_val_score(LinearRegression(), X_train, y_train, cv=k, scoring='neg_mean_squared_error')
cv_scores_linear = -cv_scores_linear  # convert negative scores to positive


In [84]:
# Perform k-fold cross-validation for Ridge Regression
cv_scores_ridge = cross_val_score(RidgeCV(), X_train, y_train, cv=k, scoring='neg_mean_squared_error')
cv_scores_ridge = -cv_scores_ridge  # convert negative scores to positive


In [85]:
# Perform k-fold cross-validation for Lasso Regression
cv_scores_lasso = cross_val_score(LassoCV(), X_train, y_train, cv=k, scoring='neg_mean_squared_error')
cv_scores_lasso = -cv_scores_lasso  # convert negative scores to positive


In [86]:
print("Average Cross-Validation MSE (Linear Regression):", np.mean(cv_scores_linear))
print("Average Cross-Validation MSE (Ridge Regression):", np.mean(cv_scores_ridge))
print("Average Cross-Validation MSE (Lasso Regression):", np.mean(cv_scores_lasso))


Average Cross-Validation MSE (Linear Regression): 4.67339152391852e-24
Average Cross-Validation MSE (Ridge Regression): 2.014110849621547e-06
Average Cross-Validation MSE (Lasso Regression): 0.7158917892449347


In [87]:
# Calculate R-squared and RMSE on the test set for Linear Regression
y_pred_linear = linear_reg_model.predict(X_test)
r2_linear = r2_score(y_test, y_pred_linear)
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)

In [88]:
print("\nLinear Regression Test Set Metrics:")
print("R-squared Score:", r2_linear)
print("Root Mean Squared Error:", rmse_linear)



Linear Regression Test Set Metrics:
R-squared Score: 1.0
Root Mean Squared Error: 2.5170328133560607e-12


In [89]:
ridge_model = RidgeCV().fit(X_train, y_train)
lasso_model = LassoCV().fit(X_train, y_train)

In [90]:
print("\nOptimal Alpha (Ridge):", ridge_model.alpha_)
print("Optimal Alpha (Lasso):", lasso_model.alpha_)


Optimal Alpha (Ridge): 0.1
Optimal Alpha (Lasso): 1.213848110268016


In [91]:
ridge_model = RidgeCV(alphas=[ridge_model.alpha_]).fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
r2_ridge = r2_score(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)

In [92]:
print("\nRidge Regression Test Set Metrics:")
print("R-squared Score:", r2_ridge)
print("Root Mean Squared Error:", rmse_ridge)


Ridge Regression Test Set Metrics:
R-squared Score: 0.9999944550626075
Root Mean Squared Error: 0.0035525880495274045


In [93]:
lasso_model = LassoCV(alphas=[lasso_model.alpha_]).fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)
r2_lasso = r2_score(y_test, y_pred_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)

In [94]:
print("\nLasso Regression Test Set Metrics:")
print("R-squared Score:", r2_lasso)
print("Root Mean Squared Error:", rmse_lasso)


Lasso Regression Test Set Metrics:
R-squared Score: 0.6902208767580209
Root Mean Squared Error: 0.8396969973873064
