<a href="https://colab.research.google.com/github/MichelleThuo/MLInternshipTasks/blob/main/Task1%3APredictRestaurantRatings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import necessary libraries

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Step 1: Data Preprocessing

In [12]:
# Load the dataset
# Dataset .csv is the actual file path of the dataset
data = pd.read_csv('Dataset .csv')

# Explore the dataset
# Display the first five rows of the dataset
print(data.head())
# Print summary information about the dataset (data types and non-null counts)
print(data.info())
# Generate descriptive statistics for numerical columns
print(data.describe())

# Identify the target variable and features
# Assuming 'Aggregate rating' is the column for ratings and the rest are features
target = 'Aggregate rating'
# Drop the target column to isolate the feature set
features = data.drop(columns=[target])

# Split features into numerical and categorical columns
# This will help apply different preprocessing techniques to each type
numerical_features = features.select_dtypes(include=['int64', 'float64']).columns
categorical_features = features.select_dtypes(include=['object', 'category']).columns

# Define preprocessing for numerical data (imputing missing values and scaling)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Replace missing values with the mean
    ('scaler', StandardScaler())  # Standardize numerical features to have mean=0 and variance=1
])

# Define preprocessing for categorical data (imputing missing values and encoding)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Replace missing values with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical variables
])

# Combine preprocessing steps into a single column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),  # Apply numerical transformations
        ('cat', categorical_transformer, categorical_features)  # Apply categorical transformations
    ]
)

   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3  Third Floor, Mega Fashion Hall, SM Megamall, O...   
4  Third Floor, Mega Atrium, SM Megamall, Ortigas...   

                                     Locality  \
0   Century City Mall, Poblacion, Makati City   
1  Little Tokyo, Legaspi Village, Makati City   
2  Edsa Shangri-La, Ortigas, Mandaluyong City   
3      SM 

# Step 2: Model Selection

In [13]:
# Selected the regression model
# Uncomment the model you want to use
model = LinearRegression()
# model = DecisionTreeRegressor(random_state=42)
# model = RandomForestRegressor(random_state=42)

# Create and compile the full pipeline including preprocessing and model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])  # Combine preprocessor and model into a single pipeline

# Step 3: Training and Evaluation

In [14]:
# Split the data into training and testing sets
# 80% training data, 20% testing data for evaluation
X_train, X_test, y_train, y_test = train_test_split(features, data[target], test_size=0.2, random_state=42)

# Fit the model to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model's performance using Mean Squared Error and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}') # Lower values are better
print(f'R-squared: {r2}') # Values closer to 1 are better

Mean Squared Error (MSE): 0.09178572062263937
R-squared: 0.9596742858037922


# Step 4: Feature Interpretation

In [15]:
# Interpret the model's results
# Feature importances for tree-based models like DecisionTreeRegressor or RandomForestRegressor
if hasattr(model, 'feature_importances_'):
    feature_importances = model.feature_importances_
    # Combine numerical and categorical feature names for interpretability
    feature_names = numerical_features.tolist() + list(pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_features))
    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Plot the most influential features
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title('Feature Importances')
    plt.gca().invert_yaxis()  # Reverse the y-axis for better visibility
    plt.show()

    print("Most influential features:")
    print(feature_importance_df.head(10))  # Display the top 10 features by importance
else:
    print("The selected model does not provide feature importances.")

# For LinearRegression, display coefficients to interpret the model
if isinstance(model, LinearRegression):
    coefficients = model.coef_
    # Combine numerical and categorical feature names for interpretability
    feature_names = numerical_features.tolist() + list(pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_features))
    coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    coef_df = coef_df.sort_values(by='Coefficient', key=abs, ascending=False)  # Sort by absolute value of coefficients

    print("Most influential features:")
    print(coef_df.head(10))  # Display the top 10 features by coefficient

The selected model does not provide feature importances.
Most influential features:
                                                 Feature  Coefficient
7801   Address_36, Food Court, 2nd Floor, Pacific Mal...     1.264945
17175                              Rating text_Not rated    -1.150447
17170                                 Rating color_White    -1.150447
10880  Address_K 11, Som Vihar Apartments, R K Puram,...     1.129207
9913    Address_E-586, Greater Kailash (GK) 2, New Delhi    -1.091802
7075   Address_2, Ground Floor, North West Avenue, Pu...    -1.047211
7719   Address_32, Defence Colony Market, Defence Col...    -1.043531
9919   Address_E-778, Market 2, Chittaranjan Park, Ne...     1.024175
10560  Address_Ground Floor, Shipra Mall, Indirapuram...     0.983575
8381   Address_67, 2nd Floor, Food Court, Moments Mal...    -0.982331


# Displaying a few actual vs. predicted ratings

In [16]:
comparison_df = pd.DataFrame({'Actual Rating': y_test, 'Predicted Rating': y_pred})
print(comparison_df.head(10))

      Actual Rating  Predicted Rating
4731            2.1          2.215404
1468            4.1          3.875280
9037            3.2          2.889494
7866            4.4          3.973292
5570            3.5          3.048742
5613            0.0          0.372515
7751            3.2          2.911395
1662            0.0          0.292812
8592            3.6          3.043831
2164            4.0          4.076226


# Limitations and Future Improvements
## 1. Handling Missing Values
While we used SimpleImputer to handle missing values, we should consider whether mean imputation is appropriate, especially for skewed distributions. Future work could explore more sophisticated imputation methods (e.g., KNN imputation) or model-based imputation.

## 2. Potential Overfitting
The chosen model may overfit if it is too complex, especially on datasets with high variance. Cross-validation techniques could be implemented to validate the model's performance across different subsets of data.

## 3. Feature Selection
The model might include irrelevant features that do not contribute significantly to predictions. Techniques like Recursive Feature Elimination (RFE) or regularization methods (Lasso, Ridge) could help select more relevant features.

## 4. Advanced Models
Explore more complex models such as Gradient Boosting or ensemble methods to improve prediction accuracy and robustness.

## 5. Model Interpretability
For models like Random Forest or Gradient Boosting, feature importances could help interpret results, but advanced techniques like SHAP or LIME could provide better insights into model predictions.