### Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error, max_error
from tabulate import tabulate
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import GridSearchCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer, MinMaxScaler
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline

### Load data from CSV file

In [2]:
restaurants_df = pd.read_csv("../Foursquare/final_restaurants_dataset_cleaned.csv")
coffeeshops_df = pd.read_csv("../Foursquare/final_coffeeshops_dataset_cleaned.csv")
print(restaurants_df.info())
print(coffeeshops_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1221 entries, 0 to 1220
Data columns (total 36 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Business ID                               1221 non-null   object 
 1   Name                                      1221 non-null   object 
 2   Latitude                                  1221 non-null   float64
 3   Longitude                                 1221 non-null   float64
 4   Category                                  1221 non-null   object 
 5   Rating                                    1221 non-null   object 
 6   Popularity                                1221 non-null   float64
 7   Google Place ID                           1221 non-null   object 
 8   Business Status                           1221 non-null   object 
 9   Distance (m)                              1221 non-null   float64
 10  Cluster                             

### Combine and convert generalCategory to numerical

In [3]:
combined_df = pd.concat([coffeeshops_df, restaurants_df], ignore_index=True)
combined_df["generalCategory"] = combined_df["generalCategory"].map({"coffee shop": 0, "restaurant": 1})

# Pre-processing

### Correlation Function 

In [4]:
# Function to remove highly correlated features
def remove_highly_correlated_features(X, threshold=0.9):
    """Removes highly correlated features from X."""
    corr_matrix = X.corr().abs()  # Compute absolute correlation matrix
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))  # Upper triangle

    # Find columns to drop
    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > threshold)]
    return X.drop(columns=to_drop, errors='ignore'), to_drop  # Drop and return removed features

results = []

### info and null values

In [5]:
combined_df.info()
print(combined_df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Data columns (total 36 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Business ID                               1910 non-null   object 
 1   Name                                      1910 non-null   object 
 2   Latitude                                  1910 non-null   float64
 3   Longitude                                 1910 non-null   float64
 4   Category                                  1910 non-null   object 
 5   Rating                                    1910 non-null   object 
 6   Popularity                                1910 non-null   float64
 7   Google Place ID                           1909 non-null   object 
 8   Business Status                           1910 non-null   object 
 9   Distance (m)                              1910 non-null   float64
 10  Cluster                             

### Filling missing values with the median

In [6]:
combined_df["Avg Rating - Business Type"] = combined_df["Avg Rating - Business Type"].fillna(combined_df["Avg Rating - Business Type"].median())
combined_df["Avg Rating - Food & Dining"] = combined_df["Avg Rating - Food & Dining"].fillna(combined_df["Avg Rating - Food & Dining"].median())

### Drop irrelevent columns

In [7]:
irrelevant_columns = ['Business ID', 'Latitude'	, 'Longitude', 'Name', 'Category', 'Rating', 'Google Place ID', 'Business Status',
                'Cluster', 'Distance (m)', 'Avg Rating - Business Type', 'Competition - Business Type/Area', 'Competition - Food & Dining/Area',
                'Competition - Business Type/POI Density', 'Competition - Food & Dining/POI Density', 'Competition - Business Type/related POIs'
               , 'Competition - Food & Dining/POI Density']
combined_df.drop(columns=irrelevant_columns, inplace=True)

# SVR

In [8]:
transformed_df = combined_df.copy()

# Remove outliers in 'Popularity' using IQR method
Q1 = transformed_df['Popularity'].quantile(0.25)
Q3 = transformed_df['Popularity'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
transformed_df = transformed_df[(transformed_df['Popularity'] >= lower_bound) & (transformed_df['Popularity'] <= upper_bound)]

num_cols = transformed_df.select_dtypes(include=['number']).columns

# Apply PowerTransformer (normalize distribution)
power_transformer = PowerTransformer()
transformed_df[num_cols] = power_transformer.fit_transform(transformed_df[num_cols])

# Apply MinMaxScaler (scale to 0-1)
minmax_scaler = MinMaxScaler()
transformed_df[num_cols] = minmax_scaler.fit_transform(transformed_df[num_cols])


## SVR MODEL EXP 1 

In [9]:
# Define features (X) and target variable (y)
X = transformed_df.drop(columns=['Popularity'])
y = transformed_df['Popularity']

print(X.columns)
# Apply function and capture removed features
X, removed_corr_features = remove_highly_correlated_features(X, threshold=0.9)
print(f"Removed highly correlated features: {removed_corr_features}")
    
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grid for tuning
param_grid = {
    'svr__regressor__C': [0.01, 0.1, 1, 10],
    'svr__regressor__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 2],
    'svr__regressor__epsilon': [0.001, 0.01, 0.05, 0.1],
    'svr__regressor__kernel': ['rbf', 'sigmoid']
}

# Check for NaNs, infinite values, or constant features
def check_nan_inf(X, y):
    if X.isnull().sum().sum() > 0 or np.isnan(y).sum() > 0:
        raise ValueError("NaNs detected in the data. Please handle missing values.")
    if np.isinf(X).sum().sum() > 0 or np.isinf(y).sum() > 0:
        raise ValueError("Infinite values detected in the data. Check transformations and scaling.")
    if (X.var() == 0).any():
        raise ValueError("Constant features detected. Consider dropping them.")


# Set up the pipeline
model = Pipeline([
    ('scaler', MinMaxScaler()),
    ('svr', TransformedTargetRegressor(
        regressor=SVR(),
        transformer=PowerTransformer()
    ))
])

# Replace negative or zero values to avoid PowerTransformer issues
X[X <= 0] = 1e-6

# Check for NaNs or Infs before splitting
check_nan_inf(X, y)

grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1, scoring='r2')

grid_search.fit(X_train, y_train)

# Check for NaNs in grid search results
if np.isnan(grid_search.cv_results_['mean_test_score']).any():
    print("Warning: NaN values found in GridSearchCV results. Check your data and parameter grid.")

# Print best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best score (MSE): ", -grid_search.best_score_)

# Test set predictions using the best estimator
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Predictions on test data completed with the best model.")

# Calculate and print evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
medae = median_absolute_error(y_test, y_pred)
max_err = max_error(y_test, y_pred)
mad = np.mean(np.abs(y_test - np.mean(y_test)))

# Print results in table format
print(f"{'Metric':<15} │ {'Value':>10}")
print("-" * 35)
print(f"{'RMSE':<15} │ {rmse:>10.4f}")
print(f"{'MSE':<15} │ {mse:>10.4f}")
print(f"{'MAE':<15} │ {mae:>10.4f}")
print(f"{'R²':<15} │ {r2:>10.4f}")
print(f"{'MedAE':<15} │ {medae:>10.4f}")
print(f"{'Max Error':<15} │ {max_err:>10.4f}")
print(f"{'MAD':<15} │ {mad:>10.4f}")

print("Model evaluation completed!")


Index(['generalCategory', 'Religious Institutions', 'Coffee Shops',
       'Food & Dining', 'Restaurants', 'Home & Construction Services',
       'Entertainment & Recreation', 'Retail & Shopping', 'Finance & Services',
       'Education', 'Health', 'Public & Government Services',
       'Hotels & Hospitality', 'Transportation & Travel', 'Beauty & Wellness',
       'POI Density', 'Avg Rating - Food & Dining',
       'Competition - Food & Dining/related POIs', 'Population Within 1km'],
      dtype='object')
Removed highly correlated features: ['Restaurants']


Fitting 5 folds for each of 192 candidates, totalling 960 fits
Best parameters found:  {'svr__regressor__C': 0.1, 'svr__regressor__epsilon': 0.001, 'svr__regressor__gamma': 2, 'svr__regressor__kernel': 'rbf'}
Best score (MSE):  -0.008647327612126698
Predictions on test data completed with the best model.
Metric          │      Value
-----------------------------------
RMSE            │     0.2839
MSE             │     0.0806
MAE             │     0.2349
R²              │     0.0331
MedAE           │     0.2160
Max Error       │     0.6501
MAD             │     0.2440
Model evaluation completed!


## SVR MODEL EXP 2

In [11]:
# Define features (X) and target variable (y)
X = transformed_df[['POI Density', 'Avg Rating - Food & Dining', 'Competition - Food & Dining/related POIs', 'Population Within 1km']]
y = transformed_df['Popularity']


print(X.columns)
# Apply function and capture removed features
X, removed_corr_features = remove_highly_correlated_features(X, threshold=0.9)
print(f"Removed highly correlated features: {removed_corr_features}")
    
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grid for tuning
param_grid = {
    'svr__regressor__C': [0.01, 0.1, 1, 10],
    'svr__regressor__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 2],
    'svr__regressor__epsilon': [0.001, 0.01, 0.05, 0.1],
    'svr__regressor__kernel': ['rbf', 'sigmoid']
}

# Check for NaNs, infinite values, or constant features
def check_nan_inf(X, y):
    if X.isnull().sum().sum() > 0 or np.isnan(y).sum() > 0:
        raise ValueError("NaNs detected in the data. Please handle missing values.")
    if np.isinf(X).sum().sum() > 0 or np.isinf(y).sum() > 0:
        raise ValueError("Infinite values detected in the data. Check transformations and scaling.")
    if (X.var() == 0).any():
        raise ValueError("Constant features detected. Consider dropping them.")


# Set up the pipeline
model = Pipeline([
    ('scaler', MinMaxScaler()),
    ('svr', TransformedTargetRegressor(
        regressor=SVR(),
        transformer=PowerTransformer()
    ))
])

# Replace negative or zero values to avoid PowerTransformer issues
X[X <= 0] = 1e-6

# Check for NaNs or Infs before splitting
check_nan_inf(X, y)

grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1, scoring='r2')

grid_search.fit(X_train, y_train)

# Check for NaNs in grid search results
if np.isnan(grid_search.cv_results_['mean_test_score']).any():
    print("Warning: NaN values found in GridSearchCV results. Check your data and parameter grid.")

# Print best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best score (MSE): ", -grid_search.best_score_)

# Test set predictions using the best estimator
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Predictions on test data completed with the best model.")

# Calculate and print evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
medae = median_absolute_error(y_test, y_pred)
max_err = max_error(y_test, y_pred)
mad = np.mean(np.abs(y_test - np.mean(y_test)))

# Print results in table format
print(f"{'Metric':<15} │ {'Value':>10}")
print("-" * 35)
print(f"{'RMSE':<15} │ {rmse:>10.4f}")
print(f"{'MSE':<15} │ {mse:>10.4f}")
print(f"{'MAE':<15} │ {mae:>10.4f}")
print(f"{'R²':<15} │ {r2:>10.4f}")
print(f"{'MedAE':<15} │ {medae:>10.4f}")
print(f"{'Max Error':<15} │ {max_err:>10.4f}")
print(f"{'MAD':<15} │ {mad:>10.4f}")

print("Model evaluation completed!")


Index(['POI Density', 'Avg Rating - Food & Dining',
       'Competition - Food & Dining/related POIs', 'Population Within 1km'],
      dtype='object')
Removed highly correlated features: []
Fitting 5 folds for each of 192 candidates, totalling 960 fits
Best parameters found:  {'svr__regressor__C': 0.1, 'svr__regressor__epsilon': 0.05, 'svr__regressor__gamma': 'scale', 'svr__regressor__kernel': 'rbf'}
Best score (MSE):  0.012372930180701314
Predictions on test data completed with the best model.
Metric          │      Value
-----------------------------------
RMSE            │     0.2941
MSE             │     0.0865
MAE             │     0.2443
R²              │    -0.0376
MedAE           │     0.2229
Max Error       │     0.7056
MAD             │     0.2440
Model evaluation completed!
