# **PYTHON NOTEBOOK FOR DATA PREPROCESSING ANALYSIS OF MODELS USED FOR THE RECOMMENDATION SYSTEM PRESENT IN PHASE 3**

In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
file_path = "Data.csv"
df = pd.read_csv(file_path)

**DATA PREPROCESSING**

In [3]:

df_cleaned = df.dropna(subset=['latitude', 'popular_times', 'longitude', 'rating'])

df_cleaned['city'] = df_cleaned['city'].str.lower()
df_cleaned['us_state'] = df_cleaned['us_state'].str.lower()
df_cleaned['city'] = df_cleaned['city'].str.capitalize()
df_cleaned['us_state'] = df_cleaned['us_state'].str.capitalize()

df_cleaned['working_hours'] = df_cleaned['working_hours'].astype(str)
df_cleaned['working_hours'] = df_cleaned['working_hours'].apply(lambda x: re.sub(r'[a-zA-Z\s]+', '', x))
df_cleaned['working_hours'] = df_cleaned['working_hours'].fillna('Unknown')

df_cleaned['name'] = df_cleaned['name'].str.title()

df_cleaned = df_cleaned[(df_cleaned['latitude'].between(-90, 90)) & (df_cleaned['longitude'].between(-180, 180))]
df_cleaned['latitude'] = df_cleaned['latitude'].round(6)
df_cleaned['longitude'] = df_cleaned['longitude'].round(6)

df_cleaned['rating_category'] = pd.cut(df_cleaned['rating'], bins=[0, 3, 4.5, 5], labels=['Low', 'Medium', 'High'])

df_cleaned['name'] = df_cleaned['name'].str.strip()
df_cleaned['city'] = df_cleaned['city'].str.strip()

def has_values_after_colon(working_hours):
    try:

        parts = working_hours.split(':')
        if len(parts) > 6 and parts[6].strip():  
            return True  
        if len(parts) > 7 and parts[7].strip():  
            return True  
        return False
    except Exception as e:
        return False


df_cleaned['rating'] = pd.to_numeric(df_cleaned['rating'], errors='coerce')
df_cleaned = df_cleaned.dropna(subset=['rating'])

df_cleaned['rating_scaled'] = (df_cleaned['rating'] / df_cleaned['rating'].max()) * 5
df_cleaned['popular_times'] = df_cleaned['popular_times'].apply(lambda x: eval(x) if isinstance(x, str) else x)
df_cleaned = df_cleaned[df_cleaned['popular_times'].apply(lambda x: isinstance(x, list) and len(x) == 7)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['city'] = df_cleaned['city'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['us_state'] = df_cleaned['us_state'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['city'] = df_cleaned['city'].str.capitalize()
A value is trying to be set on a cop

In [4]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2369 entries, 0 to 3996
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   name             2369 non-null   object  
 1   popular_times    2369 non-null   object  
 2   latitude         2369 non-null   float64 
 3   longitude        2369 non-null   float64 
 4   working_hours    2369 non-null   object  
 5   city             2368 non-null   object  
 6   us_state         2368 non-null   object  
 7   rating           2369 non-null   float64 
 8   rating_category  2369 non-null   category
 9   rating_scaled    2369 non-null   float64 
dtypes: category(1), float64(4), object(5)
memory usage: 187.5+ KB


In [5]:
import pandas as pd
import numpy as np

df_hourly = df_cleaned.copy()

def safe_eval(x):
    if isinstance(x, str):
        try:
            return eval(x)
        except:
            return None
    return x

df_hourly['popular_times'] = df_hourly['popular_times'].apply(safe_eval)

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

for day_idx, day in enumerate(days):
    for hour in range(24):
        df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
            lambda x: x[(day_idx + 1) % 7][hour] if isinstance(x, list) and len(x) == 7 and len(x[(day_idx - 1) % 7]) == 24 else np.nan
        )

df_hourly = df_hourly.drop('popular_times', axis=1)

hourly_data_count = df_hourly.iloc[:, -168:].notna().sum().sum()
total_possible = len(df_hourly) * 168 

print(f"Places with hourly data: {hourly_data_count / 168:.0f}")
print(f"Percentage of hourly data available: {hourly_data_count / total_possible:.2%}")

Places with hourly data: 2369
Percentage of hourly data available: 100.00%


  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] = df_hourly['popular_times'].apply(
  df_hourly[f'{day}_{hour:02d}'] =

### Predicting Working Hours from Popular Times Data using Random Forest by Keshav Narayan Srinivasan UBIT: 50610509
- *Analysis:* Random Forest was used to predict the working hours of tourist spots based on features extracted from popular_times. The model handles high-dimensional data well and can capture complex relationships between popular time trends and working hours.


In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

data = df_hourly

days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def robust_parse_working_hours(working_hours):
    parsed_hours = []
    try:
        for day in working_hours.split(":")[1:]:
            if '-' in day:
                times = day.split('-')
                if len(times) >= 2:
                    open_time, close_time = times[:2]
                    open_hour = float(open_time.split(":")[0]) + (float(open_time.split(":")[1]) / 60 if ":" in open_time else 0)
                    close_hour = float(close_time.split(":")[0]) + (float(close_time.split(":")[1]) / 60 if ":" in close_time else 0)
                    parsed_hours.append((open_hour, close_hour))
                else:
                    parsed_hours.append((None, None))
            else:
                parsed_hours.append((None, None))
    except Exception:
        parsed_hours = [(None, None)] * 7
    return parsed_hours

data['parsed_hours'] = data['working_hours'].fillna("").apply(robust_parse_working_hours)

def aggregate_popular_times(row, days):
    daily_hours = []
    for day in days:
        hours = [i for i in range(24) if row.get(f"{day}_{i:02}", 0) > 0]
        if hours:
            daily_hours.append((hours[0], hours[-1] + 1))
        else:
            daily_hours.append((None, None))
    return daily_hours

data['popular_hours'] = data.apply(lambda row: aggregate_popular_times(row, days_of_week), axis=1)

def hours_to_features(hours):
    return [item for sublist in hours for item in sublist]

data['parsed_features'] = data['parsed_hours'].apply(hours_to_features)
data['popular_features'] = data['popular_hours'].apply(hours_to_features)

def normalize_features(features, target_length=14, placeholder=0):
    if len(features) < target_length:
        features.extend([placeholder] * (target_length - len(features)))
    elif len(features) > target_length:
        features = features[:target_length]
    return features

data['parsed_features'] = data['parsed_features'].apply(lambda x: normalize_features(x, 14, placeholder=0))
data['popular_features'] = data['popular_features'].apply(lambda x: normalize_features(x, 14, placeholder=0))

data = data.dropna(subset=['parsed_features', 'popular_features'])

data['parsed_features'] = data['parsed_features'].apply(lambda x: [0 if v is None else v for v in x])
data['popular_features'] = data['popular_features'].apply(lambda x: [0 if v is None else v for v in x])

X = np.array(data['popular_features'].tolist())
y = np.array(data['parsed_features'].tolist())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

y_pred = np.nan_to_num(y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")

def features_to_hours(features):
    return [(round(features[i], 2), round(features[i + 1], 2)) for i in range(0, len(features), 2)]

sample_prediction = features_to_hours(y_pred[0])
print(f"Predicted working hours: {sample_prediction}")


  data['parsed_hours'] = data['working_hours'].fillna("").apply(robust_parse_working_hours)
  data['popular_hours'] = data.apply(lambda row: aggregate_popular_times(row, days_of_week), axis=1)
  data['parsed_features'] = data['parsed_hours'].apply(hours_to_features)
  data['popular_features'] = data['popular_hours'].apply(hours_to_features)


Mean Absolute Error: 0.1663503539427912
Predicted working hours: [(np.float64(0.0), np.float64(0.0)), (np.float64(0.0), np.float64(0.0)), (np.float64(0.0), np.float64(0.0)), (np.float64(0.0), np.float64(0.0)), (np.float64(0.0), np.float64(0.0)), (np.float64(0.0), np.float64(0.0)), (np.float64(0.0), np.float64(0.0))]


**CREATING A NEW working_hours DATA USING POPULAR TIMES DATA AS THE OLD working_hours DID NOT CORRELATE WELL WITH THE popular_times DATA.**

In [7]:
data.drop('working_hours', inplace=True, axis=1)  

In [8]:

def features_to_working_hours(features):
    """
    Convert features (list of open/close times) to the working hours string format.
    Example: [(0, 24), (0, 24), (0, 24), (0, 24), (0, 24), (0, 0), (0, 0)] -> ":24:24:24:24:24::"
    """
    hours = []
    for i in range(0, len(features), 2):
        open_hour = features[i]
        close_hour = features[i + 1]
        if open_hour is not None and close_hour is not None:
            if open_hour == 0 and close_hour == 0:
                # Closed day
                hours.append("")
            elif open_hour == 0 and close_hour == 24:
                # 24-hour open
                hours.append("24")
            else:
                # Format to "open-close"
                hours.append(f"{int(open_hour)}-{int(close_hour)}")
        else:
            hours.append("")
    return ":" + ":".join(hours)

data['calculated_working_hours'] = data['popular_features'].apply(features_to_working_hours)

In [9]:
def get_weekday_hours(working_hours):
    try:
        parts = working_hours.split(':')
        weekday_parts = parts[1:6]
        weekday_hours = sum(1 for part in weekday_parts if part.strip())
        return weekday_hours
    except Exception as e:
        return 0 
def get_weekend_hours(working_hours):
    try:
        parts = working_hours.split(':')
        weekend_parts = parts[6:7]
        weekend_hours = sum(1 for part in weekend_parts if part.strip())
        return weekend_hours
    except Exception as e:
        return 0 
data['weekday_hours'] = data['calculated_working_hours'].apply(get_weekday_hours)
data['weekend_hours'] = data['calculated_working_hours'].apply(get_weekend_hours)
data['is_weekend_open'] = data['calculated_working_hours'].apply(has_values_after_colon)

data['calculated_working_hours'] = data['calculated_working_hours'].apply(lambda x: re.sub(r'[^\d:-]', '', x))
data['weekday_hours'] = data['calculated_working_hours'].apply(get_weekday_hours)
data['weekend_hours'] = data['calculated_working_hours'].apply(get_weekend_hours)

### Popular Times Classification for Each Hour Using KNN by Pramila Yadav UBIT: 50613803
- *Analysis:* KNN categorizes popularity levels for each hour (e.g., High, Medium, Low) based on historical hourly visitation data. Scaling the features ensures fair distance computation.


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

numeric_columns = data.select_dtypes(include=[np.number]).columns
data_numeric = data[numeric_columns]
data_numeric.fillna(data_numeric.mean(), inplace=True)

def categorize_popularity(hourly_data):
    if hourly_data > 70:
        return 'high'
    elif hourly_data > 40:
        return 'medium'
    else:
        return 'low'
for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
    for hour in range(24):
        column_name = f'{day}_{hour:02d}'
        data[column_name + '_category'] = data[column_name].apply(categorize_popularity)
X = []
y = []
for idx, row in data.iterrows():
    for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
        for hour in range(24):
            column_name = f'{day}_{hour:02d}'
            X.append(row[[column_name]]) 
            y.append(row[column_name + '_category'])  

X = np.array(X)
y = np.array(y)
assert X.shape[0] == y.shape[0], f"Inconsistent shapes: X has {X.shape[0]} samples, but y has {y.shape[0]} samples."

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test_scaled)
data['predicted_popularity'] = np.nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_numeric.fillna(data_numeric.mean(), inplace=True)
  data[column_name + '_category'] = data[column_name].apply(categorize_popularity)
  data[column_name + '_category'] = data[column_name].apply(categorize_popularity)
  data[column_name + '_category'] = data[column_name].apply(categorize_popularity)
  data[column_name + '_category'] = data[column_name].apply(categorize_popularity)
  data[column_name + '_category'] = data[column_name].apply(categorize_popularity)
  data[column_name + '_category'] = data[column_name].apply(categorize_popularity)
  data[column_name + '_category'] = data[column_name].apply(categorize_popularity)
  data[column_name + '_category'] = data[column_name].apply(categorize_popularity)
  data[column_name + '_category'] = data[column_name].apply(categorize_po

### Closest 5 Places for Each Place Using BallTree by Tharunnesh Ramamoorthy UBIT: 50611344
- *Analysis:* BallTree efficiently calculates the nearest neighbors for each tourist location based on their geographical coordinates using the haversine distance metric.


In [11]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree

coords = data[['latitude', 'longitude']].values
names = data['name'].values

tree = BallTree(np.deg2rad(coords), metric='haversine')

def get_nearest_neighbors(tree, coords, names, k=5):
    distances, indices = tree.query(np.deg2rad(coords), k=k+1) 
    
    distances = np.rad2deg(distances) * 6371 
    nearest_neighbors = []
    for i, (dists, idxs) in enumerate(zip(distances, indices)):
        neighbors = [f"{names[idx]}:{dists[j]:.2f}km" for j, idx in enumerate(idxs[1:], 1)] 
        nearest_neighbors.append(", ".join(neighbors))
    return nearest_neighbors

data['nearest_neighbors'] = get_nearest_neighbors(tree, coords, names, k=5)
data.to_csv('nearest_neighbours.csv')

  data['nearest_neighbors'] = get_nearest_neighbors(tree, coords, names, k=5)


In [12]:
data.head()

Unnamed: 0,name,latitude,longitude,city,us_state,rating,rating_category,rating_scaled,Monday_00,Monday_01,...,Sunday_16_category,Sunday_17_category,Sunday_18_category,Sunday_19_category,Sunday_20_category,Sunday_21_category,Sunday_22_category,Sunday_23_category,predicted_popularity,nearest_neighbors
0,Mardis Mill Falls,34.044364,-86.571446,Blountsville,Alabama,4.6,High,4.6,0,0,...,low,low,low,low,low,low,low,low,,"Easley Covered Bridge:544.56km, Horton Mill Co..."
1,Waterville Usa/Escape House,30.258331,-87.687064,Gulf shores,Alabama,4.3,Medium,4.3,0,0,...,low,low,low,low,low,low,low,low,,"Bamahenge:807.15km, Uss Alabama Battleship Mem..."
2,Bama Bison Rv Park & Farm,32.425044,-85.250269,Opelika,Alabama,5.0,High,5.0,0,0,...,low,low,low,low,low,low,low,low,,"Dixie Walking Trail:209.42km, Museum Of Wonder..."
3,The Mobile Tunnel,30.690009,-88.03562,Mobile,Alabama,4.8,High,4.8,0,0,...,low,low,low,low,low,low,low,low,,"Cooper Riverside Park:10.93km, Exploreum Scien..."
4,Bamahenge,30.331442,-87.567232,Elberta,Alabama,4.5,Medium,4.5,0,0,...,low,low,low,low,low,low,low,low,,"Waterville Usa/Escape House:807.15km, Uss Alab..."


### Predicting Rating Classification Using Decision Tree by Hari Chandan Gooda UBIT : 50614165
- *Analysis:* A Decision Tree Classifier categorizes locations into High, Medium, or Low rating categories based on features like location and visitation data. The simple tree structure ensures interpretability.


In [13]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

df_final = pd.read_csv('nearest_neighbours.csv')

X = df_final.drop(['rating', 'rating_category'], axis=1)
y = df_final['rating_category']

X = pd.get_dummies(X)
clf = DecisionTreeClassifier(max_depth = 2, random_state=42)
clf.fit(X, y)

df_final['predicted_rating_category'] = clf.predict(X)

# df_final.to_csv('updated_with_predictions.csv', index=False)


In [14]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

X = df_final.drop(['rating', 'rating_category'], axis=1)
y = df_final['rating_category']

X = pd.get_dummies(X)

clf = DecisionTreeClassifier(max_depth = 2, random_state=42)
clf.fit(X, y)

df_final['predicted_rating_category'] = clf.predict(X)



In [15]:
df_final.head()
# df_final.to_csv("Final.csv")

Unnamed: 0.1,Unnamed: 0,name,latitude,longitude,city,us_state,rating,rating_category,rating_scaled,Monday_00,...,Sunday_17_category,Sunday_18_category,Sunday_19_category,Sunday_20_category,Sunday_21_category,Sunday_22_category,Sunday_23_category,predicted_popularity,nearest_neighbors,predicted_rating_category
0,0,Mardis Mill Falls,34.044364,-86.571446,Blountsville,Alabama,4.6,High,4.6,0,...,low,low,low,low,low,low,low,,"Easley Covered Bridge:544.56km, Horton Mill Co...",High
1,1,Waterville Usa/Escape House,30.258331,-87.687064,Gulf shores,Alabama,4.3,Medium,4.3,0,...,low,low,low,low,low,low,low,,"Bamahenge:807.15km, Uss Alabama Battleship Mem...",Medium
2,2,Bama Bison Rv Park & Farm,32.425044,-85.250269,Opelika,Alabama,5.0,High,5.0,0,...,low,low,low,low,low,low,low,,"Dixie Walking Trail:209.42km, Museum Of Wonder...",High
3,3,The Mobile Tunnel,30.690009,-88.03562,Mobile,Alabama,4.8,High,4.8,0,...,low,low,low,low,low,low,low,,"Cooper Riverside Park:10.93km, Exploreum Scien...",High
4,4,Bamahenge,30.331442,-87.567232,Elberta,Alabama,4.5,Medium,4.5,0,...,low,low,low,low,low,low,low,,"Waterville Usa/Escape House:807.15km, Uss Alab...",Medium
