# Data Preprocessing

## Import Libraries

In [None]:
import ast
import re
import time

import nltk
import numpy as np
import pandas as pd
import requests
import spacy
from afinn import Afinn
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources requested by this notebook: the stop-word corpus,
# the Punkt tokenizer models and the averaged-perceptron POS tagger.
for _resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

# Initial Data Preprocessing

Step 01: Two functions are used: one cleans the string columns (name, formatted_address, and latest_reviews) by retaining only English characters and spaces, and the other cleans the numerical columns (rating and user_ratings_total) by removing non-numeric characters and converting them to floats.

In [None]:
# Load the raw places export into a DataFrame.
places_csv = 'Places Dataset.xlsx - places_final_dataset.csv'
df_places= pd.read_csv(places_csv)

# Quick inspection of the schema and the first rows. info() prints its report
# itself; the tuple only groups both calls into a single expression.
df_places.info(), df_places.head()


def clean_encoding(text):
    """Replace two known garbled sequences and drop all non-ASCII characters.

    Args:
        text: Value to clean. Only ``str`` instances are processed.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Garbled sequences are mapped to plain quotes, in the original order.
    for garbled, replacement in (("Ã¢Â€Â™", "'"), ("Ã¢Â€Â", '"')):
        text = text.replace(garbled, replacement)

    # Anything still outside the ASCII range is removed.
    return re.sub(r'[^\x00-\x7F]+', '', text)

def safe_literal_eval(x):
    """Parse a string containing a Python literal, falling back to the input.

    Args:
        x: Value to parse. Only ``str`` instances are parsed.

    Returns:
        The object produced by ``ast.literal_eval`` when ``x`` is a string
        holding a valid literal; otherwise ``x`` unchanged.
    """
    if not isinstance(x, str):
        return x
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers syntactically valid but unbuildable literals such
        # as a dict with a list key; the raw text is kept in every failure case.
        return x

# Clean the encoding of every raw review cell, then parse cells that hold a
# Python literal into real objects.
df_places['latest_reviews'] = (
    df_places['latest_reviews']
    .apply(clean_encoding)
    .apply(safe_literal_eval)
)

print(df_places['latest_reviews'].head(2))

# Load the previously exported cleaned dataset for the name/address clean-up.
df_cleaned = pd.read_csv('cleaned_places_data.csv')

def clean(name):
    """Normalise a free-text field to ASCII letters, digits, spaces and hyphens.

    Args:
        name: Text to clean. Non-string values (e.g. NaN from a missing CSV
            cell) are returned unchanged instead of raising ``TypeError``.

    Returns:
        The cleaned string with whitespace runs collapsed to a single space
        and leading/trailing whitespace removed, or ``name`` itself when it
        is not a string.
    """
    if not isinstance(name, str):
        return name
    name = re.sub(r'[^\x00-\x7F]+', '', name)      # drop non-ASCII characters
    name = re.sub(r'[^a-zA-Z0-9\s-]', '', name)    # drop remaining punctuation
    return re.sub(r'\s+', ' ', name).strip()       # collapse whitespace runs

# Normalise both free-text columns with the same cleaner.
for _column in ('name', 'formatted_address'):
    df_cleaned[_column] = df_cleaned[_column].apply(clean)

df_cleaned.head()



Step 02: The provided script preprocesses a dataset containing place reviews and ratings by first cleaning the data to remove rows with missing values in the 'latest_reviews' or 'rating' columns. It then performs sentiment analysis on the reviews using the SentimentIntensityAnalyzer from NLTK, extracting sentiment features such as negativity, neutrality, positivity, and compound scores. Additionally, it applies TF-IDF vectorization to convert text reviews into numerical features. These sentiment and TF-IDF features are combined and used to train a RandomForestRegressor model to predict ratings. The model's performance is evaluated with Mean Squared Error and R² score. For rows with missing ratings, sentiment analysis and TF-IDF vectorization are applied to predict the missing values, which are then filled in the dataset. The updated dataset, with predictions filled in, is saved to a new CSV file.

In [None]:
# Lexicon requested for the SentimentIntensityAnalyzer created below.
nltk.download('vader_lexicon')

df = pd.read_csv('finalcleaned_places_dataset.csv')

# Training rows need both a review and a known rating. Take an explicit copy:
# the dtype conversion below must write to an independent frame, not to a
# slice of `df` (which triggers pandas' SettingWithCopyWarning).
df_clean = df.dropna(subset=['latest_reviews', 'rating']).copy()

df_clean['rating'] = df_clean['rating'].astype(float)

sia = SentimentIntensityAnalyzer()

def extract_sentiment(review):
    """Return the polarity scores of ``review`` as a labelled Series.

    Args:
        review: Review content; it is converted with ``str()`` before scoring.

    Returns:
        A ``pd.Series`` holding the 'neg', 'neu', 'pos' and 'compound' scores
        from ``sia.polarity_scores``, in that order. The entries carry string
        labels so that a frame built from them can be concatenated with the
        string-named TF-IDF columns without mixing integer and string column
        names, which scikit-learn estimators reject.
    """
    sentiment = sia.polarity_scores(str(review))
    score_keys = ('neg', 'neu', 'pos', 'compound')
    # ':' never appears in a TF-IDF token, so these labels cannot collide
    # with vocabulary column names.
    return pd.Series(
        [sentiment[key] for key in score_keys],
        index=[f'sentiment:{key}' for key in score_keys],
    )

# One row of polarity scores per training review.
sentiment_features = df_clean['latest_reviews'].apply(extract_sentiment)

# TF-IDF features limited to 5000 terms, English stop words removed.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_features = tfidf.fit_transform(df_clean['latest_reviews'])

tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf.get_feature_names_out())

# Both frames are re-indexed from 0 so the column-wise concat pairs rows by
# position rather than by the gappy index left behind by dropna().
X = pd.concat([sentiment_features.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)
# scikit-learn rejects frames whose column labels mix types (e.g. integer
# sentiment columns next to string TF-IDF terms); force them all to str.
X.columns = X.columns.astype(str)

y = df_clean['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Model Mean Squared Error: {mse:.2f}')
print(f'Model R² Score: {r2:.2f}')

# Rows whose rating has to be imputed.
missing_mask = df['rating'].isnull()
df_missing = df[missing_mask]

if df_missing.empty:
    # predict() refuses an input with zero rows, so skip imputation entirely.
    print("No missing ratings to predict.")
else:
    # A row may lack its review as well; the vectorizer rejects NaN
    # documents, so score such rows as empty text.
    missing_reviews = df_missing['latest_reviews'].fillna('')

    sentiment_missing = missing_reviews.apply(extract_sentiment)

    tfidf_missing = tfidf.transform(missing_reviews)
    tfidf_missing_df = pd.DataFrame(tfidf_missing.toarray(), columns=tfidf.get_feature_names_out())

    X_missing = pd.concat([sentiment_missing.reset_index(drop=True), tfidf_missing_df.reset_index(drop=True)], axis=1)
    X_missing.columns = X_missing.columns.astype(str)

    predicted_ratings = model.predict(X_missing)

    df.loc[missing_mask, 'rating'] = predicted_ratings

ratings_output_path = '2Rating_updated_dataset_with_sentiment_predictions.csv'
df.to_csv(ratings_output_path, index=False)

# Report the file name that was actually written.
print(f"Predicted ratings (with sentiment analysis) have been filled and saved to '{ratings_output_path}'.")


Step 03 : The provided script loads a dataset with reviews and ratings from a CSV file and preprocesses the text data in the 'latest_reviews' column. It starts by downloading the NLTK stopwords and defining a set of common English stopwords. The clean_reviews function removes these stopwords from each review by tokenizing the text, filtering out stopwords, and then rejoining the cleaned tokens into a string. This function is applied to the 'latest_reviews' column of the dataset. The cleaned reviews are displayed and optionally saved to a new CSV file.

In [None]:
# Reload the dataset written at the end of Step 02.
file_path = '2Rating_updated_dataset_with_sentiment_predictions.csv'
df = pd.read_csv(file_path)

# English stop words as a set; clean_reviews tests membership against it.
stop_words = set(stopwords.words('english'))


def clean_reviews(text):
    """Remove stop words from a whitespace-separated review.

    Args:
        text: Review text. Non-string values are returned unchanged.

    Returns:
        The remaining words joined by single spaces. A word is dropped when
        its lower-cased form is in the module-level ``stop_words`` set.
    """
    if not isinstance(text, str):
        return text
    kept_words = (token for token in text.split() if token.lower() not in stop_words)
    return ' '.join(kept_words)


# Strip stop words from every review in place.
df['latest_reviews'] = df['latest_reviews'].apply(clean_reviews)

# Preview of the cleaned reviews next to the place names.
df[['name', 'latest_reviews']].head()

# Persist the result; Step 04 reads this file back.
cleaned_file_path = '2finalcleaned_places_reviews.csv'
df.to_csv(cleaned_file_path, index=False)


Step 04 : The provided script performs text processing on the 'latest_reviews' column of a dataset by retaining only nouns and verbs. It begins by downloading necessary NLTK data for tokenization and part-of-speech tagging. The filter_nouns_verbs_unique function tokenizes the text, applies part-of-speech tagging, filters out only the nouns and verbs, and then removes duplicate words by converting the list to a set and back to a list. This function is applied to the 'latest_reviews' column, replacing the original review texts with lists of unique nouns and verbs. The processed data is then saved to a new CSV file.

In [None]:
# Same two resources as requested in the import cell; repeated here so this
# cell can be run on its own.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Reload the stop-word-free reviews written by Step 03.
file_path = '2finalcleaned_places_reviews.csv'  
df = pd.read_csv(file_path)

def filter_nouns_verbs_unique(text):
    """Return the distinct nouns and verbs of ``text`` in first-seen order.

    Args:
        text: Review text to tokenize and POS-tag.

    Returns:
        A list of the tokens whose tag starts with 'NN' or 'VB', without
        duplicates. Order follows first occurrence, so repeated runs write
        identical output (a plain ``set`` round-trip orders strings
        differently from run to run).
    """
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    filtered_words = [word for word, pos in tagged if pos.startswith(('NN', 'VB'))]
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(filtered_words))

# Every cell is converted with str() first, so missing values are processed
# as text too, then reduced to its unique nouns and verbs.
reviews_as_text = df['latest_reviews'].map(str)
df['latest_reviews'] = reviews_as_text.apply(filter_nouns_verbs_unique)

output_path = '3filtered_reviews_override.csv'
df.to_csv(output_path, index=False)

print("Filtered reviews saved to", output_path)


Step 05 : This Python script processes a dataset of location-based reviews using the pandas library. It reads data from a CSV file, defines a function to merge and deduplicate review lists, and then aggregates the data by location name. For each location, it retains the first occurrence of latitude, longitude, formatted address, and user ratings total, calculates the mean rating, and applies the review merging function to ensure unique reviews. The cleaned and aggregated dataset is then saved to a new CSV file, with a confirmation message printed to indicate the file path.

In [None]:
# Reload the noun/verb lists written by Step 04; after the CSV round trip each
# list is read back as its string representation.
file_path = '3filtered_reviews_override.csv'  
df = pd.read_csv(file_path)

def merge_reviews(reviews):
    """Merge several stringified word lists into one de-duplicated list.

    Args:
        reviews: Iterable of cells, each expected to be the string form of a
            Python list (e.g. ``"['beach', 'swim']"``). Non-string cells such
            as NaN are skipped.

    Returns:
        A list of the distinct items across all cells, in first-seen order.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid literal.
    """
    merged = {}
    for review_list in reviews:
        if not isinstance(review_list, str):
            continue  # missing cell: nothing to merge
        # literal_eval only builds literals; unlike eval it cannot execute
        # code that happens to be stored in the CSV.
        review_items = ast.literal_eval(review_list)
        merged.update(dict.fromkeys(review_items))
    return list(merged)

# How each column is collapsed when rows sharing a place name are merged:
# first value for location fields, mean for the rating, merged unique words
# for the reviews.
aggregation_spec = {
    'lat': 'first',
    'lng': 'first',
    'formatted_address': 'first',
    'rating': 'mean',
    'user_ratings_total': 'first',
    'latest_reviews': merge_reviews,
}
grouped_by_name = df.groupby('name')
merged_df = grouped_by_name.agg(aggregation_spec).reset_index()

output_file_path = '4merged_unique_reviews.csv'
merged_df.to_csv(output_file_path, index=False)

print(f"Cleaned dataset saved to {output_file_path}")


# Preprocessing with Groq API and Llama

This section categorizes Sri Lankan tourist destinations based on their latest reviews. It uses an LLM (Llama, accessed through the Groq API) to assign activity categories to each destination automatically, providing valuable insights for travelers and tourism planners.

In [None]:
# Load the places dataset with cleaned names.
df = pd.read_csv('cleaned_name_places_data.csv')

In [None]:
# Preview the first rows.
df.head()

In [None]:
def clean_text_column(df, column_name):
    """Strip brackets and quotes from a column and lower-case it, in place.

    Args:
        df: DataFrame to modify.
        column_name: Name of the column to normalise.

    Returns:
        The same ``df`` object, with ``column_name`` replaced by its string
        form minus every ``[``, ``]``, ``"`` and ``'`` character, lower-cased.
    """
    as_text = df[column_name].astype(str)
    without_markup = as_text.str.replace(r'[\[\]"\']', '', regex=True)
    df[column_name] = without_markup.str.lower()
    return df

In [None]:
# Both calls modify df in place; the returned frame is the same object.
clean_text_column(df, 'latest_reviews')
clean_text_column(df, 'name')

<hr>

In [None]:
# Switch to the visitor dataset (rebinds df).
df = pd.read_csv('visitor.csv')

In [None]:
# Flatten the per-visitor activity lists; each cell holds the string form of
# a Python list.
all_activities = []
for activities_str in df['Preferred Activities']:
    activities_list = ast.literal_eval(activities_str)
    all_activities.extend(activities_list)

# De-duplicate while keeping first-seen order (a plain list() copy would keep
# every repeated activity despite the variable's name).
unique_individual_activities = list(dict.fromkeys(all_activities))
unique_individual_activities

<hr>

In [None]:
# Reload the per-place merged reviews written by Step 05 (rebinds df).
df = pd.read_csv('4merged_unique_reviews.csv')

In [None]:
import getpass
import os

# Prompt only when the key is not already supplied by the environment, so an
# existing key is not overwritten and the cell can run non-interactively.
if "GROQ_API_KEY" not in os.environ:
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

In [None]:
# Chat model used for the categorisation below. temperature=0 presumably aims
# at repeatable answers; max_tokens and timeout are left unset (None).
# NOTE(review): confirm the model id "llama3-70b-8192" is still served by Groq
# before re-running this cell.
llm = ChatGroq(
    model="llama3-70b-8192",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [None]:
# Two-message template: a system message listing the allowed activity
# categories, and a human message filled with {destination} and {reviews}.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            '''You are a helpful assistant that gives the best matching activity category(s) from these for the input Sri Lankan tourist Destinations and their latest reviews
            Activity Categories = ['cycling', 'historical monuments', 'village homestays', 'butterfly watching', 'hot springs', 'wildlife viewing', 'sea cruises', 'themed parks', 'craft workshops', 'fishing', 'sailing', 'history tours', 'literary tours', 'public art installations', 'temple pilgrimages', 'architecture tours', 'golfing', 'hot air ballooning', 'spiritual retreats', 'cultural experiences', 'botanical gardens', 'boat safaris', 'caving', 'cultural festivals', 'museum visits', 'mountain biking', 'camping', 'turtle watching', 'historic walks', 'safaris', 'waterfalls', 'scuba diving', 'elephant rides', 'bird watching', 'ayurvedic spa treatments', 'horse shows', 'traditional ceremonies', 'surfing', 'historic sites', 'art classes', 'city tours', 'theater', 'amusement parks', 'architecture photography', 'beachfront dining', 'kayaking', 'beach visits', 'rock climbing', 'arts and culture', 'snorkeling', 'animal encounters', 'archaeological sites', 'sailing lessons', 'whale watching', 'local crafts', 'yoga retreats', 'paddleboarding', 'horseback riding', 'zip-lining', 'outdoor adventures', 'planetarium visits', 'water parks', 'photography', 'sightseeing', 'tea tasting', 'hiking', 'river cruises', 'landscape photography']

            ONLY OUTPUT THE CATEGORY(S) ONLY DON'T ADD EXPLANATIONS
        ''',
        ),
        ("human", "Destination: {destination}\nLatest Reviews: {reviews}"),
    ]
)



# Compose the prompt template with the model into one runnable.
chain = prompt | llm


def process_destination_with_reviews(row):
    """Ask the chain for the activity categories of one destination row.

    Args:
        row: Mapping-like row providing 'name' and 'latest_reviews'.

    Returns:
        The ``content`` of the chain's response; it is also printed.
    """
    payload = {"destination": row['name'], "reviews": row['latest_reviews']}
    categories = chain.invoke(payload).content
    print(categories)
    return categories

# One chain invocation per row. The result column keeps the 'categoriess'
# spelling because a later cell copies from that exact name.
df['categoriess'] = df.apply(process_destination_with_reviews, axis=1)

print(df[['name', 'latest_reviews', 'categoriess']].head())

# Optionally, save the updated DataFrame
df.to_csv('updated_destinations_with_categories.csv', index=False)

In [None]:
# NOTE(review): index=True writes the row index as an extra unnamed first
# column, which comes back as a regular column when the file is re-read with
# pd.read_csv below. Confirm this is intended.
df.to_csv('places_v7.csv', index=True)

<hr>

In [None]:
# Earlier snapshot of the places data; its 'latest_reviews' column feeds the
# word counts computed further down.
df1 = pd.read_csv('places_v3.csv')

In [None]:
# Categorised places written above (rebinds df).
df = pd.read_csv('places_v7.csv')

In [None]:
# Additional tokenizer resource; presumably required by word_tokenize on newer
# NLTK releases. TODO confirm against the installed version.
nltk.download('punkt_tab')

In [None]:
afinn = Afinn()


def process_sentence(sentence):
    """Count positive, negative and neutral tokens according to AFINN.

    Args:
        sentence: Text to tokenize. Non-string values (e.g. NaN from a
            missing cell) yield zero counts instead of a tokenizer error.

    Returns:
        A ``(positive, negative, neutral)`` tuple of token counts, where a
        token is positive when its AFINN score is above zero, negative when
        below zero, and neutral otherwise.
    """
    if not isinstance(sentence, str):
        return 0, 0, 0

    # Only the counts are returned, so tally directly instead of collecting
    # the words into lists.
    positive = negative = neutral = 0
    for word in word_tokenize(sentence):
        score = afinn.score(word)
        if score > 0:
            positive += 1
        elif score < 0:
            negative += 1
        else:
            neutral += 1

    return positive, negative, neutral

# Expand each (positive, negative, neutral) tuple into three columns.
# NOTE(review): the counts come from df1 (places_v3.csv) but are stored on df
# (places_v7.csv), and pandas pairs the rows by index label. Confirm both
# files list the same places in the same order.
df[['positive_words', 'negative_words', 'neutral_words']] = df1['latest_reviews'].apply(process_sentence).apply(pd.Series)

print(df)

In [None]:
# Only the positive/negative counts are kept.
df = df.drop(columns=['neutral_words'])

In [None]:
# Expose the LLM output under the correctly spelled name; the original
# 'categoriess' column stays in the frame.
df['categories']  = df['categoriess']

In [None]:
# Missing rating counts are replaced by 1.
df['user_ratings_total'] = df['user_ratings_total'].fillna(1)

In [None]:
# Overwrite places_v7.csv with the enriched frame (no index column this time).
df.to_csv('places_v7.csv', index=False) 

<hr>

In [None]:
# Show every row and column when printing frames.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df = pd.read_csv('visitor.csv')

print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

# info() prints its report itself and returns None, so this also prints "None".
print(df.info())

# Each cell holds the string form of a Python list; parse it into a real list.
df['Bucket list destinations Sri Lanka'] = df['Bucket list destinations Sri Lanka'].apply(ast.literal_eval)

# One row per listed destination, then the distinct values.
all_destinations = df['Bucket list destinations Sri Lanka'].explode().unique()
unique_destination_count = len(all_destinations)


print(f"The number of unique destinations in 'Bucket list destinations Sri Lanka' column is: {unique_destination_count}")

print(all_destinations)

# Destination names pasted from the printed array above. Each entry needs its
# own comma: adjacent string literals without one are concatenated by Python
# into a single string, which would leave this list with one giant element.
all_destinations = [
    'Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ella', 'Haputale',
    'Madunagala Hot Water Spring', 'Wilpattu National Park',
    'Wasgamuwa National Park', 'Kanneliya National Rain Forest Reserve',
    'Horton Plains National Park', 'Mirissa Beach', 'Negombo Lagoon',
    'Batadombalena Craft Centre', 'Jungle Beach', 'Bentota',
    'Maha Oya Hot Water Springs', 'Colombo Port City', 'Trincomalee Harbour',
    'Kalpitiya', 'Galle Dutch Fort', 'Sigiriya', 'Jaffna Public Library',
    'Colombo', 'Mihintale', 'Dambulla Royal Cave Temple and Golden Temple',
    'Hikkaduwa', 'Nuwara Eliya Golf Club', 'Kandalama', "Sri Pada / Adam's Peak",
    'Seetha Eliya', 'Sri Dalada Maligawa',
    'Seethawaka Wet Zone Botanical Gardens', 'Kandy Temple',
    'Arankelle Forest Monastery', 'Batatotalena (Batadombalena) Cave',
    'Madu River', 'Ritigala', 'Kandy National Museum', 'Folk Museum',
    'Victoria Golf Club', 'Colombo National Museum', 'Meemure', 'Horton Plains',
    'Udawalawe National Park', 'Ratnapura Gem Museum', 'Rekawa Beach',
    'Kandy Lake', 'Galle Fort', 'Anuradapura', 'Polonaruwa', 'Diyaluma Falls',
    'Dunhinda Waterfall', 'Yala National Park', 'Trincomalee',
    "St Clair's Falls", 'Udawalawe', 'Sinharaja Forest Reserve',
    'Kumana National Park', 'Bundala National Park', 'Nallur Kandaswamy Kovil',
    'Kandy', 'Nuwara Eliya', 'Tangalle', 'Weligama Beach (surf and stay)',
    'Ramboda Falls', "Baker's Falls", 'Bomburu Ella Waterfall',
    'Polonnaruwa Ancient City', 'Galle', 'Pinnawala', 'Colombo City Tour',
    'Kandy City Centre', 'Nelung Arts Centre', 'Excel World',
    'Dry Zone Botanic Gardens, Hambantota', 'Unawatuna', 'Knuckles',
    'Kalpitiya Lagoon', 'Bentota River', 'Kitulgala', 'Passikuda Beach',
    'Bentota Beach', 'Marakolliya Beach', 'Bopath Falls', 'Pigeon Island',
    'Hikkaduwa Beach', 'Unawatuna Beach', 'Elephant Transit Home',
    'Leisure World', 'Ruhunu Maha Kataragama Dewalaya', 'Negombo',
    'Tangalle Beach', 'Gangaramaya Temple', 'Minneriya National Park',
    'Kitugala Forest', 'Nilaveli Beach', 'National Gallery of Art',
    'Maritime Museum', 'National Museum Galle', 'Dutch Museum',
    'Mahapelessa Hot Springs', 'Arugam Bay Beach', 'Yapahuwa Rock Fortress',
    'Royal Botanical Gardens, Peradeniya', 'Negambo', 'Royal Colombo Golf Club',
    'Sri Lanka Planetarium', 'Dambulla', 'Hakgala Botanical Garden',
    'Unawatuna Lagoon', 'Mahalenama Cave', 'Kithulgala', 'Water World Lanka',
    'Pearl Bay', 'Nine Arches Bridge', 'Pinnawala Elephant Orphanage',
    'Museum of Modern and Contemporary Art', 'Surathali Ella',
    'Nelum Pokuna Theatre', 'Weligama Beach', 'Belilena Caves',
    'Galle City Tour', 'Bolgoda Lake', 'Ambalangoda Mask Workshop', 'Belihuloya',
    'Perl Bay', 'Ahungalla', 'Nallur Kandaswamy Devasthanam',
    'Velgam Vehera Buddhist Temple', 'Ambuluwawa Tower',
    'Anawilundawa Wetlands', 'Ella Rock', 'Ambalangoda', 'Riverstone Gap',
    'Hikkaduwa Coral Sanctuary', 'Bambarakiri Ella', 'Jaya Sri Maha Bodhi',
    'Ahangama', 'Viharamahadevi Park', 'Pidurangala Rock', 'Galle Lighthouse',
    'Martin Wickramasinghe Folk Museum', ' Laxapana Falls',
    'Ravan Ella Waterfall', 'Wavulpone Cave', 'Lionel Wendt Art Centre',
    'Koggala Beach', 'Kosgoda Turtle Hatchery', 'Devon Falls',
    'Bambarakanda Falls', 'Hiriketiya', 'Vaddha Village Camping',
    'Kanniya Hot Springs', 'Jungle beach', 'Ella Gap', 'Hiriketiya Beach',
    'Uppuveli Beach',
]

In [None]:
# Show every row and column when printing frames.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Enriched places dataset written earlier (rebinds df).
df = pd.read_csv('places_v7.csv')

# Destination names used to filter the places dataset. Entries are compared
# exactly, so spelling variants, letter case and surrounding spaces matter.
famous_destinations = [
    'Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ella', 'Haputale',
    'Madunagala Hot Water Spring', 'Wilpattu National Park',
    'Wasgamuwa National Park', 'Kanneliya National Rain Forest Reserve',
    'Horton Plains National Park', 'Mirissa Beach', 'Negombo Lagoon',
    'Batadombalena Craft Centre', 'Jungle Beach', 'Bentota',
    'Maha Oya Hot Water Springs', 'Colombo Port City', 'Trincomalee Harbour',
    'Kalpitiya', 'Galle Dutch Fort', 'Sigiriya', 'Jaffna Public Library',
    'Colombo', 'Mihintale', 'Dambulla Royal Cave Temple and Golden Temple',
    'Hikkaduwa', 'Nuwara Eliya Golf Club', 'Kandalama', "Sri Pada / Adam's Peak",
    'Seetha Eliya', 'Sri Dalada Maligawa',
    'Seethawaka Wet Zone Botanical Gardens', 'Kandy Temple',
    'Arankelle Forest Monastery', 'Batatotalena (Batadombalena) Cave',
    'Madu River', 'Ritigala', 'Kandy National Museum', 'Folk Museum',
    'Victoria Golf Club', 'Colombo National Museum', 'Meemure', 'Horton Plains',
    'Udawalawe National Park', 'Ratnapura Gem Museum', 'Rekawa Beach',
    'Kandy Lake', 'Galle Fort', 'Anuradapura', 'Polonnaruwa', 'Diyaluma Falls',
    'Dunhinda Waterfall', 'Yala National Park', 'Trincomalee',
    "St Clair's Falls", 'Udawalawe', 'Sinharaja Forest Reserve',
    'Kumana National Park', 'Bundala National Park', 'Nallur Kandaswamy Kovil',
    'Kandy', 'Nuwara Eliya', 'Tangalle', 'Weligama Beach (surf and stay)',
    'Ramboda Falls', "Baker's Falls", 'Bomburu Ella Waterfall',
    'Polonnaruwa Ancient City', 'Galle', 'Pinnawala', 'Colombo City Tour',
    'Kandy City Centre', 'Nelung Arts Centre', 'Excel World',
    'Dry Zone Botanic Gardens, Hambantota', 'Unawatuna', 'Knuckles',
    'Kalpitiya Lagoon', 'Bentota River', 'Kitulgala', 'Passikuda Beach',
    'Bentota Beach', 'Marakolliya Beach', 'Bopath Falls', 'Pigeon Island',
    'Hikkaduwa Beach', 'Unawatuna Beach', 'Elephant Transit Home',
    'Leisure World', 'Ruhunu Maha Kataragama Dewalaya', 'Negombo',
    'Tangalle Beach', 'Gangaramaya Temple', 'Minneriya National Park',
    'Kitugala Forest', 'Nilaveli Beach', 'National Gallery of Art',
    'Maritime Museum', 'National Museum Galle', 'Dutch Museum',
    'Mahapelessa Hot Springs', 'Arugam Bay Beach', 'Yapahuwa Rock Fortress',
    'Royal Botanical Gardens, Peradeniya', 'Negambo', 'Royal Colombo Golf Club',
    'Sri Lanka Planetarium', 'Dambulla', 'Hakgala Botanical Garden',
    'Unawatuna Lagoon', 'Mahalenama Cave', 'Kithulgala', 'Water World Lanka',
    'Pearl Bay', 'Nine Arches Bridge', 'Pinnawala Elephant Orphanage',
    'Museum of Modern and Contemporary Art', 'Surathali Ella',
    'Nelum Pokuna Theatre', 'Weligama Beach', 'Belilena Caves',
    'Galle City Tour', 'Bolgoda Lake', 'Ambalangoda Mask Workshop', 'Belihuloya',
    'Perl Bay', 'Ahungalla', 'Nallur Kandaswamy Devasthanam',
    'Velgam Vehera Buddhist Temple', 'Ambuluwawa Tower',
    'Anawilundawa Wetlands', 'Ella Rock', 'Ambalangoda', 'Riverstone Gap',
    'Hikkaduwa Coral Sanctuary', 'Bambarakiri Ella', 'Jaya Sri Maha Bodhi',
    'Ahangama', 'Viharamahadevi Park', 'Pidurangala Rock', 'Galle Lighthouse',
    'Martin Wickramasinghe Folk Museum', ' Laxapana Falls',
    'Ravan Ella Waterfall', 'Wavulpone Cave', 'Lionel Wendt Art Centre',
    'Koggala Beach', 'Kosgoda Turtle Hatchery', 'Devon Falls',
    'Bambarakanda Falls', 'Hiriketiya', 'Vaddha Village Camping',
    'Kanniya Hot Springs', 'Jungle beach', 'Ella Gap', 'Hiriketiya Beach',
    'Uppuveli Beach',
]

# Keep only the places whose name exactly matches a listed destination.
df = df.loc[df['name'].isin(famous_destinations)]

print(f"Shape of the filtered dataframe is: {df.shape}")

In [None]:
# Persist the filtered places without the index column.
df.to_csv('places_v8.csv', index=False) 