# Namibia Tourism Sentiment Analysis

## 1. Import Necessary Libraries

In [None]:
import pandas as pd
import itertools
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from textblob import Word, TextBlob

import torch
from torch.nn.functional import softmax

import emoji
from cleantext import clean
import preprocessor as p  # tweet-preprocessor

from gensim.parsing.preprocessing import remove_stopwords

from transformers import RobertaTokenizer, RobertaForSequenceClassification

from tqdm import tqdm

# NLTK data download (only if not already downloaded)
def download_nltk_package(package_name: str, resource_path: str) -> None:
    """Ensure an NLTK resource is available, downloading it only when missing.

    Args:
        package_name: Identifier passed to ``nltk.download``.
        resource_path: Path passed to ``nltk.data.find`` to probe for the resource.
    """
    already_present = True
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # The probe could not locate the resource.
        already_present = False

    if not already_present:
        nltk.download(package_name)

# (package, probe path) pairs for every NLTK resource requested by this notebook.
for nltk_package, nltk_resource in (
    ('vader_lexicon', 'sentiment/vader_lexicon.zip'),
    ('wordnet', 'corpora/wordnet.zip'),
    ('omw-1.4', 'corpora/omw-1.4.zip'),
):
    download_nltk_package(nltk_package, nltk_resource)

# Single VADER analyzer instance reused by the sentiment cells.
sia = SentimentIntensityAnalyzer()

## 2. Data Loading and Initial Preparation

### 2.1. Load Dataset Function

In [None]:
def load_dataset(file_path):
    """Read a CSV file into a DataFrame, tolerating non-UTF-8 files.

    The file is first parsed as UTF-8; if that raises ``UnicodeDecodeError``
    it is parsed again as ISO-8859-1.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    try:
        return pd.read_csv(file_path, encoding='utf-8')
    except UnicodeDecodeError:
        # Not valid UTF-8: fall through to the single-byte encoding below.
        pass
    return pd.read_csv(file_path, encoding='ISO-8859-1')

### 2.2. Load and Preview Data

In [None]:
# Load the review CSV into the working DataFrame ``df`` (the file name suggests
# TripAdvisor reviews about Namibia — confirm against the data source).
df = load_dataset('namibia_reviews_tripadvisor.csv')
# Notebook preview of the first rows.
df.head()

## 3. Data Cleaning and Preprocessing

### 3.1. Column Management

In [None]:
# Discard the leading column (appears to be an ID or otherwise redundant column).
df = df.drop(columns=[df.columns[0]])

# Build a fresh 1-based review identifier and place it as the first column.
first_column = pd.Series(range(1, len(df) + 1), index=df.index, name='Id')
df = df.drop(columns='Id', errors='ignore')  # any pre-existing 'Id' is replaced
df.insert(0, 'Id', first_column)

### 3.2. Country Standardization

In [None]:
# Abbreviation -> full-name lookup used to normalise the ``tourist_country`` column.
# Whole-country aliases map to the bare country name; state/province codes map to
# "<Region>, <Country>" strings.
# NOTE(review): several two-letter keys (e.g. 'CA', 'DE', 'IN', 'ID', 'GA') are also
# ISO country codes — confirm the data never uses them in that sense.
_UK_ALIASES = ('UK', 'U.K.', 'GB')
_US_ALIASES = ('USA', 'U.S.A', 'US', 'DC', 'D.C.', 'CA (USA)')
_US_STATE_NAMES = {
    'CA': 'California',
    'NY': 'New York',
    'TX': 'Texas',
    'FL': 'Florida',
    'NJ': 'New Jersey',
    'PA': 'Pennsylvania',
    'IL': 'Illinois',
    'GA': 'Georgia',
    'OH': 'Ohio',
    'NC': 'North Carolina',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'AZ': 'Arizona',
    'IN': 'Indiana',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'OR': 'Oregon',
    'MD': 'Maryland',
    'VA': 'Virginia',
    'MA': 'Massachusetts',
    'CO': 'Colorado',
    'UT': 'Utah',
    'NV': 'Nevada',
    'MO': 'Missouri',
    'AL': 'Alabama',
    'AK': 'Alaska',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NM': 'New Mexico',
    'ND': 'North Dakota',
    'SD': 'South Dakota',
    'VT': 'Vermont',
    'WY': 'Wyoming',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'ME': 'Maine',
    'LA': 'Louisiana',
    'DE': 'Delaware',
    'AR': 'Arkansas',
    'MS': 'Mississippi',
    'OK': 'Oklahoma',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'TN': 'Tennessee',
    'CT': 'Connecticut',
}
_CANADA_PROVINCE_NAMES = {
    'AB': 'Alberta',
    'ON': 'Ontario',
    'BC': 'British Columbia',
}

country_mapping = {
    **dict.fromkeys(_UK_ALIASES, 'United Kingdom'),
    **dict.fromkeys(_US_ALIASES, 'United States'),
    **{code: f'{state}, United States' for code, state in _US_STATE_NAMES.items()},
    **{code: f'{province}, Canada' for code, province in _CANADA_PROVINCE_NAMES.items()},
    'NSW': 'New South Wales, Australia',
    'NZ': 'New Zealand',
    'UAE': 'United Arab Emirates',
    'DRC': 'Democratic Republic of the Congo',
    'SA': 'South Africa',
    'S A': 'South Africa',
    'TLV': 'Israel',
}

# Extract and standardize country names.
# 1) Keep only the text after the last comma ("City, Region, Country" -> "Country").
# 2) Expand known abbreviations through ``country_mapping`` (exact-match replace).
# 3) Some mapping values are themselves "Region, Country" strings
#    (e.g. 'CA' -> 'California, United States'). Taking the last comma-separated
#    part once more keeps ``tourist_country`` at country level; otherwise one
#    country is split across many region-level labels and per-country counts
#    are fragmented.
df['tourist_country'] = df['tourist_country'].str.split(',').str[-1].str.strip()
df['tourist_country'] = df['tourist_country'].replace(country_mapping)
df['tourist_country'] = df['tourist_country'].str.split(',').str[-1].str.strip()

### 3.3. Missing Value Handling

In [None]:
# Report per-column null counts, impute the tourist type, then report again.
print("Missing values before handling:")
print(df.isna().sum())

# Reviews without a tourist type receive the explicit label 'Unknown'.
df['tourist_type'] = df['tourist_type'].where(df['tourist_type'].notna(), other='Unknown')

print("\nMissing values after handling:")
print(df.isna().sum())

### 3.4. Date Processing

In [None]:
# Parse both date columns; values that do not match the day-month-year pattern
# (format '%d-%b-%y') are coerced to NaT instead of raising.
for date_col in ('visit_date', 'review_date'):
    df[date_col] = pd.to_datetime(df[date_col], format='%d-%b-%y', errors='coerce')

# Calendar features derived from the visit date.
visit_dt = df['visit_date'].dt

# Ordered categoricals so that sorting and plotting follow calendar order.
df['WeekDay'] = pd.Categorical(
    visit_dt.day_name(),
    categories=['Sunday', 'Monday', 'Tuesday', 'Wednesday',
                'Thursday', 'Friday', 'Saturday'],
    ordered=True,
)

df['Month'] = pd.Categorical(
    visit_dt.month_name(),
    categories=['January', 'February', 'March', 'April', 'May', 'June',
                'July', 'August', 'September', 'October', 'November', 'December'],
    ordered=True,
)

# Nullable integer dtype keeps the year integral where the date is missing.
df['Year'] = visit_dt.year.astype('Int64')

### 3.5. Text Preprocessing

In [None]:
def remove_emojis(text):
    """Return ``text`` with emojis replaced by the empty string (via ``emoji.replace_emoji``)."""
    without_emojis = emoji.replace_emoji(text, replace='')
    return without_emojis

def preprocess_reviews(row):
    """Return the row's review text with emojis and URLs removed.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing a ``'reviews'`` entry.

    Returns:
        The cleaned review as ``str``. A missing review (``None``/NaN) yields ``''``
        instead of raising, so one empty cell cannot abort a whole ``DataFrame.apply``.
    """
    text = row['reviews']
    if not isinstance(text, str):
        # Missing cells become the empty string; any other scalar is stringified.
        text = '' if pd.isna(text) else str(text)
    if not text:
        return ''
    text = remove_emojis(text)
    # 'http\S+' already covers 'https...' links, and the pattern has no anchors,
    # so neither a separate https branch nor re.MULTILINE is needed.
    return re.sub(r'http\S+|www\S+', '', text)

def clean_text(text):
    """Normalise review text for word-frequency analysis.

    Steps, in order: lower-case; delete digit runs; delete every character that
    is neither a word character nor whitespace (underscores survive because
    ``\\w`` includes them); collapse whitespace runs to single spaces and trim;
    finally remove stop words with gensim's ``remove_stopwords``.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    normalized = text.lower()
    substitutions = (
        (r'\d+', ''),       # digits
        (r'[^\w\s]', ''),   # punctuation
        (r'\s+', ' '),      # whitespace runs
    )
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return remove_stopwords(normalized.strip())

# Apply preprocessing
# Stage 1 runs row-wise because the helper reads the 'reviews' field of each row;
# stage 2 runs element-wise on the resulting text column.
df['PreprocessedReviews'] = df.apply(preprocess_reviews, axis='columns')
df['CleanedReviews'] = df['PreprocessedReviews'].map(clean_text)

### 3.6. Snapshot Cleaned Data

In [None]:
# Independent copy of the cleaned frame; nothing is written to disk here.
clean_df = df.copy()
# Notebook preview of the first rows.
clean_df.head()

## 4. Exploratory Data Analysis (EDA)

### 4.1. Tourist Demographics

In [None]:
# Ten most frequent values of 'tourist_country', as a two-column frame.
top_countries = (
    clean_df['tourist_country']
    .value_counts()
    .nlargest(10)
    .rename_axis('Country')
    .reset_index(name='ReviewCount')
)

# Horizontal bar chart: one bar per country, length = number of reviews.
plt.figure(figsize=(10, 6))
sns.barplot(data=top_countries, y='Country', x='ReviewCount', palette='plasma')
plt.title('Top 10 Countries Represented in Namibia Reviews')
plt.show()

### 4.2. Temporal Patterns

In [None]:
# Annual trends: number of reviews per visit year, in chronological order.
yearly_reviews = clean_df['Year'].value_counts().sort_index()
year_colors = sns.color_palette("husl", len(yearly_reviews))  # one colour per bar

plt.figure(figsize=(10, 6))
yearly_reviews.plot.bar(color=year_colors)
plt.title('Distribution of Reviews Over the Years')
plt.show()

# Monthly and weekday patterns
def plot_visits_by_day_and_month(df):
    """Show side-by-side count plots of rows per weekday and per month.

    Args:
        df: DataFrame providing the ``WeekDay`` and ``Month`` columns.
    """
    weekday_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday',
                     'Thursday', 'Friday', 'Saturday']
    month_order = ['January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November', 'December']

    fig, (weekday_ax, month_ax) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: weekdays, in the fixed order above.
    sns.countplot(y='WeekDay', data=df, order=weekday_order,
                  palette='coolwarm', ax=weekday_ax)
    weekday_ax.set_title('Visits by Day of Week')

    # Right panel: months, in calendar order.
    sns.countplot(y='Month', data=df, order=month_order,
                  palette='husl', ax=month_ax)
    month_ax.set_title('Visits by Month')

    plt.tight_layout()
    plt.show()


plot_visits_by_day_and_month(clean_df)

### 4.3. Tourist Types Analysis

In [None]:
# Share of reviews per tourist type, in percent.
# Reads from ``clean_df`` like the other EDA cells in this section, so all plots
# describe the same frame.
plt.figure(figsize=(10, 6))
type_counts = clean_df['tourist_type'].value_counts(normalize=True) * 100
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
type_counts.plot(kind='bar', color=colors, width=0.7)
plt.title('Frequency of Tourist Types')
plt.ylabel('Percentage (%)')
plt.show()

### 4.4. Popular Destinations

In [None]:
# Pie chart of the five most frequently reviewed places.
top5_places = clean_df['reviewed_place'].value_counts().nlargest(5)
pie_colors = sns.color_palette("coolwarm", 5)

plt.figure(figsize=(8, 8))
top5_places.plot(kind='pie', autopct='%1.1f%%', startangle=140, colors=pie_colors)
plt.title('Top 5 Most Reviewed Tourist Places')
plt.show()

## 5. Sentiment Analysis

### 5.1. VADER Sentiment Analysis

In [None]:
# Get polarity scores
# VADER is run on the emoji/URL-stripped text; results are keyed by review Id.
res = {}
# The progress total comes from the frame actually being iterated.
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = row['PreprocessedReviews']
    myid = row['Id']
    res[myid] = sia.polarity_scores(text)

# Merge results
# One row per Id holding the VADER score columns (including 'compound'), then the
# review data is attached. The join key is explicit, so a score column that
# happens to share a name with a review column cannot silently join on it.
clean_df = pd.DataFrame.from_dict(res, orient='index')
clean_df = clean_df.reset_index().rename(columns={'index': 'Id'})
clean_df = clean_df.merge(df, on='Id', how='left')

# Classify sentiments
def GetAnalysis(score, threshold=0.0):
    """Map a numeric sentiment score to a label.

    Args:
        score: Numeric score (used here with VADER's ``compound`` value).
        threshold: Half-width of the neutral band around zero. Scores strictly
            below ``-threshold`` are negative, scores strictly above
            ``threshold`` are positive, everything in between is neutral.
            The default ``0.0`` reproduces the original behaviour (only an
            exact zero is neutral). NOTE(review): the VADER authors'
            documentation suggests a band of about 0.05 — confirm before
            changing the default.

    Returns:
        ``"Negative"``, ``"Neutral"`` or ``"Positive"``.
    """
    if score < -threshold:
        return "Negative"
    if score <= threshold:
        return "Neutral"
    return "Positive"

# Label each review from its VADER compound score.
clean_df['VaderAnalysis'] = clean_df['compound'].map(GetAnalysis)

# Plot sentiment distribution: one bar per label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=clean_df, x='VaderAnalysis', palette='Set2', ax=ax)
ax.set_title('VADER Sentiment Distribution')
plt.show()

### 5.2. RoBERTa Sentiment Analysis

In [None]:
# Load RoBERTa model
# The Auto* factories are imported here because the notebook's top-level import
# cell only brings in the Roberta* classes, which left these two names undefined
# (NameError on first run).
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Sentiment scoring function
def polarity_scores_roberta(text):
    """Score ``text`` with the module-level RoBERTa ``tokenizer`` and ``model``.

    Args:
        text: Review text. Input is truncated to at most 512 tokens.

    Returns:
        Dict with keys ``'roberta_neg'``, ``'roberta_neu'`` and ``'roberta_pos'``
        holding the softmax probabilities (Python floats) of output classes
        0, 1 and 2 respectively.
    """
    encoded_text = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output = model(**encoded_text)
    logits = output[0][0]  # first output = logits, first (only) item of the batch
    # ``softmax`` is torch.nn.functional.softmax: it needs a tensor and an explicit
    # dimension. The previous code passed a NumPy array, which fails on every call.
    probs = softmax(logits, dim=-1).tolist()
    return {
        'roberta_neg': probs[0],
        'roberta_neu': probs[1],
        'roberta_pos': probs[2]
    }

# Apply RoBERTa to dataset
# Every review is scored with both VADER and RoBERTa. Scoring is best-effort per
# row: a failing review is reported and skipped instead of aborting the run.
res = {}
failed_ids = []
for i, row in tqdm(clean_df.iterrows(), total=len(clean_df)):
    # Row lookups stay outside the try block so the handler can always name the Id.
    myid = row['Id']
    text = row['PreprocessedReviews']
    try:
        vader_results = sia.polarity_scores(text)
        vader_scores = {f"vader_{k}": v for k, v in vader_results.items()}
        roberta_scores = polarity_scores_roberta(text)
    except Exception as e:
        failed_ids.append(myid)
        print(f"Failed for id{myid}: {e}")
    else:
        res[myid] = {**vader_scores, **roberta_scores}

# Make the extent of the failures visible; per-row messages alone scroll past.
if failed_ids:
    print(f"Scoring failed for {len(failed_ids)} of {len(clean_df)} reviews.")
# If nothing at all could be scored the problem is systematic (e.g. a broken
# model setup); stop here with a clear message rather than failing later on
# missing 'roberta_*' columns.
if len(clean_df) and not res:
    raise RuntimeError("RoBERTa scoring failed for every review; see messages above.")

# Create results DataFrame: one row per scored Id (score columns first), with the
# columns of ``clean_df`` attached through a left join on 'Id'.
results_df = (
    pd.DataFrame.from_dict(res, orient='index')
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(clean_df, on='Id', how='left')
)

# Classify RoBERTa sentiments
def GetRobertaAnalysis(row):
    """Return the label whose RoBERTa score is highest in ``row``.

    Args:
        row: Mapping-like object with ``'roberta_neg'``, ``'roberta_neu'`` and
            ``'roberta_pos'`` entries.

    Returns:
        ``'Negative'``, ``'Neutral'`` or ``'Positive'``; on a tie the earliest of
        these (in that order) is returned, following ``np.argmax``.
    """
    column_to_label = (
        ('roberta_neg', 'Negative'),
        ('roberta_neu', 'Neutral'),
        ('roberta_pos', 'Positive'),
    )
    scores = [row[column] for column, _ in column_to_label]
    best = np.argmax(scores)
    return column_to_label[best][1]

# Row-wise labelling: each row's three RoBERTa scores decide its label.
results_df['RoBERTaAnalysis'] = results_df.apply(GetRobertaAnalysis, axis='columns')

### 5.3. Model Comparison

In [None]:
# Plot comparison: label counts from VADER (left) and RoBERTa (right).
plt.figure(figsize=(16, 5))
plt.subplot(1, 2, 1)
ax1 = sns.countplot(y="VaderAnalysis", data=results_df, palette={"Positive": "#2ecc71", "Negative": "#e74c3c", "Neutral": "#f1c40f"})
plt.title("VADER Sentiment Distribution")

plt.subplot(1, 2, 2)
# The RoBERTa labels are stored in 'RoBERTaAnalysis'; the previously referenced
# 'Rob_Analysis' column is never created, so this plot failed.
ax2 = sns.countplot(y="RoBERTaAnalysis", data=results_df, palette={"Positive": "#27ae60", "Negative": "#c0392b", "Neutral": "#f39c12"})
plt.title("RoBERTa Sentiment Distribution")
plt.tight_layout()
plt.show()

## 6. Advanced Analysis

### 6.1. Temporal Sentiment Trends

In [None]:
# Monthly sentiment trends
# Full month name -> three-letter abbreviation (the first three letters of each name).
full_month_names = ['January', 'February', 'March', 'April', 'May', 'June',
                    'July', 'August', 'September', 'October', 'November', 'December']
month_abbr = {name: name[:3] for name in full_month_names}
month_order = list(month_abbr.values())  # 'Jan' ... 'Dec' in calendar order

results_df['ShortMonth'] = results_df['Month'].map(month_abbr)

# Number of reviews per (month, RoBERTa label) pair.
monthly_sent = (
    results_df
    .groupby(['ShortMonth', 'RoBERTaAnalysis'])
    .size()
    .reset_index(name='count')
)
# Ordered categorical so the x-axis follows the calendar rather than the alphabet.
monthly_sent['ShortMonth'] = pd.Categorical(
    monthly_sent['ShortMonth'], categories=month_order, ordered=True
)

plt.figure(figsize=(10, 5))
sns.lineplot(data=monthly_sent, x='ShortMonth', y='count', hue='RoBERTaAnalysis', marker='o')
plt.title('Monthly Sentiment Trend (RoBERTa)')
plt.show()

### 6.2. COVID Impact Analysis

In [None]:
# Reviews with a visit year of 2019 or later (the filter includes 2019, the year
# before the marker line drawn below).
post_covid_df = results_df[results_df['Year'] >= 2019]
# Rows = year, columns = RoBERTa label, values = review counts (0 where absent).
post_covid_sentiment = post_covid_df.groupby(['Year', 'RoBERTaAnalysis']).size().unstack().fillna(0)

ax = post_covid_sentiment.plot(figsize=(10, 6), marker='o')
ax.axvline(x=2020, color='red', linestyle='--', label='COVID-19 Onset')
# The legend is created by the plot call above, before the vertical line exists;
# rebuild it so the 'COVID-19 Onset' label is actually displayed.
ax.legend(title='RoBERTaAnalysis')
ax.set_title('Post-COVID Sentiment Trends')
plt.show()

### 6.3. Word Clouds by Sentiment

In [None]:
from wordcloud import WordCloud

def get_wordcloud(text, title, color='viridis'):
    """Render a word cloud for a collection of strings.

    Args:
        text: Iterable of strings; they are joined with spaces into one corpus.
        title: Title shown above the figure.
        color: Matplotlib colormap name passed to ``WordCloud``.

    Returns:
        ``None``. When the joined corpus is empty or whitespace-only, a message
        is printed and nothing is plotted, because ``WordCloud.generate`` raises
        ``ValueError`` when it has no words to draw.
    """
    corpus = " ".join(text)
    if not corpus.strip():
        print(f"No text available for word cloud: {title}")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white', colormap=color).generate(corpus)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.tight_layout()
    plt.show()

# One word cloud per RoBERTa label, built from the stop-word-free review text.
for sentiment in ['Positive', 'Neutral', 'Negative']:
    has_sentiment = results_df['RoBERTaAnalysis'] == sentiment
    filtered = results_df.loc[has_sentiment, 'CleanedReviews']
    get_wordcloud(filtered, f'Most Frequent Words in {sentiment} Reviews')

## 7. Model Evaluation

### 7.1. Classification Reports

In [None]:
from sklearn.metrics import classification_report

# This cell compares the two models' labels with each other; it uses no
# ground-truth sentiment column. The figures below therefore measure agreement
# between the models, not accuracy. The former "VADER Evaluation" compared
# VADER's labels with themselves, which is perfect by construction, so it was
# dropped.
sentiment_labels = ['Negative', 'Neutral', 'Positive']

print("RoBERTa vs. VADER agreement (VADER labels used as reference):")
print(classification_report(
    results_df['VaderAnalysis'],
    results_df['RoBERTaAnalysis'],
    labels=sentiment_labels,
    zero_division=0,  # a label one model never emits scores 0 instead of warning
))

### 7.2. Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Both panels cross-tabulate VADER labels (rows) against RoBERTa labels (columns).
# The former left panel compared VADER with itself, which is diagonal by
# construction; it now shows raw counts and the right panel shows the same
# table normalised per VADER class.
sentiment_labels = ['Negative', 'Neutral', 'Positive']

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

agreement_counts = confusion_matrix(results_df['VaderAnalysis'], results_df['RoBERTaAnalysis'],
                                    labels=sentiment_labels)
agreement_rates = confusion_matrix(results_df['VaderAnalysis'], results_df['RoBERTaAnalysis'],
                                   labels=sentiment_labels, normalize='true')

panels = (
    (agreement_counts, 'Blues', 'd', "VADER vs. RoBERTa Agreement (counts)"),
    (agreement_rates, 'Oranges', '.2f', "VADER vs. RoBERTa Agreement (share of each VADER class)"),
)
for ax, (matrix, cmap, value_format, panel_title) in zip(axes, panels):
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=sentiment_labels)
    display.plot(ax=ax, cmap=cmap, values_format=value_format)
    # Replace the default "True/Predicted label" captions: neither model is ground truth.
    ax.set_ylabel('VADER label')
    ax.set_xlabel('RoBERTa label')
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 8. Save Results

In [None]:
# Write the combined per-review results to CSV, without the row index.
output_path = 'namibia_tourism_sentiment_analysis_results.csv'
results_df.to_csv(output_path, index=False)
print("Analysis results saved successfully!")