In [16]:
import pandas as pd
from pathlib import Path

# Adjust path to current directory if files are not in a 'data' subfolder
DATA_DIR = Path(".")

# Load files
survey = pd.read_csv(DATA_DIR / "FinalYearProject_Survey.csv", low_memory=False)
tripadvisor = pd.read_csv(DATA_DIR / "tripadvisor_hotel_reviews.csv", low_memory=False)

# FIX 1: Add encoding='latin1' to handle special characters
reviews = pd.read_csv(DATA_DIR / "Reviews.csv", low_memory=False, encoding='latin1')

travel_agents = pd.read_csv(DATA_DIR / "sri_lanka_travel_agents.csv", low_memory=False)
tourist_shops = pd.read_csv(DATA_DIR / "sri_lanka_tourist_shops.csv", low_memory=False)

# FIX 2: Correct filename (matches your uploaded file) - Corrected typo from 'ri_lanka' to 'sri_lanka'
aggregated = pd.read_csv(DATA_DIR / "sri_lanka_aggregated_2020_2022.csv", low_memory=False)

enriched40 = pd.read_csv(DATA_DIR / "Tourism_dataset.csv", low_memory=False)

# Quick survey counts
total_responses = len(survey)

# FIX 3: Use the exact column name from the CSV
country_col = '2. What is your country or nationality?'

domestic = survey[survey[country_col].str.contains("Sri Lanka|Sri Lankan|Srilanka|Sinhal", case=False, na=False)]
international = survey[~survey.index.isin(domestic.index)]

print("Total responses:", total_responses)
print("Domestic responses:", len(domestic))
print("International responses:", len(international))

# Save summary
summary = {
    "total_responses": total_responses,
    "domestic": len(domestic),
    "international": len(international)
}
pd.Series(summary).to_csv(DATA_DIR / "survey_summary_counts.csv")

Total responses: 101
Domestic responses: 61
International responses: 40


In [21]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path(".")

# FIX 1: Add encoding='latin1' to avoid UnicodeDecodeError in Reviews.csv
reviews = pd.read_csv(DATA_DIR / "Reviews.csv", low_memory=False, encoding='latin1')
tripadvisor = pd.read_csv(DATA_DIR / "tripadvisor_hotel_reviews.csv", low_memory=False)
enriched40 = pd.read_csv(DATA_DIR / "Tourism_dataset.csv", low_memory=False)

def standardize(df, name_cols):
    # Attempt to pick a column for place name
    for c in name_cols:
        if c in df.columns:
            df = df.rename(columns={c: "place_name"})
            break

    # Find lat/lon columns
    lat_cols = [c for c in df.columns if "lat" in c.lower()]
    lon_cols = [c for c in df.columns if "lon" in c.lower() or "lng" in c.lower()]

    if lat_cols:
        df = df.rename(columns={lat_cols[0]: "latitude"})
    if lon_cols:
        df = df.rename(columns={lon_cols[0]: "longitude"})
    return df

# FIX 2: Added correct column names based on your files
tripadvisor = standardize(tripadvisor, name_cols=["Destination", "name", "place", "hotel_name"])
reviews = standardize(reviews, name_cols=["Location_Name", "Location", "place", "location", "name"])
enriched40 = standardize(enriched40, name_cols=["name", "place_name"])

# Quick fix: trim place names
for df in (tripadvisor, reviews, enriched40):
    if "place_name" in df.columns:
        df["place_name"] = df["place_name"].astype(str).str.strip().str.replace(r"\s+", " ", regex=True).str.lower()

print(tripadvisor.head(3))

                place_name District      Timespan  \
0  attidiya bird sanctuary  colombo  3 months ago   
1  attidiya bird sanctuary  colombo  3 months ago   
2  attidiya bird sanctuary  colombo   5 years ago   

                                              Review  
0  spots scenic make ideal dwelling birds creatur...  
1      good place birdwatching different type around  
2  calm peaceful location visit time got separate...  


In [22]:
import pandas as pd
from pathlib import Path
import difflib

DATA_DIR = Path(".")

# 1. Load files
tripadvisor = pd.read_csv(DATA_DIR / "tripadvisor_hotel_reviews.csv", low_memory=False)
enriched40 = pd.read_csv(DATA_DIR / "Tourism_dataset.csv", low_memory=False)
reviews = pd.read_csv(DATA_DIR / "Reviews.csv", low_memory=False, encoding='latin1')

# 2. Standardize column names (ensure 'place_name' exists)
def standardize(df, name_cols):
    for c in name_cols:
        if c in df.columns:
            df = df.rename(columns={c: "place_name"})
            return df
    return df

tripadvisor = standardize(tripadvisor, ["Destination", "name", "place", "hotel_name"])
enriched40 = standardize(enriched40, ["name", "place_name"])
reviews = standardize(reviews, ["Location_Name", "Location", "place", "location", "name"])

# 3. Clean strings (lowercase, strip)
def clean_names(df, name_col="place_name"):
    if name_col in df.columns:
        df[name_col] = df[name_col].astype(str).str.strip().str.lower()
    return df

tripadvisor = clean_names(tripadvisor)
enriched40 = clean_names(enriched40)
reviews = clean_names(reviews)

# 4. Merge Logic
canonical_names = enriched40["place_name"].dropna().unique().tolist()

# Exact Match
tripadvisor_exact = tripadvisor.merge(enriched40, on="place_name", how="left", suffixes=("", "_enr"))

# Identify missing matches
missing_mask = tripadvisor_exact["category"].isna()
print(f"Records missing exact match: {missing_mask.sum()}")

# Fuzzy Match Function using difflib
def get_best_match(name, choices, cutoff=0.75):
    matches = difflib.get_close_matches(name, choices, n=1, cutoff=cutoff)
    return matches[0] if matches else None

# Apply fuzzy matching
updates = []
for idx, row in tripadvisor_exact[missing_mask].iterrows():
    name = row["place_name"]
    if pd.isna(name) or name == "":
        continue

    match = get_best_match(name, canonical_names, cutoff=0.75)
    if match:
        updates.append((idx, match))

print(f"Found {len(updates)} fuzzy matches.")

# Update the DataFrame with fuzzy results
for idx, match_name in updates:
    enr_row = enriched40[enriched40["place_name"] == match_name].iloc[0]
    for col in enriched40.columns:
        tripadvisor_exact.at[idx, col] = enr_row[col]

# Save result
output_file = DATA_DIR / "tripadvisor_enriched_merged.csv"
tripadvisor_exact.to_csv(output_file, index=False)
print(f"Merged tripadvisor saved to: {output_file}")

Records missing exact match: 33219
Found 1507 fuzzy matches.
Merged tripadvisor saved to: tripadvisor_enriched_merged.csv


In [23]:
import pandas as pd
from pathlib import Path
from textblob import TextBlob
import difflib

DATA_DIR = Path(".")

# 1. Load Data
# Canonical list (enriched40)
enriched40 = pd.read_csv(DATA_DIR / "Tourism_dataset.csv", low_memory=False)

# TripAdvisor (merged in step 3) - Has Text, No Rating
tripadvisor_enriched = pd.read_csv(DATA_DIR / "tripadvisor_enriched_merged.csv", low_memory=False)

# Reviews (raw) - Has Ratings
reviews = pd.read_csv(DATA_DIR / "Reviews.csv", low_memory=False, encoding='latin1')

# 2. Standardize Reviews.csv to match canonical place_names
if "Location_Name" in reviews.columns:
    reviews = reviews.rename(columns={"Location_Name": "place_name"})

# Standardize strings
reviews["place_name"] = reviews["place_name"].astype(str).str.strip().str.lower()
enriched40["place_name"] = enriched40["name"].astype(str).str.strip().str.lower()

# Fuzzy Match Reviews to Canonical List
canonical_names = enriched40["place_name"].dropna().unique().tolist()

# Exact match helper
reviews = reviews.merge(enriched40[["place_name"]], on="place_name", how="left", indicator="matched")

# Find unmatched and try fuzzy matching
unmatched_mask = reviews["matched"] == "left_only"
unmatched_names = reviews.loc[unmatched_mask, "place_name"].unique()
match_map = {}

for name in unmatched_names:
    matches = difflib.get_close_matches(name, canonical_names, n=1, cutoff=0.75)
    if matches:
        match_map[name] = matches[0]

# Apply corrections
reviews.loc[unmatched_mask, "place_name"] = reviews.loc[unmatched_mask, "place_name"].map(match_map).fillna(reviews.loc[unmatched_mask, "place_name"])

# 3. Aggregate Ratings (from Reviews.csv)
agg_reviews = reviews.groupby("place_name").agg(
    avg_rating=("Rating", "mean"),
    review_count=("Rating", "count")
).reset_index()

# 4. Aggregate Sentiment (from TripAdvisor)
def get_sentiment(text):
    try:
        return TextBlob(str(text)).sentiment.polarity
    except:
        return 0.0

if "Review" in tripadvisor_enriched.columns:
    tripadvisor_enriched["sentiment"] = tripadvisor_enriched["Review"].apply(get_sentiment)
    agg_sentiment = tripadvisor_enriched.groupby("place_name").agg(
        avg_sentiment=("sentiment", "mean")
    ).reset_index()
else:
    agg_sentiment = pd.DataFrame(columns=["place_name", "avg_sentiment"])

# 5. Merge everything into Master
master = enriched40.copy()
master = master.merge(agg_reviews, on="place_name", how="left")
master = master.merge(agg_sentiment, on="place_name", how="left")

# Fill NaNs
global_avg_rating = agg_reviews["avg_rating"].mean() if not agg_reviews.empty else 0
master["avg_rating"] = master["avg_rating"].fillna(global_avg_rating)
master["review_count"] = master["review_count"].fillna(0)
master["avg_sentiment"] = master["avg_sentiment"].fillna(0.0)

# 6. Compute Empowerment Signal
max_vendor = master["vendor_count"].max()
if pd.isna(max_vendor) or max_vendor == 0: max_vendor = 1

max_reviews = master["review_count"].max()
if pd.isna(max_reviews) or max_reviews == 0: max_reviews = 1

master["vendor_norm"] = master["vendor_count"] / max_vendor
master["reviews_norm"] = master["review_count"] / max_reviews
master["local_empowerment_index"] = pd.to_numeric(master["local_empowerment_index"], errors='coerce').fillna(0)

# Final Formula
master["local_empowerment_signal"] = (
    0.4 * master["vendor_norm"] +
    0.4 * master["reviews_norm"] +
    0.2 * (master["local_empowerment_index"] / 10.0)
)

# 7. Save
master.to_csv(DATA_DIR / "master_enriched_dataset.csv", index=False)
print("Master enriched dataset saved:", DATA_DIR / "master_enriched_dataset.csv")
print(master[["place_name", "avg_rating", "local_empowerment_signal"]].head())

Master enriched dataset saved: master_enriched_dataset.csv
                    place_name  avg_rating  local_empowerment_signal
0       sigiriya rock fortress    4.645614                  0.634872
1                    ella rock    4.177612                  0.260000
2                  adam's peak    4.177612                  0.300000
3  horton plains national park    4.326425                  0.331966
4  temple of the tooth (kandy)    4.177612                  0.500000


In [26]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import joblib
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

DATA_DIR = Path(".")
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(exist_ok=True)

# Load data
df = pd.read_csv(DATA_DIR / "master_enriched_dataset.csv")

# Clean Target: category
df = df.dropna(subset=["category"])
df["category"] = df["category"].str.lower().str.strip()

# FIX: Correct typo if present (found in inspection: "hPeradeniya eco")
df["category"] = df["category"].replace({"hperadeniya eco": "eco"})

# Features
features = [
    "eco_score", "cultural_score", "vendor_count", "local_empowerment_index",
    "accessibility_score", "danger_level", "avg_rating", "review_count", "avg_sentiment"
]

# Convert crowd_level to numeric
df["crowd_level_num"] = df["crowd_level"].map({"low":1, "medium":2, "high":3})
# Fill missing crowd_levels with median (2)
df["crowd_level_num"] = df["crowd_level_num"].fillna(2)

features.append("crowd_level_num")

# Fill missing numeric features
for f in features:
    df[f] = pd.to_numeric(df[f], errors="coerce").fillna(df[f].median())

X = df[features]
le = LabelEncoder()
y = le.fit_transform(df["category"])
print("Classes identified:", le.classes_)

# Train-test split
# Using stratify=y ensures we keep the same proportion of classes in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Scale numerics
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Model 1: Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
# Reduced cv to 3 for small dataset
rf_scores = cross_val_score(rf, X_train_s, y_train, cv=StratifiedKFold(3), scoring="accuracy")
print(f"RF CV accuracy mean: {rf_scores.mean():.3f}")

rf.fit(X_train_s, y_train)
rf_test_acc = rf.score(X_test_s, y_test)
print(f"RF test accuracy: {rf_test_acc:.3f}")

# Model 2: Logistic Regression
lr = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs', random_state=42)
lr_scores = cross_val_score(lr, X_train_s, y_train, cv=StratifiedKFold(3), scoring="accuracy")
print(f"LR CV accuracy mean: {lr_scores.mean():.3f}")

lr.fit(X_train_s, y_train)
lr_test_acc = lr.score(X_test_s, y_test)
print(f"LR test accuracy: {lr_test_acc:.3f}")

# Choose best
if rf_test_acc >= lr_test_acc:
    best_model = rf
    best_name = "random_forest"
    best_acc = rf_test_acc
else:
    best_model = lr
    best_name = "logistic_regression"
    best_acc = lr_test_acc

# Save model + preprocessing
joblib.dump(best_model, MODEL_DIR / "best_model.pkl")
joblib.dump(scaler, MODEL_DIR / "scaler.pkl")
joblib.dump(le, MODEL_DIR / "label_encoder.pkl")
print(f"Saved best model: {best_name} with test acc {best_acc:.3f}")

Classes identified: ['cultural' 'eco' 'mixed']
RF CV accuracy mean: 0.876
RF test accuracy: 0.875
LR CV accuracy mean: 0.939
LR test accuracy: 0.750
Saved best model: random_forest with test acc 0.875


In [25]:
import pandas as pd
from pathlib import Path

# Adjust path to current directory
DATA_DIR = Path(".")

# Use the actual filename present in your environment
fn = DATA_DIR / "Tourism_dataset.csv"

try:
    df = pd.read_csv(fn, low_memory=False)

    # Add missing columns with sensible defaults
    defaults = {
        "eco_score": 5,
        "cultural_score": 5,
        "vendor_count": 5,
        "local_empowerment_index": 5,
        "accessibility_score": 5,
        "crowd_level": "medium",
        "danger_level": 3
    }

    added_cols = []
    for col, val in defaults.items():
        if col not in df.columns:
            df[col] = val
            added_cols.append(col)

    # Save only if changes were made or to ensure format
    df.to_csv(fn, index=False)

    if added_cols:
        print(f"Added columns: {added_cols}")
    else:
        print("All columns already present. File verified.")

except FileNotFoundError:
    print(f"Error: Could not find file {fn}")

All columns already present. File verified.
