In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import json
from datetime import datetime

print("="*60)
print("TOURISM RECOMMENDATION SYSTEM - MODEL TRAINING")
print("="*60)

TOURISM RECOMMENDATION SYSTEM - MODEL TRAINING


In [12]:
print("\n[STEP 1] Loading dataset...")
places = pd.read_csv('clean_place_for_ml.csv', encoding='latin1')

print(f"✓ Dataset loaded successfully!")
print(f"  - Total places: {len(places)}")
print(f"  - Columns: {list(places.columns)}")
print(f"\nDataset Preview:")
print(places.head())

print("\nDataset Statistics:")
print(places.describe())

print("\nMissing Values:")
print(places.isnull().sum())


[STEP 1] Loading dataset...
✓ Dataset loaded successfully!
  - Total places: 3987
  - Columns: ['name', 'province_id', 'province_name', 'category_id', 'category_name', 'ratings', 'reviews_count']

Dataset Preview:
                          name  province_id province_name  category_id  \
0     Royal Palace of Cambodia            1    Phnom Penh            1   
1   Tuol Sleng Genocide Museum            1    Phnom Penh            1   
2          Wat Phnom Daun Penh            1    Phnom Penh            1   
3  National Museum of Cambodia            1    Phnom Penh            1   
4        Independence Monument            1    Phnom Penh            1   

        category_name  ratings  reviews_count  
0  Tourist Attraction      4.3        12506.0  
1  Tourist Attraction      4.6        11655.0  
2  Tourist Attraction      4.4         8646.0  
3  Tourist Attraction      4.1         6478.0  
4  Tourist Attraction      4.5         4894.0  

Dataset Statistics:
       province_id  category_id

In [13]:
print("\n[STEP 2] Preprocessing data...")

places_original = places.copy()
place_df = places.copy()

columns_to_drop = ['category_name', 'province_name', 'province_id']
place_df = place_df.drop(columns_to_drop, axis=1)
place_df = place_df.set_index('name')

print(f"✓ Preprocessed data shape: {place_df.shape}")
print(f"  Features used: {list(place_df.columns)}")
print("\nPreprocessed Data Sample:")
print(place_df.head())


[STEP 2] Preprocessing data...
✓ Preprocessed data shape: (3987, 3)
  Features used: ['category_id', 'ratings', 'reviews_count']

Preprocessed Data Sample:
                             category_id  ratings  reviews_count
name                                                            
Royal Palace of Cambodia               1      4.3        12506.0
Tuol Sleng Genocide Museum             1      4.6        11655.0
Wat Phnom Daun Penh                    1      4.4         8646.0
National Museum of Cambodia            1      4.1         6478.0
Independence Monument                  1      4.5         4894.0


In [14]:
print("\n[STEP 3] Feature scaling...")

place_scaled = place_df.copy()
numeric_cols = place_scaled.select_dtypes(include=['number']).columns

scaler = MinMaxScaler()
place_scaled[numeric_cols] = scaler.fit_transform(place_scaled[numeric_cols])

print(f"✓ Features scaled to [0, 1] range")
print("\nScaled Data Sample:")
print(place_scaled.head())

print("\nScaling Parameters:")
for col in numeric_cols:
    min_val = place_df[col].min()
    max_val = place_df[col].max()
    print(f"  {col}: min={min_val:.2f}, max={max_val:.2f}")


[STEP 3] Feature scaling...
✓ Features scaled to [0, 1] range

Scaled Data Sample:
                             category_id  ratings  reviews_count
name                                                            
Royal Palace of Cambodia             0.0     0.86       0.298188
Tuol Sleng Genocide Museum           0.0     0.92       0.277897
Wat Phnom Daun Penh                  0.0     0.88       0.206152
National Museum of Cambodia          0.0     0.82       0.154459
Independence Monument                0.0     0.90       0.116691

Scaling Parameters:
  category_id: min=1.00, max=7.00
  ratings: min=0.00, max=5.00
  reviews_count: min=0.00, max=41940.00


In [15]:
print("\n[STEP 4] Computing cosine similarity matrix (training the model)...")

cosine_sim_matrix = cosine_similarity(place_scaled)

print(f"✓ Similarity matrix computed!")
print(f"  Matrix shape: {cosine_sim_matrix.shape}")
print(f"  Total similarity scores: {cosine_sim_matrix.size:,}")

cosine_df = pd.DataFrame(
    cosine_sim_matrix, 
    index=place_scaled.index, 
    columns=place_scaled.index
)

print("\nSimilarity Matrix Sample:")
print(cosine_df.iloc[:5, :5])


[STEP 4] Computing cosine similarity matrix (training the model)...
✓ Similarity matrix computed!
  Matrix shape: (3987, 3987)
  Total similarity scores: 15,896,169

Similarity Matrix Sample:
name                         Royal Palace of Cambodia  \
name                                                    
Royal Palace of Cambodia                     1.000000   
Tuol Sleng Genocide Museum                   0.999184   
Wat Phnom Daun Penh                          0.994634   
National Museum of Cambodia                  0.989130   
Independence Monument                        0.979097   

name                         Tuol Sleng Genocide Museum  Wat Phnom Daun Penh  \
name                                                                           
Royal Palace of Cambodia                       0.999184             0.994634   
Tuol Sleng Genocide Museum                     1.000000             0.998001   
Wat Phnom Daun Penh                            0.998001             1.000000   
Nationa

In [16]:
print("\n[STEP 5] Validating model...")

test_place = 'Royal Palace of Cambodia'
similar_places = cosine_df[test_place].sort_values(ascending=False)[1:6]

print(f"\nTest Query: Top 5 similar places to '{test_place}':")
for idx, (place_name, score) in enumerate(similar_places.items(), 1):
    print(f"  {idx}. {place_name} (similarity: {score:.4f})")


[STEP 5] Validating model...

Test Query: Top 5 similar places to 'Royal Palace of Cambodia':
  1. Bayon Temple (similarity: 1.0000)
  2. Bayon Temple (similarity: 1.0000)
  3. Phnom Penh Night Market (similarity: 0.9999)
  4. Phnom Penh Night Market (similarity: 0.9999)
  5. Ta Prohm Temple (similarity: 0.9998)


In [22]:
print("\n[STEP 6] Creating recommendation function...")


def get_recommendations(place_name, top_n=10, min_similarity=0.0):
    """Get top N recommendations for a given place"""
    if place_name not in cosine_df.index:
        return f"Place '{place_name}' not found in dataset"

    # Get similarity scores as a Series using .loc to access row (avoids duplicate column issues)
    # Handle duplicate place names by taking first match if index has duplicates
    matching_indices = cosine_df.index[cosine_df.index == place_name]
    if len(matching_indices) > 0:
        sim_scores = cosine_df.loc[matching_indices[0], :]
    else:
        return f"Place '{place_name}' not found in dataset"
    
    # Ensure we have a Series (should already be one, but be safe)
    if isinstance(sim_scores, pd.DataFrame):
        sim_scores = sim_scores.iloc[0]  # Get first row if DataFrame
    
    sim_scores = sim_scores.sort_values(ascending=False)
    sim_scores = sim_scores[sim_scores > min_similarity]
    sim_scores = sim_scores[sim_scores.index != place_name]
    top_places = sim_scores.head(top_n)

    recommendations = places_original[places_original['name'].isin(top_places.index)].copy()
    top_places_unique = top_places[~top_places.index.duplicated(keep='first')]
    recommendations['similarity_score'] = recommendations['name'].map(top_places_unique)
    recommendations = recommendations.sort_values('similarity_score', ascending=False)

    return recommendations[['name', 'province_name', 'category_name', 'ratings', 
                           'reviews_count', 'similarity_score']]

# Test the function
print("\nTesting recommendation function:")
test_recs = get_recommendations('Wat Phnom', top_n=5)
print(test_recs)


[STEP 6] Creating recommendation function...

Testing recommendation function:
Place 'Wat Phnom' not found in dataset


In [18]:
print("\n[STEP 7] Saving trained model and artifacts...")

model_package = {
    'similarity_matrix': cosine_df,
    'places_data': places_original,
    'feature_data': place_scaled,
    'scaler': scaler,
    'feature_columns': list(numeric_cols),
    'metadata': {
        'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'n_places': len(places),
        'features_used': list(numeric_cols),
        'model_type': 'Content-Based Filtering (Cosine Similarity)'
    }
}

with open('cbf_model.pkl', 'wb') as f:
    pickle.dump(model_package, f)

cosine_df.to_csv('similarity_matrix.csv')

with open('model_metadata.json', 'w') as f:
    json.dump(model_package['metadata'], f, indent=2)

print(f"✓ Model saved as 'cbf_model.pkl'")
print(f"✓ Similarity matrix saved as 'similarity_matrix.csv'")
print(f"✓ Metadata saved as 'model_metadata.json'")


[STEP 7] Saving trained model and artifacts...


✓ Model saved as 'cbf_model.pkl'
✓ Similarity matrix saved as 'similarity_matrix.csv'
✓ Metadata saved as 'model_metadata.json'


In [19]:
print("\n[STEP 8] Model Statistics:")
print("="*60)

print(f"\nDataset Statistics:")
print(f"  Total places: {len(places)}")
print(f"  Provinces: {places['province_name'].nunique()}")
print(f"  Categories: {places['category_name'].nunique()}")

print(f"\nCategory Distribution:")
category_dist = places['category_name'].value_counts()
for cat, count in category_dist.items():
    print(f"  {cat}: {count}")

print(f"\nRating Statistics:")
print(f"  Mean: {places['ratings'].mean():.2f}")
print(f"  Median: {places['ratings'].median():.2f}")
print(f"  Min: {places['ratings'].min():.2f}")
print(f"  Max: {places['ratings'].max():.2f}")

print(f"\nSimilarity Matrix Statistics:")
sim_values = cosine_sim_matrix[np.triu_indices_from(cosine_sim_matrix, k=1)]
print(f"  Mean similarity: {sim_values.mean():.4f}")
print(f"  Median similarity: {np.median(sim_values):.4f}")
print(f"  Min similarity: {sim_values.min():.4f}")
print(f"  Max similarity: {sim_values.max():.4f}")


[STEP 8] Model Statistics:

Dataset Statistics:
  Total places: 3987
  Provinces: 26
  Categories: 4

Category Distribution:
  Restaurant: 1499
  Transportation: 1004
  Hotel: 998
  Tourist Attraction: 486

Rating Statistics:
  Mean: 3.90
  Median: 4.30
  Min: 0.00
  Max: 5.00

Similarity Matrix Statistics:
  Mean similarity: 0.8135
  Median similarity: 0.9402
  Min similarity: 0.0000
  Max similarity: 1.0000


In [20]:
print("\n[STEP 9] Example Recommendation Queries:")
print("="*60)

test_places = ['Royal Palace of Cambodia', 'Tuol Sleng Genocide Museum']
available_test_places = [p for p in test_places if p in places['name'].values]

for place in available_test_places:
    print(f"\n{'='*60}")
    print(f"Recommendations for: {place}")
    print('='*60)
    recs = get_recommendations(place, top_n=5)
    if isinstance(recs, str):
        print(recs)
    else:
        print(recs.to_string(index=False))


[STEP 9] Example Recommendation Queries:

Recommendations for: Royal Palace of Cambodia
                   name province_name      category_name  ratings  reviews_count  similarity_score
           Bayon Temple     Siem Reap Tourist Attraction      4.8        13512.0          0.999951
           Bayon Temple         Takeo Tourist Attraction      4.8        13514.0          0.999951
Phnom Penh Night Market    Phnom Penh Tourist Attraction      4.0        11008.0          0.999860
Phnom Penh Night Market        Kandal Tourist Attraction      4.0        11008.0          0.999860
        Ta Prohm Temple     Siem Reap Tourist Attraction      4.8        13018.0          0.999779
        Ta Prohm Temple  Preah Vihear Tourist Attraction      4.8        13018.0          0.999779
        Ta Prohm Temple         Takeo Tourist Attraction      4.5          682.0          0.999779
        Ta Prohm Temple         Takeo Tourist Attraction      4.8        13018.0          0.999779

Recommendations for

In [21]:
print("\n[STEP 10] Demo: Loading and Using Trained Model:")
print("="*60)

with open('cbf_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

print(f"✓ Model loaded successfully!")
print(f"  Training date: {loaded_model['metadata']['training_date']}")
print(f"  Model type: {loaded_model['metadata']['model_type']}")

def predict_recommendations(place_name, model_package, top_n=10):
    """Make predictions using loaded model"""
    sim_matrix = model_package['similarity_matrix']
    places_data = model_package['places_data']
    
    if place_name not in sim_matrix.index:
        return f"Place '{place_name}' not found"
    
    # Get similarity scores as a Series using .loc to access row (avoids duplicate column issues)
    # Handle duplicate place names by taking first match if index has duplicates
    matching_indices = sim_matrix.index[sim_matrix.index == place_name]
    if len(matching_indices) > 0:
        sim_scores = sim_matrix.loc[matching_indices[0], :]
    else:
        return f"Place '{place_name}' not found"
    
    # Ensure we have a Series (should already be one, but be safe)
    if isinstance(sim_scores, pd.DataFrame):
        sim_scores = sim_scores.iloc[0]  # Get first row if DataFrame
    
    # Sort and get top N (excluding the place itself)
    scores = sim_scores.sort_values(ascending=False)
    scores = scores[scores.index != place_name]
    scores = scores.head(top_n)
    
    # Handle duplicate indices in scores by taking first occurrence
    scores_unique = scores[~scores.index.duplicated(keep='first')]
    
    recs = places_data[places_data['name'].isin(scores_unique.index)].copy()
    recs['similarity_score'] = recs['name'].map(scores_unique)
    recs = recs.sort_values('similarity_score', ascending=False)
    
    return recs[['name', 'province_name', 'category_name', 'ratings', 
                'reviews_count', 'similarity_score']]

if available_test_places:
    test_prediction = predict_recommendations(available_test_places[0], loaded_model, top_n=3)
    print(f"\nTop 3 recommendations for '{available_test_places[0]}':")
    print(test_prediction.to_string(index=False))

print("\n" + "="*60)
print("MODEL TRAINING COMPLETE!")
print("="*60)


[STEP 10] Demo: Loading and Using Trained Model:
✓ Model loaded successfully!
  Training date: 2025-12-21 15:23:59
  Model type: Content-Based Filtering (Cosine Similarity)

Top 3 recommendations for 'Royal Palace of Cambodia':
                   name province_name      category_name  ratings  reviews_count  similarity_score
           Bayon Temple     Siem Reap Tourist Attraction      4.8        13512.0          0.999951
           Bayon Temple         Takeo Tourist Attraction      4.8        13514.0          0.999951
Phnom Penh Night Market    Phnom Penh Tourist Attraction      4.0        11008.0          0.999860
Phnom Penh Night Market        Kandal Tourist Attraction      4.0        11008.0          0.999860

MODEL TRAINING COMPLETE!
