# Team Neon Genesis - Final Model - Ensemble Learning with KNNClassifiers

## Initial Model

In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
import joblib


# Two place tables; each one gets its own vectorizer and its own KNN model.
df0 = pd.read_csv('places_v7.csv')
df1 = pd.read_csv('places_v8.csv')

# Stop immediately if either table lacks a column this notebook reads later.
required_columns = ['categories', 'name', 'rating', 'user_ratings_total', 'positive_words', 'negative_words']
for df in (df0, df1):
    for col in required_columns:
        if col in df.columns:
            continue
        raise ValueError(f"Column '{col}' not found in the CSV file.")

# Table 0: the category text is the feature, the place name is the label.
X0, y0 = df0['categories'], df0['name']
tfidf_vectorizer0 = TfidfVectorizer()
X0_tfidf = tfidf_vectorizer0.fit_transform(X0)
knn_classifier0 = KNeighborsClassifier(n_neighbors=10, metric='cosine').fit(X0_tfidf, y0)

# Table 1: identical pipeline, fitted independently of table 0.
X1, y1 = df1['categories'], df1['name']
tfidf_vectorizer1 = TfidfVectorizer()
X1_tfidf = tfidf_vectorizer1.fit_transform(X1)
knn_classifier1 = KNeighborsClassifier(n_neighbors=10, metric='cosine').fit(X1_tfidf, y1)

def calculate_score(row):
    """Blend rating, popularity and review sentiment into one ranking score.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must expose 'rating', 'user_ratings_total', 'positive_words'
        and 'negative_words'.

    Returns
    -------
    The weighted sum of the four terms computed below.
    """
    positives = row['positive_words']
    negatives = row['negative_words']
    word_total = positives + negatives

    rating_term = 0.2 * row['rating']
    # log1p dampens the effect of very high rating counts
    popularity_term = 0.2 * np.log1p(row['user_ratings_total'])
    # share of positive words; the +1 avoids a zero denominator when both counts are 0
    sentiment_term = 0.5 * (positives / (word_total + 1))
    # total amount of review words, again on a log scale
    volume_term = 0.1 * np.log1p(word_total)

    return rating_term + popularity_term + sentiment_term + volume_term

# Table 0: raw composite score, then a min-max rescaled copy of it.
df0['score'] = df0.apply(calculate_score, axis=1)
scaler0 = MinMaxScaler()
df0['normalized_score'] = scaler0.fit_transform(df0[['score']])

# Table 1: same steps with its own scaler, so each normalized score is
# relative to the table it came from.
df1['score'] = df1.apply(calculate_score, axis=1)
scaler1 = MinMaxScaler()
df1['normalized_score'] = scaler1.fit_transform(df1[['score']])

input_categories = "wildlife, theater, safaris" # Input Categories

# Split the comma-separated input and trim the whitespace around each entry.
category_list = list(map(str.strip, input_categories.split(',')))

def get_verified_top_predictions(category, df, classifier, tfidf_vectorizer, n_neighbors=10, top_k=2):
    """Return the best-scoring nearest-neighbour places whose categories contain ``category``.

    Parameters
    ----------
    category : str
        Category text to look up.
    df : pandas.DataFrame
        Table the classifier was fitted on (rows are addressed by position).
        Must contain 'categories', 'name', 'rating', 'user_ratings_total'
        and 'normalized_score'.
    classifier : fitted estimator exposing ``kneighbors``
    tfidf_vectorizer : fitted vectorizer exposing ``transform``
    n_neighbors : int, default 10
        How many neighbours to inspect; capped at the number of fitted rows.
    top_k : int, default 2
        Maximum number of places to return.

    Returns
    -------
    list of dict
        Up to ``top_k`` records with keys 'name', 'rating',
        'user_ratings_total' and 'normalized_score', highest score first.
        Empty when the category is blank, the table is empty, or no
        neighbour's category text contains ``category``.
    """
    # A blank category is a substring of every text, so it would "verify" everything.
    if not category.strip() or len(df) == 0:
        return []

    category_tfidf = tfidf_vectorizer.transform([category])

    # kneighbors raises ValueError when asked for more neighbours than fitted samples.
    fitted_rows = getattr(classifier, 'n_samples_fit_', len(df))
    k = min(n_neighbors, fitted_rows)
    neighbor_positions = classifier.kneighbors(category_tfidf, n_neighbors=k, return_distance=False)[0]

    # Keep only neighbours whose own category text really mentions the query.
    needle = category.lower()
    categories = df['categories']
    verified_positions = [
        position for position in neighbor_positions
        if isinstance(categories.iloc[position], str) and needle in categories.iloc[position].lower()
    ]
    if not verified_positions:
        return []

    best = df.iloc[verified_positions].sort_values('normalized_score', ascending=False).head(top_k)
    return best[['name', 'rating', 'user_ratings_total', 'normalized_score']].to_dict('records')
# Names of the recommended places, in category order, without repeats.
final_places_list = []

for category in category_list:
    verified_places_0 = get_verified_top_predictions(category, df0, knn_classifier0, tfidf_vectorizer0)
    verified_places_1 = get_verified_top_predictions(category, df1, knn_classifier1, tfidf_vectorizer1)

    # Keep at most two places per category, preferring table 0's picks.
    # A place that is already listed (returned by both tables, or matched by
    # an earlier category) is skipped so the next candidate can take its slot.
    final_places = []
    for place in verified_places_0 + verified_places_1:
        if len(final_places) == 2:
            break
        if place['name'] in final_places_list:
            continue
        final_places.append(place)
        final_places_list.append(place['name'])

print("Final List of Top 5 Places:")
print(final_places_list[:5])

Final List of Top 5 Places:
['Dehiwala Zoological Gardens', 'Udawatta Kele Sanctuary', 'Nelum Pokuna Theatre', 'Nelung Arts Centre', 'Ridiyagama Safari Park']


## Save Model

In [None]:
# Persist both fitted KNN models and their TF-IDF vectorizers to the working
# directory. The inference cell below loads these four files by the same names.
joblib.dump(knn_classifier0, 'knn_classifier0.pkl')
joblib.dump(knn_classifier1, 'knn_classifier1.pkl')
# Each vectorizer must be saved with its model: queries have to be transformed
# with the vectorizer that produced that model's training features.
joblib.dump(tfidf_vectorizer0, 'tfidf_vectorizer0.pkl')
joblib.dump(tfidf_vectorizer1, 'tfidf_vectorizer1.pkl')

## Inference

In [21]:
import joblib
import pandas as pd
import numpy as np


# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that this notebook (or another trusted source) wrote.

# Table 0: places data plus the vectorizer/model pair saved for it.
df0 = pd.read_csv('places_v7.csv')
tfidf_vectorizer0 = joblib.load('tfidf_vectorizer0.pkl')
knn_classifier0 = joblib.load('knn_classifier0.pkl')

# Table 1: places data plus the vectorizer/model pair saved for it.
df1 = pd.read_csv('places_v8.csv')
tfidf_vectorizer1 = joblib.load('tfidf_vectorizer1.pkl')
knn_classifier1 = joblib.load('knn_classifier1.pkl')

def calculate_score(row):
    """Compute the composite ranking score for one place.

    ``row`` is indexed by column name and must provide 'rating',
    'user_ratings_total', 'positive_words' and 'negative_words'.
    The result is a weighted sum: 0.2 * rating, 0.2 * log1p(rating count),
    0.5 * positive-word share and 0.1 * log1p(total word count).
    """
    rating, rating_count, positive_count, negative_count = (
        row[field]
        for field in ('rating', 'user_ratings_total', 'positive_words', 'negative_words')
    )

    # The +1 in the denominator keeps the share defined when both counts are 0.
    positive_share = positive_count / (positive_count + negative_count + 1)

    return (
        0.2 * rating
        + 0.2 * np.log1p(rating_count)
        + 0.5 * positive_share
        + 0.1 * np.log1p(positive_count + negative_count)
    )

# Attach the raw composite score to every place in both tables.
for places in (df0, df1):
    places['score'] = places.apply(calculate_score, axis=1)

input_categories = "wildlife, theater, safaris"

# One entry per comma-separated category, with surrounding whitespace removed.
category_list = [part.strip() for part in input_categories.split(',')]

def get_verified_top_predictions(category, df, classifier, tfidf_vectorizer, n_neighbors=10, top_k=2):
    """Return the best-scoring nearest-neighbour places whose categories contain ``category``.

    Parameters
    ----------
    category : str
        Category text to look up.
    df : pandas.DataFrame
        Table the classifier was fitted on (rows are addressed by position).
        Must contain 'categories', 'name', 'rating', 'user_ratings_total'
        and 'score'.
    classifier : fitted estimator exposing ``kneighbors``
    tfidf_vectorizer : fitted vectorizer exposing ``transform``
    n_neighbors : int, default 10
        How many neighbours to inspect; capped at the number of fitted rows.
    top_k : int, default 2
        Maximum number of places to return.

    Returns
    -------
    list of dict
        Up to ``top_k`` records with keys 'name', 'rating',
        'user_ratings_total' and 'score', highest score first. Empty when
        the category is blank, the table is empty, or no neighbour's
        category text contains ``category``.
    """
    # A blank category is a substring of every text, so it would "verify" everything.
    if not category.strip() or len(df) == 0:
        return []

    category_tfidf = tfidf_vectorizer.transform([category])

    # kneighbors raises ValueError when asked for more neighbours than fitted samples.
    fitted_rows = getattr(classifier, 'n_samples_fit_', len(df))
    k = min(n_neighbors, fitted_rows)
    neighbor_positions = classifier.kneighbors(category_tfidf, n_neighbors=k, return_distance=False)[0]

    # Keep only neighbours whose own category text really mentions the query.
    needle = category.lower()
    categories = df['categories']
    verified_positions = [
        position for position in neighbor_positions
        if isinstance(categories.iloc[position], str) and needle in categories.iloc[position].lower()
    ]
    if not verified_positions:
        return []

    best = df.iloc[verified_positions].sort_values('score', ascending=False).head(top_k)
    return best[['name', 'rating', 'user_ratings_total', 'score']].to_dict('records')

# Recommended place names, in category order; each name appears at most once.
final_places_list = []

for category in category_list:
    verified_places_0 = get_verified_top_predictions(category, df0, knn_classifier0, tfidf_vectorizer0)
    verified_places_1 = get_verified_top_predictions(category, df1, knn_classifier1, tfidf_vectorizer1)

    # Up to two places per category: table 0's picks first, table 1's as
    # fallback. Names that are already listed (same place in both tables, or
    # already chosen for an earlier category) are skipped and backfilled.
    final_places = []
    for place in verified_places_0 + verified_places_1:
        if len(final_places) == 2:
            break
        if place['name'] in final_places_list:
            continue
        final_places.append(place)
        final_places_list.append(place['name'])

print("Final List of Top 5 Places:")
print(final_places_list[:5])


Final List of Top 5 Places:
['Dehiwala Zoological Gardens', 'Udawatta Kele Sanctuary', 'Nelum Pokuna Theatre', 'Nelung Arts Centre', 'Ridiyagama Safari Park']


## Evaluation

In [22]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Evaluation works on one table: both place files stacked with a fresh 0..n-1 index.
df0 = pd.read_csv('places_v7.csv')
df1 = pd.read_csv('places_v8.csv')
df_combined = pd.concat((df0, df1), ignore_index=True)

# Fail fast if the combined table lacks a column that scoring or retrieval needs.
required_columns = ['categories', 'name', 'rating', 'user_ratings_total', 'positive_words', 'negative_words']
for col in required_columns:
    if col in df_combined.columns:
        continue
    raise ValueError(f"Column '{col}' not found in the CSV file.")

def calculate_score(row):
    """Return the weighted quality score of a single place.

    The row must provide 'rating', 'user_ratings_total', 'positive_words'
    and 'negative_words'. Count-like inputs go through ``log1p`` so that very
    large values contribute with diminishing weight.
    """
    pos = row['positive_words']
    neg = row['negative_words']

    score = 0.2 * row['rating']
    score = score + 0.2 * np.log1p(row['user_ratings_total'])
    # positive-word share; the extra 1 keeps the denominator non-zero at 0/0
    score = score + 0.5 * (pos / (pos + neg + 1))
    score = score + 0.1 * np.log1p(pos + neg)
    return score

# One TF-IDF vocabulary fitted over the category text of every place.
tfidf_vectorizer = TfidfVectorizer()
categories_tfidf = tfidf_vectorizer.fit_transform(df_combined['categories'])

# Composite score per place; get_recommendations uses it to break similarity ties.
df_combined['score'] = df_combined.apply(calculate_score, axis=1)

def get_recommendations(input_categories, df, categories_tfidf, tfidf_vectorizer, top_n=5):
    """Rank places by cosine similarity between their categories and the query.

    Parameters
    ----------
    input_categories : str
        Free-text description of the wanted categories.
    df : pandas.DataFrame
        Places table, row-aligned with ``categories_tfidf``; needs the
        'name', 'categories', 'rating' and 'score' columns.
    categories_tfidf : matrix
        TF-IDF features of ``df['categories']`` produced by ``tfidf_vectorizer``.
    tfidf_vectorizer : fitted vectorizer exposing ``transform``
    top_n : int, default 5
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` rows ordered by similarity, then score (both
        descending), limited to 'name', 'categories', 'rating', 'score'
        and 'similarity'. ``df`` itself is not modified.
    """
    query_vector = tfidf_vectorizer.transform([input_categories])
    output_columns = ['name', 'categories', 'rating', 'score', 'similarity']

    return (
        df.assign(similarity=cosine_similarity(query_vector, categories_tfidf).flatten())
        .sort_values(['similarity', 'score'], ascending=[False, False])
        .head(top_n)[output_columns]
    )

# Visitor table used for evaluation; evaluate_recommendations reads its
# 'Preferred Activities' column as the query text for each visitor.
df_visitor = pd.read_csv('visitor_v2.csv')

def evaluate_recommendations(df, categories_tfidf, tfidf_vectorizer, places_df=None):
    """Average the best-match similarity over all visitors.

    For every row of ``df`` the 'Preferred Activities' text is sent to
    ``get_recommendations`` with ``top_n=1`` and the similarity of that single
    best recommendation is accumulated.

    Parameters
    ----------
    df : pandas.DataFrame
        Visitor table with a 'Preferred Activities' column.
    categories_tfidf : matrix
        TF-IDF features of the places table, row-aligned with it.
    tfidf_vectorizer : fitted vectorizer exposing ``transform``
    places_df : pandas.DataFrame, optional
        Places table to recommend from. Defaults to the notebook-level
        ``df_combined`` so existing three-argument calls behave as before.

    Returns
    -------
    float
        Mean top-1 similarity, or NaN when ``df`` has no rows (previously
        this case raised ZeroDivisionError).
    """
    num_rows = df.shape[0]
    if num_rows == 0:
        # The mean of nothing is undefined; report NaN instead of dividing by zero.
        return float('nan')

    if places_df is None:
        places_df = df_combined

    total_similarity = 0.0
    for input_categories in df['Preferred Activities']:
        recommendations = get_recommendations(
            input_categories, places_df, categories_tfidf, tfidf_vectorizer, top_n=1
        )
        total_similarity += recommendations['similarity'].values[0]

    return total_similarity / num_rows

# Mean top-1 cosine similarity across every visitor profile.
average_similarity = evaluate_recommendations(
    df=df_visitor,
    categories_tfidf=categories_tfidf,
    tfidf_vectorizer=tfidf_vectorizer,
)
print(f"\nAverage Similarity Score: {average_similarity:.4f}")


Average Similarity Score: 0.6608
