# recommend_exhibitors_by_answers

In [5]:
import pandas as pd

In [7]:


visitors_final=pd.read_csv("final_analysis_data_visitors.csv")
exhibitor_final=pd.read_csv("exhibitor_final.csv")

In [9]:
visitors_final.head(2)

Unnamed: 0,visitor_id,email,gender,question,answer,answer.1,answerType
0,67b70a9f2d21f543a1096602,emilija@bss.mk,F,Reason for Attending the Event,To obtain general information,To obtain general information,Answer
1,67b70a9f2d21f543a1096602,emilija@bss.mk,F,Which of the following best describes your job...,Media,Media,Answer


In [13]:
exhibitor_final.head(2)

Unnamed: 0,exhibitorid,Name,categoryId,categoryName
0,90556,Turkey Travels,52276,1.5 Resort hotel
1,90556,Turkey Travels,52280,2.1 Inbound tour operator


In [16]:
# Extract unique category names
category_texts = exhibitor_final['categoryName'].dropna().unique()

# Extract unique visitor answers
answer_texts = visitors_final['answer'].dropna().unique()


In [19]:
# Clean up and deduplicate category names
category_set = set()
for cat_string in category_texts:
    for cat in str(cat_string).split(','):
        category_set.add(cat.strip().lower())

category_list = list(category_set)


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Lowercase and clean answer texts
answer_list = [str(ans).strip().lower() for ans in answer_texts]

# Combine all text for fitting vectorizer
all_text = answer_list + category_list

# TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_text)

# Split TF-IDF into answers and categories
answer_vecs = tfidf_matrix[:len(answer_list)]
category_vecs = tfidf_matrix[len(answer_list):]

# Cosine similarity
similarity_matrix = cosine_similarity(answer_vecs, category_vecs)


In [22]:
# For each answer, get top matching category
top_matches = []
for i, ans in enumerate(answer_list):
    sims = similarity_matrix[i]
    best_idx = sims.argmax()
    top_category = category_list[best_idx]
    top_score = sims[best_idx]
    top_matches.append({'Answer': ans, 'Matched_Category': top_category, 'Score': round(top_score, 4)})

mapping_df = pd.DataFrame(top_matches)


In [24]:
mapping_df

Unnamed: 0,Answer,Matched_Category,Score
0,to obtain general information,12.1 tic: travel information centre,0.2292
1,media,15.2 online media,0.5774
2,travel agent,3. travel agencies,0.3103
3,no influence,5.3 railway company,0.0
4,to source products and services,8.6 medical products and machinery manufacture...,0.301
5,sales,5.3 railway company,0.0
6,event management,11. mice and event management,0.6552
7,sole responsibility,5.3 railway company,0.0
8,5 - 10 million rubles,10.5 zoo,0.286
9,to promote products and services,8.6 medical products and machinery manufacture...,0.301


In [27]:
mapping_df.to_csv('mapping_answers_to_categories.csv', index=False)


# Testing

In [33]:

def recommend_exhibitors_by_answers(visitor_id, top_n=5):
    visitor_answers = visitors_final[visitors_final['visitor_id'] == visitor_id]['answer'].dropna().unique()
    visitor_answers = [str(ans).strip().lower() for ans in visitor_answers]
    
    matched_results = []
    for ans in visitor_answers:
        if ans in answer_list:
            ans_index = answer_list.index(ans)
            sims = similarity_matrix[ans_index]
            top_indices = sims.argsort()[::-1][:top_n]
            
            for idx in top_indices:
                matched_category = category_list[idx]
                exhibitors = exhibitor_final[exhibitor_final['categoryName'].str.lower().str.contains(matched_category, na=False)]
                for _, row in exhibitors.iterrows():
                    matched_results.append({
                        'visitor_id': visitor_id,
                        'visitor_answer': ans,
                        'matched_category': matched_category,
                        'similarity_score': round(sims[idx], 4),
                        'exhibitorName': row['Name'],
                        'categoryName': row['categoryName']
                    })
    
    return pd.DataFrame(matched_results)



In [35]:
#Run Test
print("Running test for: recommend_exhibitors_by_answers")

visitor_id = visitors_final['visitor_id'].dropna().astype(str).iloc[0]
recommendations_answers = recommend_exhibitors_by_answers(visitor_id, top_n=5)

assert not recommendations_answers.empty, "❌ No results for valid visitor ID with answers"
assert 'similarity_score' in recommendations_answers.columns, "❌ Missing similarity_score column"

print("✅ Passed recommend_exhibitors_by_answers")


Running test for: recommend_exhibitors_by_answers
✅ Passed recommend_exhibitors_by_answers
