## **Semantic Links**

In [None]:
!pip install -U sentence-transformers

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the preprocessed reviews.
df = pd.read_csv('/content/preprocessed_english_reviews.csv')

# Collect the review texts of each group. Rows with a missing text (read back from
# the CSV as NaN) are dropped and the rest coerced to str, so that only strings are
# handed to the encoder.
review_text = df['preprocessed_CombinedString']
incentive_reviews = review_text[df['Incentivized'] == 'Incentive'].dropna().astype(str).tolist()
noincentive_reviews = review_text[df['Incentivized'] == 'NoIncentive'].dropna().astype(str).tolist()

# Fail with an explicit message instead of a ZeroDivisionError further down.
if not incentive_reviews or not noincentive_reviews:
    raise ValueError("Both 'Incentive' and 'NoIncentive' groups need at least one review.")

model = SentenceTransformer('bert-base-nli-mean-tokens')

# One embedding row per review.
# NOTE(review): relies on encode() returning a 2-D numpy array for list input
# (its default output format) — confirm if the encode arguments are changed.
incentive_embeddings = model.encode(incentive_reviews, show_progress_bar=True)
noincentive_embeddings = model.encode(noincentive_reviews, show_progress_bar=True)

# Centroid of each group: mean over the review axis, computed in one vectorized
# call rather than a Python-level sum over rows.
avg_incentive_embedding = incentive_embeddings.mean(axis=0)
avg_noincentive_embedding = noincentive_embeddings.mean(axis=0)

# cosine_similarity expects 2-D inputs, hence the single-row wrappers.
similarity = cosine_similarity([avg_incentive_embedding], [avg_noincentive_embedding])
print(f"Cosine Similarity between 'Incentive' and 'NoIncentive' reviews for combined strings: {similarity[0][0]}")

## **Recommendation and Evaluation**

In [None]:
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score  # Ensure these are imported

#Load data
# NOTE(review): the path used to be an empty string, which makes read_csv fail.
# It now points at the same preprocessed file the "Semantic Links" cell loads —
# confirm this is the intended input.
df = pd.read_csv('/content/preprocessed_english_reviews.csv', encoding='utf-8')

#Split the data
# 60% of the rows form the searchable "main" set, 40% are held out as ground truth;
# the fixed random_state keeps the split reproducible between runs.
main_data, ground_truth_data = train_test_split(df, test_size=0.4, random_state=42)

#Filter 'NoIncentive' reviews
# For each split: keep only non-incentivized reviews, then collect the review texts
# of every listing into a list (a Series indexed by listing_id).
main_data_no_incentive = main_data[main_data['Incentivized'] == 'NoIncentive']
grouped_main_reviews = main_data_no_incentive.groupby('listing_id')['preprocessed_CombinedString'].apply(list)

ground_truth_no_incentive = ground_truth_data[ground_truth_data['Incentivized'] == 'NoIncentive']
grouped_ground_truth_reviews = ground_truth_no_incentive.groupby('listing_id')['preprocessed_CombinedString'].apply(list)

#Load the sentence encoder used for both reviews and queries
model = SentenceTransformer('bert-base-nli-mean-tokens')

#Main dataset: listing_id -> embeddings of that listing's reviews
grouped_main_embeddings = {
    listing_id: model.encode(reviews, show_progress_bar=True)
    for listing_id, reviews in grouped_main_reviews.items()
}

#Ground truth dataset: listing_id -> embeddings of that listing's reviews
grouped_ground_truth_embeddings = {
    listing_id: model.encode(reviews, show_progress_bar=True)
    for listing_id, reviews in grouped_ground_truth_reviews.items()
}

#Normalize the raw user query
def preprocess_query(query):
    """Return *query* converted to lower case."""
    lowered = query.lower()
    return lowered

#Encode the user query with the sentence model
def get_query_embedding(query, model):
    """Return the embedding of *query* produced by *model*.

    The query is first passed through ``preprocess_query``. It is then encoded as a
    one-element batch, and the single resulting embedding is returned.
    """
    batch = [preprocess_query(query)]
    encoded = model.encode(batch)
    return encoded[0]

def find_similar_reviews(query_embedding, grouped_embeddings, top_k=5):
    """Return the *top_k* reviews most similar to the query across all listings.

    Args:
        query_embedding: Embedding vector of the query.
        grouped_embeddings: Mapping of listing_id to the embeddings of that
            listing's reviews.
        top_k: Number of results to keep, both per listing and overall.

    Returns:
        A list of ``(index, similarity, listing_id)`` tuples sorted by descending
        similarity, where ``index`` is the review's position within its own
        listing's embeddings.
    """
    candidates = []
    for listing_id, embeddings in grouped_embeddings.items():
        scores = cosine_similarity([query_embedding], embeddings)[0]
        # Take the last top_k positions of the argsort order, then reverse them.
        best_first = scores.argsort()[-top_k:][::-1]
        candidates.extend((pos, scores[pos], listing_id) for pos in best_first)
    # Merge the per-listing candidates and keep the overall best top_k.
    candidates.sort(key=lambda item: item[1], reverse=True)
    return candidates[:top_k]

#Evaluation
def evaluate_model_based_on_content(top_reviews, ground_truth_df):
    """Return the fraction of distinct recommended listings found in the ground truth.

    Args:
        top_reviews: Iterable of ``(index, similarity, listing_id)`` tuples.
        ground_truth_df: DataFrame with a ``listing_id`` column.

    Returns:
        A value between 0 and 1: the number of distinct recommended listing ids
        that occur in ``ground_truth_df['listing_id']`` divided by the number of
        distinct recommended listing ids. Returns 0 when *top_reviews* is empty.
    """
    top_review_ids = {listing_id for _, _, listing_id in top_reviews}
    if not top_review_ids:
        # Nothing was recommended; avoid dividing by zero.
        return 0

    listing_ids = ground_truth_df['listing_id']
    # Count each matched listing once. Counting matching rows instead would let a
    # listing with many ground-truth reviews push the ratio above 1.
    matching_count = listing_ids[listing_ids.isin(top_review_ids)].nunique()
    return matching_count / len(top_review_ids)

#Share of recommended reviews whose listing is in the ground truth
def calculate_match_ratio(top_reviews, ground_truth_df):
    """Return the fraction of *top_reviews* whose listing id occurs in the ground truth.

    Args:
        top_reviews: List of ``(index, similarity, listing_id)`` tuples.
        ground_truth_df: DataFrame with a ``listing_id`` column.

    Returns:
        Matches divided by ``len(top_reviews)``, or 0 when *top_reviews* is empty.
    """
    if not top_reviews:
        return 0
    known_ids = ground_truth_df['listing_id'].values
    hits = sum(1 for _, _, listing_id in top_reviews if listing_id in known_ids)
    return hits / len(top_reviews)

#Mean Reciprocal Rank (MRR)
def calculate_mrr(top_reviews, ground_truth_embeddings):
    """Return the reciprocal rank of the first relevant result for one query.

    A result is relevant when its listing id is a key of *ground_truth_embeddings*.
    Ranks are 1-based positions in *top_reviews*, which is expected to be ordered
    best-first. With a single query the mean reciprocal rank equals this value.

    Args:
        top_reviews: Ranked iterable of ``(index, similarity, listing_id)`` tuples.
        ground_truth_embeddings: Container of relevant listing ids (for example a
            dict keyed by listing id).

    Returns:
        ``1 / rank`` of the first relevant result, or 0 if none is relevant.
    """
    # The rank must come from the result's position. A constant rank of 1 would
    # make the metric collapse to either 0 or 1.
    for rank, (_, _, listing_id) in enumerate(top_reviews, start=1):
        if listing_id in ground_truth_embeddings:
            return 1 / rank
    return 0


def calculate_precision_recall_f1_accuracy(top_reviews, ground_truth_df, total_items):
    """Compute precision, recall, F1 and accuracy of the recommended listings.

    Args:
        top_reviews: Iterable of ``(index, similarity, listing_id)`` tuples; the
            listing ids are the predictions.
        ground_truth_df: DataFrame whose ``listing_id`` column holds the relevant ids.
        total_items: Size of the item universe, used to derive the true negatives.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)``. Any metric whose denominator
        is zero is reported as 0.

    Note:
        True and false positives are counted per prediction, so a listing id that
        appears several times in *top_reviews* is counted several times.
    """
    ground_truth_ids = set(ground_truth_df['listing_id'])
    predictions = [listing_id for _, _, listing_id in top_reviews]
    # Set lookup keeps the false-negative scan linear in the ground-truth size.
    predicted_ids = set(predictions)

    # True positives: predicted items that are in the ground truth.
    true_pos = sum(1 for pred in predictions if pred in ground_truth_ids)

    # False positives: predicted items that are not in the ground truth.
    false_pos = len(predictions) - true_pos

    # False negatives: ground-truth items that were never predicted.
    false_neg = sum(1 for true_id in ground_truth_ids if true_id not in predicted_ids)

    # True negatives: everything left once TP, FP and FN are accounted for.
    # FN must be subtracted too, otherwise the four cells add up to more than
    # total_items. Clamp at 0 in case total_items is smaller than the other cells.
    true_neg = max(total_items - (true_pos + false_pos + false_neg), 0)

    predicted_total = true_pos + false_pos
    relevant_total = true_pos + false_neg
    precision = true_pos / predicted_total if predicted_total > 0 else 0
    recall = true_pos / relevant_total if relevant_total > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    grand_total = true_pos + false_pos + false_neg + true_neg
    accuracy = (true_pos + true_neg) / grand_total if grand_total > 0 else 0

    return precision, recall, f1, accuracy


if __name__ == "__main__":
    # Read a free-text query and embed it with the same model as the reviews.
    user_query = input("Enter your query: ")
    query_embedding = get_query_embedding(user_query, model)

    # Retrieve the best matches from the main split and score them against the
    # held-out ground truth split.
    top_reviews = find_similar_reviews(query_embedding, grouped_main_embeddings)
    match_ratio = calculate_match_ratio(top_reviews, ground_truth_no_incentive)
    mrr = calculate_mrr(top_reviews, grouped_ground_truth_embeddings)

    print("Top 5 Similar Reviews:")
    for _, score, listing_id in top_reviews:
        print(f"Listing ID: {listing_id}, Similarity Score: {score:.3f}")

    # The classification metrics compare listing ids, so the item universe is the
    # number of distinct listings rather than the number of review rows.
    total_items = df['listing_id'].nunique()
    # Call the helper that is actually defined in this file and unpack all four
    # metrics, so that `accuracy` exists when it is printed below.
    precision, recall, f1, accuracy = calculate_precision_recall_f1_accuracy(
        top_reviews, ground_truth_no_incentive, total_items
    )

    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1-Score: {f1:.3f}")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Match Ratio: {match_ratio:.3f}")
    print(f"Mean Reciprocal Rank (MRR): {mrr:.3f}")

    # Persist the ranked results for later inspection.
    columns = ['Review Index', 'Similarity Score', 'Listing ID']
    weights_df = pd.DataFrame(top_reviews, columns=columns)
    weights_df.to_csv('/content/listing_id_similarity_scores.csv', index=False)
