## Semantic Links Between Incentive and NoIncentive Reviews

## **Feature Extraction**

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import random

# Fetch the NLTK corpora used by preprocess(): the stop-word list and WordNet.
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
# pd.read_csv() requires a path; this is the same CSV that the recommendation
# cell later in this notebook reads.
df = pd.read_csv('/content/preprocessed_english_reviews.csv', encoding='utf-8')

# Draw a reproducible (random_state=1) sample of 15,000 reviews per class.
# NOTE: .sample() raises ValueError if a class has fewer than 15,000 rows.
incentive_df = df[df['Incentivized'] == 'Incentive'].sample(n=15000, random_state=1)
noincentive_df = df[df['Incentivized'] == 'NoIncentive'].sample(n=15000, random_state=1)

# Preprocessing
# Lazily-filled cache so the stop-word corpus is read and the lemmatizer is
# constructed only once, not once per review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess(text):
    """Normalise one review string for TF-IDF vectorisation.

    The text is lower-cased, every character other than a-z and whitespace
    is removed, English stop words are dropped, and each remaining word is
    lemmatized with WordNet.

    Args:
        text: The raw review text. Any non-string value (e.g. NaN from a
            missing CSV cell) is treated as an empty review.

    Returns:
        The cleaned words joined by single spaces, or '' for non-string input.
    """
    if not isinstance(text, str):
        return ''
    stop_words, lemmatizer = _get_preprocess_resources()
    text = re.sub(r'[^a-z\s]', '', text.lower())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Add a cleaned-text column to each class sample by running preprocess()
# over its 'preprocessed_CombinedString' column.
for review_frame in (incentive_df, noincentive_df):
    review_frame['processed_review'] = review_frame['preprocessed_CombinedString'].apply(preprocess)

# Extract top n trigrams
def get_top_n_trigrams(corpus, n=40):
    """Return the `n` trigrams with the largest summed TF-IDF weight.

    A trigram-only TfidfVectorizer is fitted on `corpus`, each trigram's
    weights are summed over all documents, and the `n` largest totals are
    kept.

    Args:
        corpus: Iterable of preprocessed document strings.
        n: How many trigrams to return.

    Returns:
        A dict mapping trigram -> summed TF-IDF score, inserted in
        descending order of score.
    """
    tfidf = TfidfVectorizer(ngram_range=(3, 3))
    doc_term = tfidf.fit_transform(corpus)
    vocabulary = np.array(tfidf.get_feature_names_out())
    # Column sums come back as a 1 x V matrix; flatten to a plain 1-D array.
    totals = np.array(doc_term.sum(axis=0)).ravel()
    # argsort is ascending, so reverse it before taking the first n columns.
    best_first = np.argsort(totals)[::-1][:n]
    return {vocabulary[col]: totals[col] for col in best_first}

# Top trigrams for each class, computed from the cleaned review text
# (Incentive first, then NoIncentive).
top_incentive_trigrams, top_noincentive_trigrams = (
    get_top_n_trigrams(review_frame['processed_review'])
    for review_frame in (incentive_df, noincentive_df)
)

# Graphing
def plot_top_trigrams(top_trigrams, title):
    """Draw a horizontal bar chart of trigram TF-IDF scores.

    Args:
        top_trigrams: Dict mapping trigram -> score, in the order the bars
            should appear from top to bottom.
        title: Class label inserted into the chart title.

    Returns:
        None. The figure is displayed with plt.show(). If `top_trigrams` is
        empty, a short message is printed and nothing is drawn.
    """
    if not top_trigrams:
        # zip(*{}.items()) yields nothing to unpack, so bail out early
        # instead of raising ValueError.
        print(f'No trigrams to plot for {title}.')
        return

    trigrams, freqs = zip(*top_trigrams.items())
    plt.figure(figsize=(10, 10.5))
    # Draw the bars exactly once; a second barh() call would paint
    # default-coloured bars over the blue ones.
    bars = plt.barh(trigrams, freqs, color='blue')
    plt.xlabel('TF-IDF Score')
    # Report the actual number of bars rather than a hard-coded 40.
    plt.title(f'Top {len(trigrams)} Trigrams in {title} Combined Strings')

    # Write each score just to the right of its bar, vertically centred.
    for bar in bars:
        width = bar.get_width()
        plt.text(width + 0.01, bar.get_y() + bar.get_height() / 2, f'{width:.2f}',
                 va='center')
    # barh places the first entry at the bottom; flip so it is at the top.
    plt.gca().invert_yaxis()
    plt.show()

# One chart per class, Incentive first.
for trigram_scores, class_label in (
    (top_incentive_trigrams, 'Incentive'),
    (top_noincentive_trigrams, 'NoIncentive'),
):
    plot_top_trigrams(trigram_scores, class_label)

## **Evaluate Relationships**

In [None]:
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity

# Shared vocabulary: every trigram that appears in either class's top list.
all_trigrams = set(top_incentive_trigrams.keys()) | set(top_noincentive_trigrams.keys())

# One score vector per class over that vocabulary; a trigram missing from a
# class's top list contributes 0.
incentive_vector = [top_incentive_trigrams.get(term, 0) for term in all_trigrams]
noincentive_vector = [top_noincentive_trigrams.get(term, 0) for term in all_trigrams]

# cosine_similarity expects 2-D input, so wrap each vector as a single row.
incentive_vector_2d = np.array([incentive_vector])
noincentive_vector_2d = np.array([noincentive_vector])

# Compute cosine similarity
similarity = cosine_similarity(incentive_vector_2d, noincentive_vector_2d)[0][0]

print(f"Cosine Similarity between Incentive and NoIncentive reviews: {similarity:.3f}")

# Raw top-list scores of each class, compared with Welch's t-test
# (equal_var=False, i.e. no equal-variance assumption).
incentive_scores = list(top_incentive_trigrams.values())
noincentive_scores = list(top_noincentive_trigrams.values())

# Perform T-test
t_stat, p_value = stats.ttest_ind(incentive_scores, noincentive_scores, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Combined score of a trigram = its Incentive score + its NoIncentive score.
combined_trigrams = {
    term: top_incentive_trigrams.get(term, 0) + top_noincentive_trigrams.get(term, 0)
    for term in set(top_incentive_trigrams.keys()).union(set(top_noincentive_trigrams.keys()))
}

#Top 60 trigrams
ranked_trigrams = list(combined_trigrams.items())
ranked_trigrams.sort(key=lambda pair: pair[1], reverse=True)
top_60_trigrams = ranked_trigrams[:60]

for trigram, score in top_60_trigrams:
    print(f"Trigram: {trigram}, Score: {score}")


## **Recommendation and Evaluation**

In [None]:
!pip install -U sentence-transformers
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt

# Download necessary NLTK resources
# word_tokenize loads the 'punkt_tab' tokenizer tables on recent NLTK
# releases and the pickled 'punkt' model on older ones, so request both.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Lazily-filled cache so the stop-word corpus is read and the lemmatizer is
# constructed only once, not once per review or query.
_PREPROCESS_TEXT_RESOURCES = {}


def _get_preprocess_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_TEXT_RESOURCES:
        _PREPROCESS_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return (_PREPROCESS_TEXT_RESOURCES['stop_words'],
            _PREPROCESS_TEXT_RESOURCES['lemmatizer'])


def preprocess_text(text):
    """Normalise one review or query string for TF-IDF vectorisation.

    The text is lower-cased, every character other than a-z and whitespace
    is removed, the result is tokenized with NLTK's word_tokenize, English
    stop words are dropped, and each remaining token is lemmatized.

    Args:
        text: The raw text. Any non-string value (e.g. NaN from a missing
            CSV cell) is treated as empty text instead of raising
            AttributeError on .lower().

    Returns:
        The cleaned tokens joined by single spaces, or '' for non-string input.
    """
    if not isinstance(text, str):
        return ''
    stop_words, lemmatizer = _get_preprocess_text_resources()
    text = re.sub(r'[^a-z\s]', '', text.lower())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    )

# Load data
df = pd.read_csv('/content/preprocessed_english_reviews.csv', encoding='utf-8')

# Split the data: 60% is used to build listing vectors, 40% is held out.
main_data, ground_truth_data = train_test_split(df, test_size=0.4, random_state=42)

# Preprocessing
# .copy() makes the filtered frame independent of main_data, so adding a
# column below does not trigger SettingWithCopyWarning.
main_data_no_incentive = main_data[main_data['Incentivized'] == 'NoIncentive'].copy()
# fillna('') keeps missing review text from reaching the string-only cleaner.
main_data_no_incentive['preprocessed_reviews'] = (
    main_data_no_incentive['preprocessed_CombinedString'].fillna('').apply(preprocess_text)
)

# Trigram-only TF-IDF model fitted on the NoIncentive training reviews.
vectorizer = TfidfVectorizer(ngram_range=(3, 3))
tfidf_matrix = vectorizer.fit_transform(main_data_no_incentive['preprocessed_reviews'])

# Sum the dense TF-IDF rows of all reviews that belong to the same listing.
tfidf_scores_per_listing = {}
for idx, listing_id in enumerate(main_data_no_incentive['listing_id']):
    review_row = tfidf_matrix[idx].toarray()[0]
    if listing_id in tfidf_scores_per_listing:
        tfidf_scores_per_listing[listing_id] += review_row
    else:
        tfidf_scores_per_listing[listing_id] = review_row

# Normalize the TF-IDF scores
for listing_id, listing_scores in tfidf_scores_per_listing.items():
    norm = np.linalg.norm(listing_scores)
    # A listing whose reviews produced no known trigram has an all-zero
    # vector; dividing by its zero norm would fill it with NaN and break
    # cosine_similarity later, so leave it as zeros.
    if norm > 0:
        listing_scores /= norm  # in place: the dict keeps the same array

# Preprocess ground_truth_data
ground_truth_no_incentive = ground_truth_data[ground_truth_data['Incentivized'] == 'NoIncentive'].copy()
ground_truth_no_incentive['preprocessed_reviews'] = (
    ground_truth_no_incentive['preprocessed_CombinedString'].fillna('').apply(preprocess_text)
)

# Project the held-out reviews into the vocabulary learned above.
ground_truth_tfidf_matrix = vectorizer.transform(ground_truth_no_incentive['preprocessed_reviews'])

#Process query
def process_query_and_find_matches(query, vectorizer, tfidf_scores_per_listing):
    """Rank listings by cosine similarity between a query and each listing vector.

    Args:
        query: Free-text query; it is cleaned with preprocess_text() first.
        vectorizer: Fitted vectorizer used to transform the cleaned query.
        tfidf_scores_per_listing: Dict mapping listing_id -> score vector.

    Returns:
        A list of (listing_id, similarity) tuples sorted by similarity,
        highest first.
    """
    query_vector = vectorizer.transform([preprocess_text(query)])

    ranked = []
    for listing_id, scores in tfidf_scores_per_listing.items():
        # cosine_similarity needs 2-D input: wrap the listing vector as one row.
        listing_row = np.array([scores])
        ranked.append((listing_id, cosine_similarity(query_vector, listing_row).flatten()[0]))

    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

def calculate_precision_recall_f1_accuracy(top_reviews, ground_truth_df, total_items):
    """Compute precision, recall, F1 and accuracy of the predicted listing IDs.

    A predicted listing counts as correct when its ID occurs in
    ``ground_truth_df['listing_id']``.

    Args:
        top_reviews: Iterable of (listing_id, score) pairs; only the IDs are
            used, and repeated IDs are counted once.
        ground_truth_df: Mapping or DataFrame whose 'listing_id' entry holds
            the relevant listing IDs.
        total_items: Size of the item universe, used to derive true negatives.

    Returns:
        A (precision, recall, f1, accuracy) tuple. Any metric whose
        denominator is zero is reported as 0.
    """
    ground_truth_ids = set(ground_truth_df['listing_id'])
    predicted_ids = {listing_id for listing_id, _ in top_reviews}

    tp = len(predicted_ids & ground_truth_ids)
    fp = len(predicted_ids - ground_truth_ids)
    fn = len(ground_truth_ids - predicted_ids)
    # True negatives are whatever remains of the universe after TP, FP and
    # FN are removed; leaving FN out would make the four cells sum to more
    # than total_items. Clamp at 0 in case total_items is an underestimate.
    tn = max(total_items - (tp + fp + fn), 0)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    evaluated = tp + fp + fn + tn
    accuracy = (tp + tn) / evaluated if evaluated > 0 else 0

    return precision, recall, f1, accuracy

def calculate_match_ratio(top_reviews, ground_truth_df):
    """Return the fraction of predicted listing IDs found in the ground truth.

    Prints the predicted IDs, the number of matches and the number of
    predictions as a side effect.

    Args:
        top_reviews: List of (listing_id, score) pairs.
        ground_truth_df: DataFrame with a 'listing_id' column.

    Returns:
        matches / len(top_reviews), or 0 when `top_reviews` is empty.
    """
    top_review_ids = [candidate_id for candidate_id, _ in top_reviews]

    print("Top Review IDs:", top_review_ids)

    # Count the predictions whose ID occurs in the ground-truth column.
    matching_count = sum(
        1
        for candidate_id in top_review_ids
        if candidate_id in ground_truth_df['listing_id'].values
    )

    print("Matching Count:", matching_count)
    print("Length of Top Reviews:", len(top_reviews))

    # Calculate match ratio
    if not top_reviews:
        return 0
    return matching_count / len(top_reviews)

def calculate_mrr(top_reviews, ground_truth_df):
    """Return the reciprocal rank of the first relevant listing.

    Args:
        top_reviews: Ranked sequence of (listing_id, score) pairs, best first.
        ground_truth_df: Mapping or DataFrame whose 'listing_id' entry holds
            the relevant listing IDs.

    Returns:
        1 / rank (1-based) of the first listing whose ID is in the ground
        truth, or 0 if none of the ranked listings is relevant.
    """
    relevant_ids = set(ground_truth_df['listing_id'])
    for position, (candidate_id, _) in enumerate(top_reviews, start=1):
        if candidate_id in relevant_ids:
            # Only the first hit counts, so stop at once.
            return 1 / position
    return 0


if __name__ == "__main__":
    # Interactive driver: rank listings for one user query, score the top
    # five against the held-out split, then print and plot the metrics.
    user_query = input("Enter your query: ")
    top_matches = process_query_and_find_matches(user_query, vectorizer, tfidf_scores_per_listing)[:5]

    print("Top 5 Listing IDs:")
    for listing_id, score in top_matches:
        print(f"Listing ID: {listing_id}, Similarity Score: {score:.3f}")

    # Predictions and ground truth are both compared as listing IDs, so the
    # evaluation universe is the number of distinct listings, not the number
    # of review rows in df.
    total_items = df['listing_id'].nunique()
    precision, recall, f1, accuracy = calculate_precision_recall_f1_accuracy(top_matches, ground_truth_data, total_items)
    match_ratio = calculate_match_ratio(top_matches, ground_truth_data)
    mrr = calculate_mrr(top_matches, ground_truth_data)

    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1-Score: {f1:.3f}")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Match Ratio: {match_ratio:.3f}")
    print(f"Mean Reciprocal Rank (MRR): {mrr:.3f}")

    # Visualization
    metrics = ['Precision', 'Recall', 'F1-Score', 'Accuracy', 'Match Ratio', 'MRR']
    scores = [precision, recall, f1, accuracy, match_ratio, mrr]

    plt.figure(figsize=(10, 6))
    plt.plot(metrics, scores, marker='o', color='skyblue', linestyle='-', linewidth=2)
    plt.xlabel('Metrics')
    plt.ylabel('Scores')
    plt.title('Performance Evaluation')
    # Pad the 0..1 range so markers and value labels at the extremes stay visible.
    plt.ylim(-0.1, 1.1)

    # Annotate every point with its value, slightly above the marker.
    for metric_name, metric_value in zip(metrics, scores):
        plt.text(metric_name, metric_value + 0.05, f'{metric_value:.2f}', ha='center', va='bottom')

    plt.show()


