In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from catboost import CatBoostClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

ModuleNotFoundError: No module named 'catboost'

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
import pandas as pd
import re


# Stop words used when the caller does not supply any: common English function
# words plus a few terms ("app", "try", "keeps", "still") added on top.
_DEFAULT_STOP_WORDS = [
    "app", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which",
    "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be",
    "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
    "with", "about", "against", "between", "into", "through", "during", "before", "after", "above",
    "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only",
    "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should",
    "now", "try", "keeps", "still"
]

# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# Number of highest-weighted terms reported (and scored) per topic.
_TOP_WORDS_PER_TOPIC = 10


def _preprocess_text(text):
    """Lower-case ``text`` and strip every non-word, non-whitespace character."""
    return _PUNCTUATION_RE.sub('', str(text).lower())


def assign_labels(input_file, output_file, useful_topics, num_clusters=4, custom_stop_words=None):
    """Label reviews as useful / not useful from their dominant LDA topic.

    The CSV at ``input_file`` is read, the ``review`` column is cleaned and
    TF-IDF vectorized, an LDA model with ``num_clusters`` topics is fitted,
    and each review is assigned the topic with the highest probability.
    Reviews whose topic is in ``useful_topics`` get ``is_useful == 1``.

    Side effects: writes the labelled frame to ``output_file`` (CSV, no index)
    and prints the top words of every topic, the model perplexity and the
    gensim ``c_v`` coherence score.

    Parameters
    ----------
    input_file : path of the CSV to read; must contain a ``review`` column.
    output_file : path the labelled CSV is written to.
    useful_topics : iterable of topic indices that count as "useful".
    num_clusters : number of LDA topics to fit.
    custom_stop_words : iterable of stop words for the vectorizer; when
        ``None`` a built-in default list is used.

    Returns
    -------
    pandas.DataFrame
        The input frame with ``cleaned_review``, ``lda_topic`` and
        ``is_useful`` columns added.

    Raises
    ------
    ValueError
        If the input file has no ``review`` column.
    """
    df = pd.read_csv(input_file)

    if 'review' not in df.columns:
        raise ValueError("The input file must contain a 'review' column.")

    # 1. Preprocess the review text. Missing reviews become empty strings;
    # otherwise str(NaN) would inject the literal token "nan" into the corpus.
    df['cleaned_review'] = df['review'].fillna('').apply(_preprocess_text)

    # The vectorizer is always handed a real list, whatever iterable the
    # caller supplied.
    if custom_stop_words is None:
        stop_words = list(_DEFAULT_STOP_WORDS)
    else:
        stop_words = list(custom_stop_words)

    # 2. Vectorize the review text using TF-IDF (with custom stop words).
    vectorizer = TfidfVectorizer(max_features=1000, stop_words=stop_words)
    review_vectors = vectorizer.fit_transform(df['cleaned_review'])

    # --- LDA topic modelling ---
    # NOTE(review): LDA is formulated for term counts; fitting it on TF-IDF
    # weights runs, but the perplexity printed below is then hard to
    # interpret. Consider CountVectorizer - confirm before changing, because
    # the topic indices in `useful_topics` depend on this fit.
    lda = LatentDirichletAllocation(n_components=num_clusters, random_state=42)
    lda.fit(review_vectors)

    # Collect and print the highest-weighted terms of every topic; the same
    # lists are scored by the coherence model further down.
    terms = vectorizer.get_feature_names_out()
    top_words_per_topic = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words_idx = topic.argsort()[::-1][:_TOP_WORDS_PER_TOPIC]
        top_words = [terms[i] for i in top_words_idx]
        top_words_per_topic.append(top_words)
        print(f"Topic {topic_idx}: {', '.join(top_words)}")

    # Each review is assigned its most probable topic.
    topic_probabilities = lda.transform(review_vectors)
    df['lda_topic'] = topic_probabilities.argmax(axis=1)

    # 3. Label the reviews as useful (1) or not useful (0).
    df['is_useful'] = df['lda_topic'].isin(list(useful_topics)).astype(int)

    # Save the labelled dataset before the (slower) evaluation step.
    df.to_csv(output_file, index=False)

    # --- Evaluation metrics ---
    perplexity = lda.perplexity(review_vectors)
    print(f"Model Perplexity: {perplexity}")

    # Coherence score (gensim): needs the tokenized corpus as plain lists.
    texts = df['cleaned_review'].str.split().tolist()
    dictionary = corpora.Dictionary(texts)
    coherence_model = CoherenceModel(
        topics=top_words_per_topic, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print(f"Model Coherence Score: {coherence_score}")

    # Only the labelled dataframe is returned; the metrics are printed above.
    return df

Labelling the training and testing data using LDA

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Fit the LDA labeller on the raw reviews and write the labelled copy to
# 'training_data.csv'; reviews whose dominant topic is 1 or 3 are marked useful.
df = assign_labels(
    input_file='csv_data.csv',
    output_file='training_data.csv',
    useful_topics=[1, 3],
    num_clusters=4,
)

# Labelling of the unseen data is currently disabled. assign_labels returns a
# single DataFrame, so the call would look like:
# testing_df = assign_labels(input_file='unseen_data.csv', output_file='testing_data.csv', useful_topics=[2])

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Prefer the GPU when one is available, but fall back to the CPU so this cell
# also runs on machines without CUDA instead of failing at `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# 1. Load Pre-trained BERT Model and Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# The model is used for inference only, so it is switched to eval mode explicitly.
model = BertModel.from_pretrained('bert-base-uncased').to(device).eval()

# Function to get BERT embeddings for each review


def get_bert_embeddings(reviews, batch_size=32):
    """Return one [CLS] embedding per review as a NumPy array.

    Uses the module-level ``tokenizer`` and ``model``. Reviews are encoded in
    batches of ``batch_size`` so the whole corpus never has to fit through the
    model at once, and the tokenized inputs are moved to the device the model
    lives on before the forward pass.

    Parameters
    ----------
    reviews : a single string or an iterable of strings.
    batch_size : number of reviews encoded per forward pass (must be >= 1).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(reviews), hidden_size)``; ``(0, hidden_size)``
        when ``reviews`` is empty.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    # A bare string is one review, not a sequence of characters to slice.
    if isinstance(reviews, str):
        reviews = [reviews]
    else:
        reviews = list(reviews)

    if not reviews:
        return torch.empty((0, model.config.hidden_size)).numpy()

    # Inputs must sit on the same device as the model weights, otherwise the
    # forward pass raises a device-mismatch error.
    model_device = next(model.parameters()).device

    cls_chunks = []
    with torch.no_grad():
        for start in range(0, len(reviews), batch_size):
            batch = reviews[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt',
                               padding=True, truncation=True, max_length=512)
            inputs = {name: tensor.to(model_device)
                      for name, tensor in inputs.items()}
            outputs = model(**inputs)
            # Using the [CLS] token embedding (first token). Move it to the
            # CPU right away: .numpy() is not available on CUDA tensors and
            # this also frees device memory between batches.
            cls_chunks.append(outputs.last_hidden_state[:, 0, :].cpu())

    return torch.cat(cls_chunks, dim=0).numpy()

# Load the raw reviews to embed.
df = pd.read_csv('csv_data.csv')

# 2. The tokenizer only accepts strings, so missing reviews are replaced by an
# empty string and everything else is coerced to str before embedding.
reviews = df['review'].fillna('').astype(str).tolist()

# 3. Get embeddings for all reviews
embeddings = get_bert_embeddings(reviews)

# 4. Project to 2-D for visualization only; the clustering below still runs on
# the full embeddings. The seed keeps the projection reproducible when a
# randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
reduced_embeddings = pca.fit_transform(embeddings)

# 5. Cluster the reviews using KMeans
num_topics = 4  # Adjust this based on your needs
kmeans = KMeans(n_clusters=num_topics, random_state=42)
df['topic'] = kmeans.fit_predict(embeddings)

# 6. Visualize the clusters
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1],
                     c=df['topic'], cmap='viridis', alpha=0.6)
fig.colorbar(scatter, ax=ax, label='Topic')
ax.set_title('BERT-based Topic Modeling')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()

# 7. Show the first 5 reviews of every cluster
for topic in range(num_topics):
    topic_reviews = df.loc[df['topic'] == topic, 'review'].head(5)
    print(f"Topic {topic} reviews:\n", topic_reviews, "\n")