In [None]:
import pandas as pd
import torch
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Load the LIAR2 training split from CSV.
# Rows are filtered jointly on both columns so that every statement keeps its
# own subject. Dropping NaNs from each column independently would shift the two
# lists out of alignment (or leave them with different lengths).
df = pd.read_csv("LIAR2/train.csv", delimiter=',')
labeled_df = df.dropna(subset=['statement', 'subject'])
statements = labeled_df['statement'].tolist()
subjects = labeled_df['subject'].tolist()

In [None]:


# Hold out 20% of the (statement, subject) pairs for evaluation.
# The fixed random_state keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    statements,
    subjects,
    test_size=0.2,
    random_state=42,
)

# Load the pre-trained BERT tokenizer and encoder.
# The bare `AutoModel` is used on purpose: `AutoModelForSequenceClassification`
# attaches a randomly initialised classification head to "bert-base-uncased",
# and the logits of that head are not sentence embeddings.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")


def encode_text(texts, batch_size=32):
    """Embed texts as mean-pooled BERT token representations.

    Args:
        texts: A string or an iterable of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.
            Encoding in batches bounds peak memory instead of pushing the
            whole corpus through the model at once.

    Returns:
        A numpy array with one row per input text and ``model.config.hidden_size``
        columns. An empty input yields an array with zero rows.
    """
    # A lone string would otherwise be split into characters by list().
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return torch.empty((0, model.config.hidden_size)).numpy()

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512)
        with torch.no_grad():
            token_states = model(**inputs).last_hidden_state
        # Average over real tokens only: padding positions are zeroed out by the
        # attention mask and excluded from the divisor.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
        pooled = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        pooled_batches.append(pooled)
    return torch.cat(pooled_batches, dim=0).numpy()

# Run encode_text over the training statements first, then the test statements.
X_train_embeddings, X_test_embeddings = (
    encode_text(split_texts) for split_texts in (X_train, X_test)
)

# Fit BERTopic with the subjects as supervision.
# BERTopic takes the raw documents as its first argument; precomputed
# embeddings and targets go in through the `embeddings=` and `y=` keywords.
# The targets must be numeric, so the subject strings are integer-encoded.
y_train_codes, _ = pd.factorize(pd.Series(y_train))
topic_model = BERTopic(language="english")
train_topics, _ = topic_model.fit_transform(
    X_train, embeddings=X_train_embeddings, y=y_train_codes
)

# Topic ids are not subject labels. Map each topic to the subject that occurs
# most often among the training documents assigned to it.
topic_to_subject = (
    pd.DataFrame({"topic": train_topics, "subject": y_train})
    .groupby("topic")["subject"]
    .agg(lambda group: group.mode().iloc[0])
    .to_dict()
)
# Fallback for a topic id that never appeared during training.
fallback_subject = pd.Series(y_train).mode().iloc[0]

# Evaluate the model. transform() returns (topics, probabilities); only the
# topic assignments are needed, translated back into subject labels.
test_topics, _ = topic_model.transform(X_test, embeddings=X_test_embeddings)
y_pred = [topic_to_subject.get(topic, fallback_subject) for topic in test_topics]
print(classification_report(y_test, y_pred, zero_division=0))


In [None]:
# This cell only prints the text below; nothing inside the string literal is
# executed. The string holds a parked label-encoding / tensor-conversion snippet.
# NOTE(review): the leading "/n" looks like a mistyped "\n" escape, and the
# snippet refers to LabelEncoder, which is not imported in the visible cells --
# confirm both before reviving this code.
print("""NOTNEEDEdCODE/n df = pd.read_csv("LIAR2/train.csv")

## Fr traingin data
# Convert categorical columns to numeric using LabelEncoder
categorical_cols = ['subject', 'speaker', 'speaker_description', 'state_info', 'context', 'justification']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Store encoder for potential inverse transformation

# Handle missing values (fill with 0 for simplicity)
df = df.fillna(0)

# Separate features and labels
features = df.drop(columns=['label', 'id', 'statement', 'date'])
labels = df['label']

# Step 3: Convert to PyTorch Tensors
features_tensor = torch.tensor(features.values, dtype=torch.float32)
labels_tensor = torch.tensor(labels.values, dtype=torch.long)

# Step 4: Print shapes for verification
print("Features Tensor Shape:", features_tensor.shape)
print("Labels Tensor Shape:", labels_tensor.shape)
""")