In [2]:
import warnings
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Bidirectional, LSTM, GRU
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv('labeled_comments.csv')

# Blank out missing comments BEFORE casting to str. Casting first turns NaN
# into the literal text 'nan', which fillna can no longer detect and which
# would then be tokenized as if it were a real word.
df['Comment'] = df['Comment'].fillna('').astype(str)

# Tokenization and preprocessing settings
max_words = 10000  # vocabulary cap passed to the tokenizer and the embedding layers
max_len = 100      # every sequence is padded/truncated to this many tokens

# Fit the Keras tokenizer on the comment text
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['Comment'])

# Convert each comment to a list of integer token ids
sequences = tokenizer.texts_to_sequences(df['Comment'])

# Pad the sequences to ensure uniform input length
X = pad_sequences(sequences, maxlen=max_len)

# Encode the labels as integers.
# NOTE(review): the models below end in a single sigmoid unit, so they assume
# exactly two distinct label values - confirm against the CSV.
df['Label'] = df['Label'].astype(str)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Label'])

# Split the dataset into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# CNN model: embedding -> 1D convolution -> global max pooling -> dense head
model = Sequential([
    # Map each token id to a trainable 128-dimensional vector
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    # 128 convolution filters with a kernel size of 5
    Conv1D(128, 5, activation='relu'),
    # Collapse the sequence axis by keeping each filter's maximum
    GlobalMaxPooling1D(),
    # Fully connected layer on top of the pooled features
    Dense(128, activation='relu'),
    # Dropout to reduce overfitting
    Dropout(0.5),
    # Single sigmoid unit for binary classification
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy pairs with the sigmoid output above
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit for 5 epochs, reporting validation metrics after each one
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=5,
)

# Final loss/accuracy on the validation split
val_loss, val_acc = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {val_acc}')

# Threshold the predicted probabilities at 0.5 to get hard class labels
y_pred = (model.predict(X_val) > 0.5).astype("int32")
print(f'Classification Report:\n{classification_report(y_val, y_pred)}')


In [None]:
# BiLSTM model: embedding -> two stacked bidirectional LSTM layers -> dense head
model = Sequential([
    # Map each token id to a trainable 128-dimensional vector
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    # First recurrent layer returns the whole sequence so that a second
    # recurrent layer can be stacked on top of it
    Bidirectional(LSTM(128, return_sequences=True)),
    # Second recurrent layer returns only its final output
    Bidirectional(LSTM(32)),
    # Fully connected layer on top of the recurrent features
    Dense(128, activation='relu'),
    # Dropout to reduce overfitting
    Dropout(0.5),
    # Single sigmoid unit for binary classification
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy pairs with the sigmoid output above
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit for 5 epochs, reporting validation metrics after each one
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=5,
)

# Final loss/accuracy on the validation split
val_loss, val_acc = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {val_acc}')

# Threshold the predicted probabilities at 0.5 to get hard class labels
y_pred = (model.predict(X_val) > 0.5).astype("int32")
print(f'Classification Report:\n{classification_report(y_val, y_pred)}')


In [None]:
# BiGRU model: embedding -> two stacked bidirectional GRU layers -> dense head
model = Sequential([
    # Map each token id to a trainable 128-dimensional vector
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    # First recurrent layer returns the whole sequence so that a second
    # recurrent layer can be stacked on top of it
    Bidirectional(GRU(128, return_sequences=True)),
    # Second recurrent layer returns only its final output
    Bidirectional(GRU(32)),
    # Fully connected layer on top of the recurrent features
    Dense(128, activation='relu'),
    # Dropout to reduce overfitting
    Dropout(0.5),
    # Single sigmoid unit for binary classification
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy pairs with the sigmoid output above
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit for 5 epochs, reporting validation metrics after each one
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=5,
)

# Final loss/accuracy on the validation split
val_loss, val_acc = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {val_acc}')

# Threshold the predicted probabilities at 0.5 to get hard class labels
y_pred = (model.predict(X_val) > 0.5).astype("int32")
print(f'Classification Report:\n{classification_report(y_val, y_pred)}')

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Read the dataset
# NOTE(review): the earlier cell reads 'labeled_comments.csv' without the
# 'data/' prefix - confirm which location is the intended one.
df = pd.read_csv('data/labeled_comments.csv')

# Blank out missing comments BEFORE casting to str. Casting first turns NaN
# into the literal text 'nan', which fillna can no longer detect.
df['Comment'] = df['Comment'].fillna('').astype(str)

# Add a dummy label column to start with (you'll replace this later after labeling)
# NOTE(review): this assignment replaces any 'Label' column already present in
# the CSV, so every row is class 0 and the metrics below are computed against
# that single class - confirm this is intended before trusting them.
df['Label'] = 0

# 80/20 train/validation split with a fixed seed
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Comment'], df['Label'], test_size=0.2, random_state=42
)

# Load the BERT tokenizer that matches the checkpoint used for the model
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_data(texts):
    """Encode a collection of texts with the module-level BERT ``tokenizer``.

    Args:
        texts: object exposing ``.tolist()`` (it is called below with the
            pandas Series produced by the train/validation split).

    Returns:
        The tokenizer output as PyTorch tensors, padded and truncated to at
        most 512 tokens; downstream code reads its ``'input_ids'`` and
        ``'attention_mask'`` entries.
    """
    raw_texts = texts.tolist()
    return tokenizer(
        raw_texts,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

# Tokenize both splits up front
train_encodings = tokenize_data(train_texts)
val_encodings = tokenize_data(val_texts)

# Convert labels to tensors
train_labels = torch.tensor(train_labels.values)
val_labels = torch.tensor(val_labels.values)

# Bundle (input_ids, attention_mask, label) triples; the train/evaluate
# functions unpack each batch in this same order
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)

# Use DataLoader for batching: shuffled for training, fixed order for validation
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Set up optimizer.
# Uses the PyTorch implementation because the AdamW class shipped with
# transformers is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to mirror the defaults of the deprecated
# class, so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Training function
def train(model, train_loader, optimizer=None):
    """Run one training pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose result exposes a ``.loss`` tensor.
        train_loader: sized iterable yielding
            ``(input_ids, attention_mask, labels)`` batches.
        optimizer: optimizer to step after each batch. When omitted, the
            notebook-level variable named ``optimizer`` is used, which keeps
            existing ``train(model, train_loader)`` calls working.

    Returns:
        The sum of per-batch losses divided by ``len(train_loader)``.

    Raises:
        NameError: if no optimizer is passed and no global ``optimizer`` exists.
        ZeroDivisionError: if ``train_loader`` is empty.
    """
    if optimizer is None:
        # Fall back to the global so the function no longer *requires* hidden
        # notebook state but still honours it when present.
        try:
            optimizer = globals()['optimizer']
        except KeyError:
            raise NameError("name 'optimizer' is not defined") from None

    model.train()
    total_loss = 0
    for batch_input_ids, batch_attention_mask, batch_labels in train_loader:
        # Clear gradients left over from the previous batch
        model.zero_grad()
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / len(train_loader)

# Evaluation function
def evaluate(model, val_loader):
    """Score ``model`` on ``val_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes a ``.logits`` tensor.
        val_loader: iterable yielding ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        A ``(accuracy, report)`` tuple: the value of ``accuracy_score`` and the
        text produced by ``classification_report`` over all batches.
    """
    model.eval()
    predicted, expected = [], []
    # No gradients are needed while scoring
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_loader:
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            # The predicted class is the index of the largest logit in each row
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

# Training loop: alternate one training pass with one validation pass
epochs = 3
for epoch in range(epochs):
    # Update the weights on the training split and report the mean loss
    train_loss = train(model, train_loader)
    print(f'Epoch {epoch + 1}/{epochs} | Training Loss: {train_loss}')

    # Score the updated model on the validation split
    val_accuracy, val_report = evaluate(model, val_loader)
    print(f'Validation Accuracy: {val_accuracy}', val_report, sep='\n')
# # Predict on all comments (label the dataset)
# def predict_comments(model, comments):
#     encodings = tokenize_data(comments)
#     input_ids = encodings['input_ids']
#     attention_mask = encodings['attention_mask']
#     with torch.no_grad():
#         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=1).cpu().numpy()
#     return predictions

# # Apply the model to the entire dataset
# all_comments = df['Comment']
# df['Label'] = predict_comments(model, all_comments)
# df.head()

# Save the labeled dataset to a new CSV
#df.to_csv('labeled_comments.csv', index=False)