In [None]:
!pip install textblob transformers
import nltk
nltk.download('punkt')

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import expit
import torch

# Pick CUDA when torch reports it as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Read the input table from the Colab filesystem.
df = pd.read_csv('/content/Merged DB.csv')

# Fetch the toxic-bert tokenizer and classifier, then place the classifier on
# the selected device so inputs moved to `device` can be fed to it directly.
model_name = "unitary/toxic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

# Names paired positionally with the model's output scores.
# NOTE(review): assumed to match the model's output order -- confirm against
# the model config.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

def detect_toxicity_label(text):
    """Score `text` with the toxicity model and return the highest score.

    The input is coerced with ``str()``, tokenized (with truncation enabled),
    run through the module-level ``model`` without gradient tracking, and the
    logits are squashed with ``expit`` into one score per entry of ``labels``.

    Despite the function's name, the winning label itself is not returned:
    the result is the largest per-label score, rounded to 3 decimals and
    formatted as a string.
    """
    encoded = tokenizer(str(text), return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        raw_logits = model(**encoded).logits
    # The tensor has to be on the CPU before it can become a NumPy array;
    # [0] selects the single row produced for this one input text.
    probabilities = expit(raw_logits.cpu().numpy())[0]
    per_label = dict(zip(labels, probabilities))
    best_score = max(per_label.values())
    return f"{round(best_score, 3)}"

# Batching settings
batch_size = 500
# Ceiling division: the number of batches needed to cover every row.
# (`len(df) // batch_size + 1` produced one extra, empty batch -- and a
# header-only CSV file -- whenever len(df) was an exact multiple of batch_size,
# including the empty-DataFrame case.)
num_batches = (len(df) + batch_size - 1) // batch_size

# Score the data in slices and persist each slice immediately, so results of
# finished batches are already on disk if a later batch fails.
for i in range(num_batches):
    start = i * batch_size
    end = min(start + batch_size, len(df))  # `end` is exclusive
    df_batch = df.iloc[start:end].copy()  # copy so the new column is not written into a slice of df
    print(f"Processing batch {i+1}/{num_batches} (rows {start} to {end})")

    # The column is called 'sentiment' but holds the string returned by
    # detect_toxicity_label for each tweet.
    df_batch['sentiment'] = df_batch['tweet'].apply(detect_toxicity_label)

    # Save batch to CSV (file names are 1-based: ..._batch_1.csv, ..._batch_2.csv, ...)
    batch_file = f'/content/toxicity_output_batch_{i+1}.csv'
    df_batch.to_csv(batch_file, index=False)
    print(f"Saved {batch_file}")

print("All batches processed!")


In [None]:
import pandas as pd
import glob

# Find all batch CSV files (adjust the pattern if needed)
batch_files = glob.glob('/content/toxicity_output_batch_*.csv')


def _batch_sort_key(path):
    """Sort key that orders batch files by their numeric batch index.

    A plain string sort is lexicographic, so '..._batch_10.csv' would come
    before '..._batch_2.csv' and the combined rows would be out of order as
    soon as there are 10 or more batches. Files whose name does not end in
    '_<number>.csv' are placed after all numbered files, ordered by path.
    """
    stem = path.rsplit('.', 1)[0]      # drop the extension
    suffix = stem.rsplit('_', 1)[-1]   # text after the final underscore
    if suffix.isdecimal():
        return (0, int(suffix), path)
    return (1, 0, path)


# glob returns matches in arbitrary order; sort numerically so the rows are
# concatenated in batch order.
batch_files.sort(key=_batch_sort_key)

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when nothing matched the pattern.
if not batch_files:
    raise FileNotFoundError(
        "No batch files matching '/content/toxicity_output_batch_*.csv' were found"
    )

# Read and combine all batches
combined_df = pd.concat([pd.read_csv(f) for f in batch_files], ignore_index=True)

# Save the combined file
combined_df.to_csv('/content/toxicity_output_combined.csv', index=False)

print(f"Combined {len(batch_files)} files into 'toxicity_output_combined.csv'")


In [None]:
import pandas as pd
import re

# Read the tweet dataset from the Colab filesystem.
dataset_path = '/content/dataset.csv'
dataset = pd.read_csv(dataset_path)

def count_words_excluding_links(text):
    """Count whitespace-separated words in `text`, ignoring link tokens.

    A token counts as a link when it starts with "http" (case-insensitive).

    Args:
        text: The tweet text. Non-string values are converted with ``str()``.
            Missing values (``None`` or a float NaN, which is what pandas
            uses for empty CSV cells) are treated as empty text.

    Returns:
        int: Number of non-link tokens; 0 for missing or empty text.
    """
    # Without this guard str(NaN) == "nan" and str(None) == "None" would each
    # be counted as one real word. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return 0
    return sum(
        1 for word in str(text).split()
        if not word.lower().startswith("http")
    )

# Word count per tweet, with link tokens left out.
tweet_lengths = dataset['tweet'].apply(count_words_excluding_links)
dataset['tweet_length'] = tweet_lengths

# Hashtag count per tweet: occurrences of '#' followed by word characters.
hashtag_pattern = re.compile(r'#\w+')


def _count_hashtags(tweet_text):
    """Return how many hashtag matches `tweet_text` contains."""
    return len(hashtag_pattern.findall(tweet_text))


dataset['hashtag_count'] = dataset['tweet'].astype(str).apply(_count_hashtags)

# Show the first rows of the source column next to the two derived columns.
preview_columns = ['tweet', 'tweet_length', 'hashtag_count']
print(dataset[preview_columns].head())

# Write the enriched table to disk, then hand it to the Colab download helper.
output_path = '/content/dataset_cleaned.csv'
dataset.to_csv(output_path, index=False)

from google.colab import files
files.download(output_path)
