In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# For RNN
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\agaro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\agaro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\agaro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Data loading
data = r"C:\Users\agaro\Documents\GitHub\Chiron\algorithm\dataset1.csv"
df = pd.read_csv(data)
print(df.head())

                                                link  \
0  https://www.infowars.com/posts/the-wall-is-rea...   
1  https://www.infowars.com/posts/post-vaccinatio...   
2  https://www.infowars.com/posts/half-of-all-wom...   
3  https://www.infowars.com/posts/high-levels-of-...   
4  https://www.infowars.com/posts/epidemiologist-...   

                                               title  annotation  \
0  The Wall Is Real: Half of All Thirty-Something...           0   
1  “Post-Vaccination Syndrome:” New Paper Identif...           0   
2  Half of All Women in US Aged 30-35 Experiencin...           0   
3  High Levels of Microplastics Found in Lungs of...           0   
4  Epidemiologist Reveals New Data Linking Covid ...           0   

                                             content  
0  It’s a staple of the so-called “manosphere”—th...  
1  A new condition called “post-vaccination syndr...  
2  Half of all women in their 30s are now reporti...  
3  The lungs of birds contain sign

In [4]:
# Check for missing or empty values
uncleanContents = df[df['content'].isna() | (df['content'].str.strip() == '')].index
print(f'Indices with missing or empty content: {uncleanContents}')
df['content'].fillna('', inplace=True)
print(f"Total rows: {len(df)}")

Indices with missing or empty content: Index([58, 185, 203, 210, 274, 289, 2018, 2019, 2020, 2021], dtype='int64')
Total rows: 2801


In [6]:
#importing nlp library
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# stop_words = set(stopwords.words('english'))
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\agaro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Text preprocessing functions
def remove_punctuation(text):
    punctuationfree = "".join([i for i in text if i not in string.punctuation and i not in ["'", '"', '—', '“', '”', '’', '––', '–']])
    return punctuationfree

def remove_stopwords(text):
    # Initialize stopwords
    stop_words = set(nltk.corpus.stopwords.words('english'))
    custom_stopwords = ["'", '"', '—', '“', '”', '’', '––', '–', 'said', 'human', 'people', 'health', 'also', 'would', 'could', 'said', '2025', '2024', 'dec', 'feb', 'jan', 'nov', 'oct', 'sept', 'aug', 'july', 'june',
                                                                                                    'may', 'april', 'march']
    stop_words = stop_words | set(custom_stopwords)
        
    # Remove stopwords
    return " ".join([word for word in text.split() if word.lower() not in stop_words])

def lemmatize_text(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    return " ".join([wordnet_lemmatizer.lemmatize(word) for word in words])

# Apply preprocessing steps
df['clean_text'] = df['content'].apply(remove_punctuation)
df['clean_text'] = df['clean_text'].str.lower()
df['clean_text'] = df['clean_text'].apply(remove_stopwords)
df['clean_text'] = df['clean_text'].apply(lemmatize_text)

# Show preprocessing results
print("Original text example:")
print(df['content'].iloc[0][:100], "...\n")
print("Preprocessed text example:")
print(df['clean_text'].iloc[0][:100], "...\n")

Original text example:
It’s a staple of the so-called “manosphere”—that glamorous corner of the internet where sex-traffick ...

Preprocessed text example:
staple socalled manospherethat glamorous corner internet sextrafficker guru rub shoulder pickup arti ...



In [9]:
# Prepare data for RNN
def prepare_data_for_rnn(df, max_words=10000, max_sequence_length=100):
    # Texts and labels
    texts = df['clean_text'].values
    labels = df['annotation'].values
    
    # Convert labels to integers if they're not already
    if not isinstance(labels[0], (int, np.integer)):
        label_mapping = {label: i for i, label in enumerate(set(labels))}
        labels = np.array([label_mapping[label] for label in labels])
        print(f"Label mapping: {label_mapping}")
    
    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )
    
    # Tokenize the text
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(X_train)
    
    # Convert text to sequences
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)
    
    # Pad sequences to ensure uniform length
    X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length)
    X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length)
    
    # Get vocabulary size (for embedding layer)
    vocab_size = len(tokenizer.word_index) + 1
    print(f"Vocabulary size: {vocab_size}")
    
    # Convert labels to categorical for multi-class classification if needed
    num_classes = len(np.unique(labels))
    if num_classes > 2:
        y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
        y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)
        print(f"Using categorical labels for {num_classes} classes")
    
    return X_train_padded, X_test_padded, y_train, y_test, vocab_size, num_classes, tokenizer

In [10]:
# Build RNN model
def build_rnn_model(vocab_size, embedding_dim=100, max_sequence_length=100, num_classes=2):
    model = Sequential()
    
    # Embedding layer
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=embedding_dim, 
                        input_length=max_sequence_length))
    
    # Bidirectional LSTM layers
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(32)))
    model.add(Dropout(0.2))
    
    # Dense layers
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    
    # Output layer
    if num_classes == 2:
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    else:
        model.add(Dense(num_classes, activation='softmax'))
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    # Summary
    model.summary()
    return model

In [11]:
# Train and evaluate RNN model
def train_evaluate_rnn(X_train, X_test, y_train, y_test, vocab_size, num_classes):
    # Model parameters
    embedding_dim = 100
    max_sequence_length = X_train.shape[1]
    
    # Build model
    model = build_rnn_model(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        max_sequence_length=max_sequence_length,
        num_classes=num_classes
    )
    
    # Callbacks for better training
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    )
    
    model_checkpoint = ModelCheckpoint(
        filepath='best_rnn_model.h5',
        monitor='val_loss',
        save_best_only=True
    )
    
    # Train the model
    print("Training RNN model...")
    history = model.fit(
        X_train, y_train,
        epochs=10,
        batch_size=64,
        validation_split=0.1,
        callbacks=[early_stopping, model_checkpoint],
        verbose=1
    )
    
    # Evaluate the model
    print("\nEvaluating RNN model on test data...")
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    # Generate predictions
    if num_classes == 2:
        y_pred_prob = model.predict(X_test)
        y_pred = (y_pred_prob > 0.5).astype(int).flatten()
        y_test_flat = y_test  # Already flat for binary
    else:
        y_pred_prob = model.predict(X_test)
        y_pred = np.argmax(y_pred_prob, axis=1)
        y_test_flat = np.argmax(y_test, axis=1)
    
    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test_flat, y_pred))
    
    return model, history

In [12]:
# Compare with simpler model (for benchmarking)
def train_simple_model(X_train, X_test, y_train, y_test, vocab_size, num_classes):
    # Simpler model with single LSTM layer
    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length=X_train.shape[1]))
    model.add(LSTM(32))
    model.add(Dense(16, activation='relu'))
    
    if num_classes == 2:
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    else:
        model.add(Dense(num_classes, activation='softmax'))
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    # Train
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1, verbose=1)
    
    # Evaluate
    _, accuracy = model.evaluate(X_test, y_test)
    print(f"\nSimple LSTM Model Accuracy: {accuracy:.4f}")
    return model

# Main execution function
def main():
    print("Starting RNN classification pipeline...")
    
    # Prepare data
    X_train, X_test, y_train, y_test, vocab_size, num_classes, tokenizer = prepare_data_for_rnn(df)
    
    # Train and evaluate RNN model
    rnn_model, history = train_evaluate_rnn(X_train, X_test, y_train, y_test, vocab_size, num_classes)
    
    # Optional: Train simpler model for comparison
    simple_model = train_simple_model(X_train, X_test, y_train, y_test, vocab_size, num_classes)
    
    # Return results
    return {
        'rnn_model': rnn_model,
        'tokenizer': tokenizer,
        'history': history,
        'simple_model': simple_model
    }

# Entry point
if __name__ == "__main__":
    results = main()

Starting RNN classification pipeline...
Vocabulary size: 44312




Training RNN model...
Epoch 1/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step - accuracy: 0.5778 - loss: 0.6611



[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 190ms/step - accuracy: 0.5813 - loss: 0.6584 - val_accuracy: 0.8973 - val_loss: 0.3300
Epoch 2/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step - accuracy: 0.9604 - loss: 0.1522



[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 164ms/step - accuracy: 0.9606 - loss: 0.1514 - val_accuracy: 0.9286 - val_loss: 0.2138
Epoch 3/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 155ms/step - accuracy: 0.9912 - loss: 0.0437 - val_accuracy: 0.9286 - val_loss: 0.2859
Epoch 4/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 152ms/step - accuracy: 0.9968 - loss: 0.0152 - val_accuracy: 0.9286 - val_loss: 0.3288
Epoch 5/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 156ms/step - accuracy: 0.9996 - loss: 0.0041 - val_accuracy: 0.9286 - val_loss: 0.3610

Evaluating RNN model on test data...
Test Loss: 0.1900
Test Accuracy: 0.9376
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.89      0.93       276
           1       0.91      0.98      0.94       285

    accuracy           



[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 60ms/step - accuracy: 0.7332 - loss: 0.6760 - val_accuracy: 0.8884 - val_loss: 0.4467
Epoch 2/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.9260 - loss: 0.3220 - val_accuracy: 0.9062 - val_loss: 0.2385
Epoch 3/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.9719 - loss: 0.1032 - val_accuracy: 0.9196 - val_loss: 0.2191
Epoch 4/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.9958 - loss: 0.0316 - val_accuracy: 0.9018 - val_loss: 0.2372
Epoch 5/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 48ms/step - accuracy: 0.9964 - loss: 0.0158 - val_accuracy: 0.9241 - val_loss: 0.2903
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9534 - loss: 0.1694

Simple LSTM Model Accuracy: 0.9412
