In [2]:
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [4]:
# First Block of Code

# 1) Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import requests
from bs4 import BeautifulSoup
import nlpaug.augmenter.word as naw

# 2) Load the three labelled CSV sources
file1 = 'consolidated_hand_labels_fixed.csv'
file2 = 'tropes_labeled_training.csv'
file3 = 'labeled_data.csv'

df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)
df3 = pd.read_csv(file3)

# Preprocessing constants. VOCAB_SIZE and MAX_LEN must stay in sync with the
# model's Embedding layer and with the padding applied at inference time.
VOCAB_SIZE = 5000
MAX_LEN = 100
TEST_SIZE = 0.2
SEED = 42

# Normalise every source to the same two columns: 'text' and 'label'.
# Each step builds a new frame instead of mutating a slice in place, which
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.

# df1 already has 'text' and 'label'
df1 = df1[['text', 'label']]

# df2: 'post_text_clean' is the text; the label is 1 when the post mentions
# Jewish people at all.
# NOTE(review): a mention is not the same thing as antisemitic content - this
# is a simplified labelling assumption and should be confirmed.
df2 = (
    df2.assign(label=np.where(df2['jewish_mentions_count'] > 0, 1, 0))
    .loc[:, ['post_text_clean', 'label']]
    .rename(columns={'post_text_clean': 'text'})
)

# df3: 'tweet' is the text; class 0 is mapped to the positive label.
# NOTE(review): assumes class 0 represents antisemitic speech - confirm
# against the dataset's documentation.
df3 = (
    df3.assign(label=np.where(df3['class'] == 0, 1, 0))
    .loc[:, ['tweet', 'label']]
    .rename(columns={'tweet': 'text'})
)

# 3) Combine the sources; drop rows without text or label, because neither the
# augmenter nor the tokenizer can handle NaN, and force text to str.
df_combined = (
    pd.concat([df1, df2, df3], ignore_index=True)
    .dropna(subset=['text', 'label'])
    .assign(text=lambda frame: frame['text'].astype(str))
    .reset_index(drop=True)
)

# Split BEFORE augmenting. Augmenting first would place synonym-rewritten
# copies of test rows in the training set and inflate every reported metric.
df_train, df_test = train_test_split(df_combined, test_size=TEST_SIZE, random_state=SEED)

# Data augmentation (training rows only)
aug = naw.SynonymAug(aug_src='wordnet')


def _augment_text(text):
    """Return one synonym-augmented variant of ``text`` as a plain string.

    nlpaug >= 1.1.10 returns a *list* of strings from ``augment`` even for a
    single input. Storing that list as the text value makes the Keras
    Tokenizer treat it as pre-tokenised input, so the whole sentence becomes
    one token. Unwrap the first element, and fall back to the original text
    if the augmenter returns nothing usable.
    """
    result = aug.augment(text)
    if isinstance(result, (list, tuple)):
        result = result[0] if result else text
    return result if isinstance(result, str) else text


augmented_texts = [_augment_text(text) for text in df_train['text']]
augmented_labels = df_train['label'].tolist()

# Create a new DataFrame with the augmented data
df_augmented = pd.DataFrame({'text': augmented_texts, 'label': augmented_labels})

# Training rows plus their augmented copies, shuffled so that originals and
# augmentations are interleaved.
df_combined_augmented = (
    pd.concat([df_train, df_augmented], ignore_index=True)
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# Fit the tokenizer on training text only, so the vocabulary is not informed
# by the held-out rows.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(df_combined_augmented['text'])

sequences_augmented = tokenizer.texts_to_sequences(df_combined_augmented['text'])
padded_sequences_augmented = pad_sequences(sequences_augmented, maxlen=MAX_LEN, padding='post', truncating='post')

# Training set: original + augmented rows. Test set: untouched original rows.
X_train_augmented = padded_sequences_augmented
y_train_augmented = df_combined_augmented['label']
X_test_augmented = pad_sequences(
    tokenizer.texts_to_sequences(df_test['text']),
    maxlen=MAX_LEN, padding='post', truncating='post',
)
y_test_augmented = df_test['label'].reset_index(drop=True)

# Sanity checks on the invariants the training cell relies on.
assert len(X_train_augmented) == len(y_train_augmented) == 2 * len(df_train)
assert len(X_test_augmented) == len(y_test_augmented) == len(df_test)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# 1) Build the model: an embedding layer feeding a two-layer stacked LSTM
#    binary classifier.
# Take the dimensions from the fitted tokenizer and the padded training data
# so the network cannot drift out of sync with the preprocessing.
vocab_size = tokenizer.num_words
sequence_length = X_train_augmented.shape[1]

model = Sequential([
    # Explicit Input replaces Embedding(input_length=...), which is deprecated
    # in Keras 3; it also builds the model up front.
    tf.keras.Input(shape=(sequence_length,)),
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# 2) Compile the model (fixed hyperparameters - nothing is tuned here)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# 3) Train the model on the augmented data.
# NOTE: the hold-out set doubles as validation data, so the validation curves
# are for monitoring only - do not use them to select hyperparameters.
history_augmented = model.fit(
    X_train_augmented,
    y_train_augmented,
    epochs=20,
    validation_data=(X_test_augmented, y_test_augmented),
    batch_size=64,
)

# 4) Evaluate on the hold-out set: threshold the sigmoid output at 0.5
y_pred_augmented = (model.predict(X_test_augmented) > 0.5).astype("int32")

print("Confusion Matrix:\n", confusion_matrix(y_test_augmented, y_pred_augmented))
print("Classification Report:\n", classification_report(y_test_augmented, y_pred_augmented))
print("Accuracy Score:\n", accuracy_score(y_test_augmented, y_pred_augmented))

# 5) Plot the training curves side by side using the explicit axes interface
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

# Training and validation accuracy
ax_acc.plot(history_augmented.history['accuracy'], label='train accuracy')
ax_acc.plot(history_augmented.history['val_accuracy'], label='validation accuracy')
ax_acc.set(title='Accuracy', xlabel='Epochs', ylabel='Accuracy')
ax_acc.legend()

# Training and validation loss
ax_loss.plot(history_augmented.history['loss'], label='train loss')
ax_loss.plot(history_augmented.history['val_loss'], label='validation loss')
ax_loss.set(title='Loss', xlabel='Epochs', ylabel='Loss')
ax_loss.legend()

plt.show()




Epoch 1/20
[1m344/766[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m3:23[0m 482ms/step - accuracy: 0.8611 - loss: 0.4022

In [None]:
# 1) Allow for User Input in Either Free Text or URL of a Social Media Post
def get_text_from_url(url, timeout=10):
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        The page text with ``<script>``/``<style>``/``<noscript>`` content
        removed and whitespace collapsed (possibly an empty string), or
        ``None`` if the page could not be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx as failures so an error page is never classified as
        # if it were the post's content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Drop non-visible content; otherwise JavaScript and CSS source ends
        # up in the text handed to the classifier.
        for tag in soup(['script', 'style', 'noscript']):
            tag.decompose()
        return soup.get_text(separator=' ', strip=True)
    except Exception as e:
        # Deliberate best-effort: callers rely on None (not an exception) to
        # signal that the URL could not be processed.
        print("Error fetching the URL content:", e)
        return None

# Read free text or a URL; strip whitespace so a pasted URL with a leading
# space or trailing newline is still recognised.
user_input = input("Enter the text or URL: ").strip()

# Only treat the input as a URL when it carries an explicit http(s) scheme;
# a bare "http" prefix also matches ordinary sentences such as "http is ...".
if user_input.lower().startswith(("http://", "https://")):
    user_text = get_text_from_url(user_input)
else:
    user_text = user_input

# 2) Return What is the Classification of the Text
seq = tokenizer.texts_to_sequences([user_text]) if user_text else [[]]

if seq[0]:
    # maxlen/padding/truncating must match the settings used for training.
    padded = pad_sequences(seq, maxlen=100, padding='post', truncating='post')
    prediction = model.predict(padded)[0][0]
    print(f"The probability of this text being antisemitic is {prediction:.2f}")
else:
    # Empty input, failed fetch, or text with no tokens at all: scoring an
    # all-padding sequence would produce a meaningless probability.
    print("Could not process the input.")