<a href="https://colab.research.google.com/github/Mukii21/GreenAI/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import re

# Fetch the NLTK stopwords corpus
nltk.download('stopwords')

# Sample data kept as (comment, label) pairs so each text stays next to its
# label; 1 marks an offensive comment, 0 a non-offensive one.
_labelled_samples = [
    ("You are so stupid and annoying.", 1),
    ("Great work! Keep it up.", 0),
    ("I hate you, you idiot!", 1),
    ("This is a beautiful day.", 0),
    ("You are such a loser.", 1),
    ("Thank you for your help.", 0),
    ("You are dumb and pathetic.", 1),
    ("That was a fantastic performance!", 0),
]
comments = [sample_text for sample_text, _ in _labelled_samples]
labels = [sample_label for _, sample_label in _labelled_samples]

# Step 1: Text Preprocessing
_STOP_WORDS = None  # English stop-word set, filled in on first use


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize a comment for tokenization.

    The text is lowercased, every character that is not a lowercase ASCII
    letter or whitespace is removed, and English stop words are dropped.

    Args:
        text: The raw comment string.

    Returns:
        The remaining words joined by single spaces (possibly an empty
        string when every word is a stop word).
    """
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and numbers
    # The stop-word set is cached so it is not re-read for every comment.
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every sample comment before it reaches the tokenizer
processed_comments = list(map(preprocess_text, comments))

# Step 2: Tokenization and Padding
# Fit the vocabulary on the cleaned comments and encode them as integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(processed_comments)
word_index = tokenizer.word_index
vocab_size = 1 + len(word_index)
sequences = tokenizer.texts_to_sequences(processed_comments)

# Pad every sequence at the end up to the length of the longest one
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
)

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.25,
    random_state=42,
)

# Step 4: Build the RNN Model
# Embedding -> SimpleRNN -> single sigmoid unit for binary classification.
# `input_length` is intentionally not passed to Embedding: the argument is
# deprecated in Keras 3 and the sequence length is inferred from the data
# the model is first called on.
# NOTE(review): the sequences are post-padded and Embedding is left at its
# default (no `mask_zero=True`), so the RNN also steps over padding --
# consider masking if prediction quality matters.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),
    SimpleRNN(64, return_sequences=False),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 5: Train the Model
# Only the training split is used here; X_test / y_test are not evaluated.
model.fit(np.array(X_train), np.array(y_train), epochs=5, batch_size=2, verbose=1)

# Step 6: Predict Offensive Comments
def detect_offensive_comments(comments, threshold=0.5):
    """Return the comments the trained model scores as offensive.

    Every comment is cleaned with ``preprocess_text``, encoded with the
    fitted ``tokenizer``, post-padded to ``max_length`` and scored by
    ``model`` in a single batched ``predict`` call instead of one call per
    comment.

    Args:
        comments: Iterable of raw comment strings.
        threshold: A comment is reported when its score is strictly greater
            than this value. Defaults to 0.5.

    Returns:
        A list of the original (unprocessed) comments whose score exceeds
        ``threshold``, in input order. Empty input yields an empty list.
    """
    comments = list(comments)  # allow any iterable and permit two passes
    if not comments:
        # Nothing to score; avoid calling predict on an empty batch.
        return []

    cleaned = [preprocess_text(comment) for comment in comments]
    encoded = tokenizer.texts_to_sequences(cleaned)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')

    # One row of scores per comment; the first column is the sigmoid output.
    scores = model.predict(padded, verbose=0)
    return [
        comment
        for comment, score in zip(comments, scores)
        if score[0] > threshold
    ]

# Step 7: Test the Model
# Run the detector over the full sample list and report what it flags.
offensive_comments = detect_offensive_comments(comments)

print("Offensive Comments Detected:")
for flagged in offensive_comments:
    print("-", flagged)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Epoch 1/5
3/3 ━━━━━━━━━━━━━━━━━━━━ 4s 16ms/step - accuracy: 0.2292 - loss: 0.6954
Epoch 2/5
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - accuracy: 0.9167 - loss: 0.6473
Epoch 3/5
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - accuracy: 0.9167 - loss: 0.5919
Epoch 4/5
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - accuracy: 0.7292 - loss: 0.5696
Epoch 5/5
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step - accuracy: 1.0000 - loss: 0.4987
Offensive Comments Detected:
- You are so stupid and annoying.
- Great work! Keep it up.
- I hate you, you idiot!
- You are such a loser.
- Thank you for your help.
- You are dumb and pathetic.
