In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
import gensim.downloader as api


In [3]:
# Step 1: Load the pre-trained Word2Vec vectors (300-dimensional Google News model).
# This must run before the embedding-matrix cell, which looks words up in
# `word2vec_model`; with the load commented out that cell fails with a
# NameError on a fresh kernel. The first call downloads a large archive.
word2vec_model = api.load('word2vec-google-news-300')

In [26]:
# Colab only: attach Google Drive at /content/drive so the dataset files and
# the generated submission stored there are reachable from this runtime.

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Every dataset file lives in the same Google Drive project folder, so the
# individual paths are derived from that single location.
DATA_DIR = '/content/drive/MyDrive/toxic_comment_classify_using_word2vec_lstm'

train_path = f'{DATA_DIR}/train.csv'
test_path = f'{DATA_DIR}/test.csv'
test_label_path = f'{DATA_DIR}/test_labels.csv'

In [6]:
# Read the training set and discard the identifier column, which carries no
# signal for the classifier. errors='ignore' turns the drop into a no-op when
# the file has no 'id' column.
df = pd.read_csv(train_path).drop(columns=['id'], errors='ignore')

# Preview the first ten rows.
df.head(10)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,alignment on this subject and which are contra...,0,0,0,0,0,0


In [7]:
# The six binary target columns; a comment may carry several of them at once.
label_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Target matrix with one column per label, in the order listed above.
y_train = df[label_columns].values

In [8]:
# Raw comment strings are the model input.
X_train = df['comment_text'].values

# Show both shapes, one per line, to confirm inputs and targets line up.
print(X_train.shape, y_train.shape, sep='\n')

(159571,)
(159571, 6)


In [9]:
# Tokenizer settings. Per the Keras docs, num_words=max_words means only the
# (max_words - 1) most frequent words survive when texts become sequences.
max_words = 1000    # vocabulary cap handed to Tokenizer(num_words=...)
max_len = 30       # fixed sequence length each comment is padded/truncated to before the LSTM
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)  # Build the word index from the training comments only

In [10]:
# Encode the training comments as integer sequences and bring them all to
# max_len. With pad_sequences' default settings, short sequences are
# zero-padded at the front and long ones keep their last max_len tokens.
sequences = tok.texts_to_sequences(X_train)  # One list of word indices per comment
sequences_pad = sequence.pad_sequences(sequences, maxlen=max_len)  # 2-D integer array, max_len columns

In [13]:
# Build the matrix of pre-trained vectors that seeds the Embedding layer.
word_index = tok.word_index
embedding_dim = 300  # width of the word2vec-google-news-300 vectors

# The Tokenizer numbers words from 1 (index 0 is the padding value) and, since
# it was created with num_words=max_words, texts_to_sequences never emits an
# index >= max_words. Rows past that point could never be looked up, so the
# table is capped there instead of allocating one 300-float row for every
# distinct word in the corpus.
vocab_size = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    # Row 0 (padding) and words unknown to word2vec keep their all-zero row,
    # so no explicit out-of-vocabulary branch is needed.
    if i < vocab_size and word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

In [None]:
def RNN(vocab_size, embedding_dim, max_len, embedding_matrix, num_class):
    """Build the (uncompiled) Embedding -> LSTM -> Dense multi-label classifier.

    Args:
        vocab_size: Number of rows in the embedding table (Embedding input_dim).
        embedding_dim: Width of each embedding vector.
        max_len: Length of the padded integer sequences fed to the model.
        embedding_matrix: Pre-trained vectors, one row per word index; used as
            the fixed (non-trainable) weights of the Embedding layer.
        num_class: Number of output units. Each unit has its own sigmoid, so
            the outputs are independent per-label probabilities.

    Returns:
        A Keras ``Model`` from the integer sequence input to ``num_class``
        sigmoid activations. The caller is responsible for compiling it.
    """
    # Input layer: one integer sequence of length max_len per sample.
    inputs = Input(name='inputs', shape=[max_len])

    # Frozen embedding lookup. The sequence length already comes from the Input
    # layer, so the deprecated `input_length` argument is not passed.
    layer = Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(inputs)

    # Sequence encoder: only the final LSTM state is kept.
    layer = LSTM(128)(layer)

    # Fully connected head with dropout for regularisation.
    layer = Dense(128, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)

    # Output layer sized from num_class (previously hard-coded to 6, which
    # silently ignored the argument).
    layer = Dense(num_class, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)

    return Model(inputs=inputs, outputs=layer)
# Instantiate the network with one output unit per toxicity label.
model = RNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    max_len=max_len,
    embedding_matrix=embedding_matrix,
    num_class=6,
)

# Each label is an independent yes/no decision, hence binary cross-entropy
# over the sigmoid outputs.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [15]:
# from tensorflow.keras.models import load_model
# model = load_model('/content/drive/MyDrive/toxic_comment_classify_using_word2vec_lstm/word2vec_LSTM.h5')




In [None]:
# Train on the padded sequences; Keras holds out 20% of the samples for
# validation. The returned History object keeps the per-epoch metrics.
history = model.fit(
    x=sequences_pad,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

In [None]:
# model.save('/content/drive/MyDrive/toxic_comment_classify_using_word2vec_lstm/word2vec_LSTM.h5')


In [49]:
# Load the test comments and the accompanying label file. 'id' is read as a
# string at parse time: casting with .astype(str) afterwards cannot restore
# leading zeros (or undo float parsing) once the CSV reader has inferred a
# numeric type for the column.
test_df = pd.read_csv(test_path, dtype={'id': str})
test_label = pd.read_csv(test_label_path, dtype={'id': str})

# Sanity check: show the Python type stored for each id (expected: str).
print(test_df['id'].apply(type))

0         <class 'str'>
1         <class 'str'>
2         <class 'str'>
3         <class 'str'>
4         <class 'str'>
              ...      
153159    <class 'str'>
153160    <class 'str'>
153161    <class 'str'>
153162    <class 'str'>
153163    <class 'str'>
Name: id, Length: 153164, dtype: object


In [18]:
# Step 1: Encode the test comments with the tokenizer fitted on the training text.
sequences_new = tok.texts_to_sequences(test_df['comment_text'])

# Step 2: Pad the sequences to the same length used in training.
padded_sequences_new = sequence.pad_sequences(sequences_new, maxlen=max_len)

# Step 3: Use the trained model to predict one score per label for every test comment.
predictions = model.predict(padded_sequences_new)

[1m4787/4787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 26ms/step


In [19]:
# Inspect the raw sigmoid outputs: one row per test comment, one column per label.
print(predictions)

[[8.9993334e-01 2.0990841e-02 5.9293115e-01 1.4573090e-02 4.7380874e-01
  3.2631818e-02]
 [3.7438499e-03 1.7683934e-09 4.6536926e-04 6.1914252e-09 2.9835696e-04
  1.1742827e-06]
 [2.1702051e-03 2.7484424e-11 2.9624000e-05 5.2696669e-10 2.3654566e-05
  5.1748383e-08]
 ...
 [3.5630490e-04 2.3154601e-13 1.3585882e-05 4.4515693e-13 6.1024853e-06
  3.0069964e-09]
 [6.5004663e-03 2.3691704e-10 4.3089134e-05 4.5875104e-09 7.4958014e-05
  3.3697426e-05]
 [7.6122844e-01 2.6379144e-05 2.2751211e-01 4.5089764e-06 7.0165999e-02
  2.1628343e-04]]


In [50]:
# Despite the name, these are the test rows' `id` values, kept in row order
# so they line up one-to-one with the rows of `predictions`.
userid = test_df['id'].values

In [51]:
# Assemble the submission table: the comment id followed by one score column
# per label. The model's output columns follow `label_columns`, the order in
# which the targets were supplied during training.
assert predictions.shape == (len(userid), len(label_columns)), (
    "predictions must have one row per test id and one column per label"
)

# Built in a single vectorised step (no per-row Python loops) and under its
# own name, so the training DataFrame `df` is not overwritten; re-running
# earlier cells after this one therefore still sees the training data.
submission_df = pd.DataFrame(predictions, columns=label_columns)
submission_df.insert(0, 'id', userid)
submission_df['id'] = submission_df['id'].astype(str)

# Save DataFrame to CSV
output_file = '/content/drive/MyDrive/toxic_comment_classify_using_word2vec_lstm/submission.csv'  # Specify the desired output file name
submission_df.to_csv(output_file, index=False)

print("CSV file has been created successfully!")

CSV file has been created successfully!
