In [138]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras import layers
from keras.utils import to_categorical

In [139]:
# Location of the raw CSV (text + label columns) hosted on GitHub.
dataset_url = "https://github.com/Hemachandra151/Datasets/raw/main/sexual%20harassmnt.csv"

# Read it into a DataFrame and preview the first rows.
data = pd.read_csv(dataset_url)
data.head()

Unnamed: 0,text,label
0,I was walking alone in the park when a group o...,1
1,"While waiting at the bus stop, a man standing ...",1
2,I was walking down a deserted road at night wh...,1
3,"At the market place, a vendor kept staring at ...",1
4,A colleague at work keeps cracking jokes about...,1


In [140]:
# Structural overview of the DataFrame: index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1004 entries, 0 to 1003
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1004 non-null   object
 1   label   1004 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [141]:
# Summary statistics of the numeric columns (count, mean, std, quartiles).
# `label` is the only numeric column, so its mean doubles as the share of class 1.
data.describe()

Unnamed: 0,label
count,1004.0
mean,0.516932
std,0.499962
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [142]:
# Column names available in the DataFrame.
data.columns

Index(['text', 'label'], dtype='object')

#Data Preprocessing

In [143]:
# Vocabulary cap handed to the tokenizer; the same value sizes the Embedding layer later on.
max_words = 1000

# Pull the raw sentences and their integer targets out of the DataFrame as NumPy arrays.
texts = data['text'].to_numpy()
labels = data['label'].to_numpy()

# Build the word index from the corpus, then encode each text as a list of word ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [144]:
#Pad sequences to ensure uniform length
maxlen=70
data = pad_sequences(sequences, maxlen=maxlen)

num_classes = len(set(labels))
labels = to_categorical(labels, num_classes=num_classes)

#Train Test Split

In [145]:
x_train, x_test, y_train, y_test = tts(data, labels, random_state=2529)

#Recurrent Neural Network

In [146]:
# Architecture: learned word embedding -> two stacked SimpleRNN layers -> softmax over the classes.
inputs = layers.Input(shape=(maxlen,))
# The Input layer already fixes the sequence length, so Embedding needs no `input_length`.
x = layers.Embedding(max_words, 15)(inputs)
# return_sequences=True hands the full sequence of hidden states to the second recurrent layer.
x = layers.SimpleRNN(10, return_sequences=True)(x)
x = layers.SimpleRNN(5)(x)
# Output width follows the number of one-hot target columns instead of a hard-coded 2.
outputs = layers.Dense(num_classes, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)
# Targets are one-hot vectors and the head is a softmax, so the matching loss is
# categorical cross-entropy (binary cross-entropy is meant for independent sigmoid outputs).
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Keep the History object so the per-epoch curves can be inspected or plotted afterwards.
history = model.fit(x_train, y_train, epochs=10, batch_size=32, verbose=2, validation_split=0.2)

Epoch 1/10
19/19 - 4s - loss: 0.6940 - accuracy: 0.5664 - val_loss: 0.6788 - val_accuracy: 0.6225 - 4s/epoch - 197ms/step
Epoch 2/10
19/19 - 1s - loss: 0.6421 - accuracy: 0.8239 - val_loss: 0.6537 - val_accuracy: 0.7152 - 621ms/epoch - 33ms/step
Epoch 3/10
19/19 - 1s - loss: 0.5916 - accuracy: 0.9020 - val_loss: 0.6300 - val_accuracy: 0.7285 - 606ms/epoch - 32ms/step
Epoch 4/10
19/19 - 1s - loss: 0.5393 - accuracy: 0.9419 - val_loss: 0.6079 - val_accuracy: 0.7351 - 600ms/epoch - 32ms/step
Epoch 5/10
19/19 - 1s - loss: 0.4859 - accuracy: 0.9585 - val_loss: 0.5819 - val_accuracy: 0.7616 - 880ms/epoch - 46ms/step
Epoch 6/10
19/19 - 1s - loss: 0.4321 - accuracy: 0.9801 - val_loss: 0.5600 - val_accuracy: 0.7815 - 1s/epoch - 56ms/step
Epoch 7/10
19/19 - 1s - loss: 0.3810 - accuracy: 0.9817 - val_loss: 0.5343 - val_accuracy: 0.7815 - 1s/epoch - 56ms/step
Epoch 8/10
19/19 - 1s - loss: 0.3337 - accuracy: 0.9867 - val_loss: 0.5155 - val_accuracy: 0.7815 - 1s/epoch - 58ms/step
Epoch 9/10
19/19 - 

<keras.src.callbacks.History at 0x7deb3b2b0d30>

In [147]:
# Score the trained model on the held-out test split.
# Returns [loss, accuracy], matching the loss and the single metric passed to compile().
model.evaluate(x_test, y_test)



[0.4998716413974762, 0.7808765172958374]

#Testing The Model

In [148]:
# Confusion matrix on the test split.
# Orientation: conf[predicted_class][true_class] -- rows are predictions, columns are the
# ground truth (the transpose of scikit-learn's convention).
y_pred = model.predict(x_test)

# Collapse the per-class scores / one-hot targets to class indices, one per sample.
pred_classes = np.argmax(y_pred, axis=1)
true_classes = np.argmax(y_test, axis=1)

# Size the matrix from the one-hot width rather than hard-coding 2x2.
n_classes = y_test.shape[1]
conf = np.zeros((n_classes, n_classes))
# np.add.at accumulates correctly when the same (row, col) pair occurs many times,
# which plain fancy-index assignment (conf[rows, cols] += 1) would not.
np.add.at(conf, (pred_classes, true_classes), 1)
print(conf)

[[100.  40.]
 [ 15.  96.]]
