In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding
from ipywidgets import widgets
from IPython.display import display

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset and
# model paths used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the labelled training CSV on the mounted Drive.
path = (
    "/content/drive/MyDrive/Dataset/comment_toxicity_dataset/train.csv"
)

In [None]:
# Load the training CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Raw comment strings: the model's input text.
a = df.loc[:, "comment_text"]
print(a)

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object


In [None]:
# Everything from the third column onward is a label column.
label_columns = df.columns[2:]
b = df[label_columns]
print(b)

        toxic  severe_toxic  obscene  threat  insult  identity_hate
0           0             0        0       0       0              0
1           0             0        0       0       0              0
2           0             0        0       0       0              0
3           0             0        0       0       0              0
4           0             0        0       0       0              0
...       ...           ...      ...     ...     ...            ...
159566      0             0        0       0       0              0
159567      0             0        0       0       0              0
159568      0             0        0       0       0              0
159569      0             0        0       0       0              0
159570      0             0        0       0       0              0

[159571 rows x 6 columns]


In [None]:
# Build the label matrix straight from `df` rather than from `b` itself.
# The previous `b=b.values` only worked on the first run: on a re-run `b` is
# already an ndarray, which has no `.values`, so the cell raised AttributeError.
b=df[df.columns[2:]].values
print(b)

[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ...
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


In [None]:
# Upper bound on the vocabulary size handed to TextVectorization (max_tokens).
Max_input = 700_000

In [None]:
# Text -> integer-id layer: every comment is padded/truncated to 2000 ids and
# the vocabulary is capped at Max_input tokens.
vectorize = TextVectorization(
    output_mode='int',
    output_sequence_length=2000,
    max_tokens=Max_input,
)

In [None]:
# Learn the vocabulary from the training comments.
vectorize.adapt(a.to_numpy())

In [None]:
# Show only the head of the learned vocabulary. The full list is far too long
# to render usefully and bloats the saved notebook.
vectorize.get_vocabulary()[:20]

['',
 '[UNK]',
 'the',
 'to',
 'of',
 'and',
 'a',
 'you',
 'i',
 'is',
 'that',
 'in',
 'it',
 'for',
 'this',
 'not',
 'on',
 'be',
 'as',
 'have',
 'are',
 'your',
 'with',
 'if',
 'article',
 'was',
 'or',
 'but',
 'page',
 'my',
 'an',
 'from',
 'by',
 'do',
 'at',
 'about',
 'me',
 'so',
 'wikipedia',
 'can',
 'what',
 'there',
 'all',
 'has',
 'will',
 'talk',
 'please',
 'would',
 'its',
 'no',
 'one',
 'just',
 'like',
 'they',
 'he',
 'dont',
 'which',
 'any',
 'been',
 'should',
 'more',
 'we',
 'some',
 'other',
 'who',
 'see',
 'here',
 'also',
 'his',
 'think',
 'im',
 'because',
 'know',
 'how',
 'am',
 'people',
 'why',
 'edit',
 'articles',
 'only',
 'out',
 'up',
 'when',
 'were',
 'use',
 'then',
 'may',
 'time',
 'did',
 'them',
 'now',
 'being',
 'their',
 'than',
 'thanks',
 'even',
 'get',
 'make',
 'good',
 'had',
 'very',
 'information',
 'does',
 'could',
 'well',
 'want',
 'such',
 'sources',
 'way',
 'name',
 'these',
 'deletion',
 'pages',
 'first',
 'help'

In [None]:
# Sanity check: vectorize one sentence and show the ids of its first four positions.
sample_ids = vectorize("Life sucks right now")
sample_ids[:4]

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([306, 743, 148,  90])>

In [None]:
# Vectorize every training comment in one call (one row of ids per comment).
vectorized_words = vectorize(a.to_numpy())

In [None]:
# Inspect the vectorized tensor (TensorFlow abbreviates large tensors when printing).
print(vectorized_words)

tf.Tensor(
[[   645     76      2 ...      0      0      0]
 [219427     54   2489 ...      0      0      0]
 [   425    441     70 ...      0      0      0]
 ...
 [ 32445   7392    383 ...      0      0      0]
 [     5     12    534 ...      0      0      0]
 [     5      8    130 ...      0      0      0]], shape=(159571, 2000), dtype=int64)


In [None]:
# Input pipeline: (token ids, labels) pairs -> cache -> shuffle -> batches of 16.
dataset=tf.data.Dataset.from_tensor_slices((vectorized_words,b))
dataset=dataset.cache()
# reshuffle_each_iteration=False pins one shuffle order for every pass over the
# pipeline. With the default (True) the order changes on each iteration, so
# positional take()/skip() splits taken from this dataset would pick different,
# overlapping examples every epoch and leak evaluation data into training.
dataset=dataset.shuffle(10000, reshuffle_each_iteration=False)
dataset=dataset.batch(16)
dataset=dataset.prefetch(8)

In [None]:
# Pull a single batch to check what the pipeline yields.
batch_a, batch_b = next(dataset.as_numpy_iterator())

In [None]:
# Shapes of the inputs and labels in that batch.
(batch_a.shape, batch_b.shape)

((16, 2000), (16, 6))

In [None]:
# 80 / 10 / 10 split, measured in batches.
# The third split was previously assigned to `val_dataset` as well, which
# silently replaced the validation split and left no test split at all.
train_dataset=dataset.take(int(len(dataset)*0.8))
val_dataset=dataset.skip(int(len(dataset)*.8)).take(int(len(dataset)*.1))
test_dataset=dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

Sequential **Model**

In [None]:
# Embedding -> bidirectional LSTM -> stack of dense layers -> 6 sigmoid outputs
# (one per label column).
model = Sequential([
    Embedding(Max_input+1, 32),
    Bidirectional(LSTM(64, activation='tanh')),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(6, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy is applied per output unit (multi-label setup).
# `metrics` is passed as a list, as the compile API documents; a bare string is
# not portable across Keras versions.
model.compile(loss='BinaryCrossentropy', optimizer='Adam', metrics=["accuracy"])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          22400032  
                                                                 
 bidirectional (Bidirection  (None, 128)               49664     
 al)                                                             
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 128)               8320      
                                                                 
 dense_3 (Dense)             (None, 128)               16512     
                                                        

In [None]:
# Train for 5 epochs, validating on val_dataset after each epoch.
# Author's note: neglect the reported accuracy and val_accuracy metrics.
history = model.fit(train_dataset, epochs=5, validation_data=val_dataset,verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Persist the trained model to Drive in HDF5 (.h5) format; it is reloaded from this path later.
model.save("/content/drive/MyDrive/Em assignments/commentpredictor.h5")

  saving_api.save_model(


In [None]:
# Location of the test CSV on the mounted Drive.
path1 = (
    "/content/drive/MyDrive/Dataset/comment_toxicity_dataset/test.csv"
)

In [None]:
# Load the test CSV into its own DataFrame.
df1 = pd.read_csv(filepath_or_buffer=path1)

In [None]:
# Comment strings from the test file.
c = df1.loc[:, "comment_text"]
print(c)

0         Yo bitch Ja Rule is more succesful then you'll...
1         == From RfC == \n\n The title is fine as it is...
2         " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3         :If you have a look back at the source, the in...
4                 I don't anonymously edit articles at all.
                                ...                        
153159    . \n i totally agree, this stuff is nothing bu...
153160    == Throw from out field to home plate. == \n\n...
153161    " \n\n == Okinotorishima categories == \n\n I ...
153162    " \n\n == ""One of the founding nations of the...
153163    " \n :::Stop already. Your bullshit is not wel...
Name: comment_text, Length: 153164, dtype: object


In [None]:
# Vectorize the test comments with the layer adapted on the training text.
# The result holds inputs only; no labels are attached to it here.
test = vectorize(c.to_numpy())

In [None]:
from tensorflow.keras.metrics import Precision,CategoricalAccuracy,Recall

Evaluation

In [None]:
# Streaming metrics accumulated batch by batch during evaluation.
pre = Precision()
# The model emits independent per-label sigmoid scores, so accuracy must be
# computed by thresholding each score. CategoricalAccuracy compares argmax
# positions instead, which is meaningless for flattened multi-label vectors.
acc = tf.keras.metrics.BinaryAccuracy()
rec=Recall()

In [None]:
# `test` is a plain tensor of vectorized comments with no labels, so it has no
# `.as_numpy_iterator()` and cannot yield (inputs, labels) pairs. Evaluate on
# the last 10% of batches of the labelled `dataset` instead.
eval_dataset = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

for batch in eval_dataset.as_numpy_iterator():
    X_val, y_val = batch
    # Make prediction (verbose=0 keeps per-batch progress bars out of the output)
    ynew = model.predict(X_val, verbose=0)

    # Flatten to 1-D so every (example, label) pair counts as one binary decision
    y_val = y_val.flatten()
    ynew = ynew.flatten()

    pre.update_state(y_val, ynew)
    acc.update_state(y_val, ynew)
    rec.update_state(y_val, ynew)


AttributeError: ignored

In [None]:
# Report the accumulated metrics. The Recall metric object is named `rec`;
# the previous `re` was an undefined name.
print(f'Precision: {pre.result().numpy()}, Recall:{rec.result().numpy()}, Accuracy:{acc.result().numpy()}')

In [None]:
# Reload the saved model from Drive; this rebinds `model` to the loaded copy.
model = tf.keras.models.load_model('/content/drive/MyDrive/Em assignments/commentpredictor.h5')

In [None]:
# Despite its name, `input_str` holds the vectorized form of the sentence
# (the output of the `vectorize` layer), not the raw string.
input_str = vectorize('hey i love you')

In [None]:
# Add a leading batch axis so the single example is passed as a batch of one.
res = model.predict(np.expand_dims(input_str, axis=0))



In [None]:
# Display the raw prediction array returned by model.predict for the sample sentence.
res

array([[6.0983904e-02, 9.4522354e-09, 3.4920590e-03, 1.8019957e-04,
        8.9868214e-03, 2.1254356e-04]], dtype=float32)

In [None]:
from ipywidgets import widgets
from IPython.display import display

def score_comment(comment):
    """Score one comment against every label column of the training frame.

    The comment is wrapped in a one-element array, vectorized with the adapted
    `vectorize` layer and passed to `model.predict`.

    Returns a dict mapping each name in `df.columns[2:]` to whether the
    corresponding predicted score is greater than 0.5.
    """
    batch_of_one = vectorize(np.array([comment]))
    scores = model.predict(batch_of_one)[0]

    return {
        label: scores[position] > 0.5
        for position, label in enumerate(df.columns[2:])
    }

# Create widgets
# --- UI elements -----------------------------------------------------------
# Free-text input for the comment to score.
comment_textbox = widgets.Textarea(
    description='Comment:',
    placeholder='Enter your comment here',
    value='',
    disabled=False,
)

# Button that triggers scoring; button_style accepts
# 'success', 'info', 'warning', 'danger' or ''.
score_button = widgets.Button(
    description='Score',
    tooltip='Click to score the comment',
    button_style='success',
    disabled=False,
)

# Output area that the click handler prints into.
result_output = widgets.Output()

# Define the event handler for the button click
def on_button_click(b):
    """Click handler: score the textbox contents and print one line per label.

    `b` is the argument supplied by the button's click callback; it is not used.
    """
    result_output.clear_output()
    scores = score_comment(comment_textbox.value)
    with result_output:
        print("Result:")
        for label in scores:
            print(f'{label}: {scores[label]}')

# Attach the event handler to the button
score_button.on_click(on_button_click)

# Display widgets
display(comment_textbox)
display(score_button)
display(result_output)


Textarea(value='', description='Comment:', placeholder='Enter your comment here')

Button(button_style='success', description='Score', style=ButtonStyle(), tooltip='Click to score the comment')

Output()

