In [15]:
import pandas as pd
import spacy
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding,Dense,LSTM
from sklearn.metrics import classification_report,confusion_matrix
import pickle


In [16]:
df = pd.read_csv('labeled_data.csv')

In [17]:
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [18]:
df.shape

(24783, 7)

In [19]:
df.isna().sum()

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

In [20]:
df.columns

Index(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither',
       'class', 'tweet'],
      dtype='object')

In [21]:
df.drop(columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'],inplace=True)

In [22]:
df.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [23]:
df['tweet']=df['tweet'].str.replace(r'[^a-zA-z]',' ',regex=True)

In [24]:
df.head()

Unnamed: 0,class,tweet
0,2,RT mayasolovely As a woman you shouldn t...
1,1,RT mleew boy dats cold tyga dwn ba...
2,1,RT UrKindOfBrand Dawg RT sbaby...
3,1,RT C_G_Anderson viva_based she lo...
4,1,RT ShenikaRoberts The shit you...


In [25]:
df['tweet']=df['tweet'].str.replace(r'[\s]+',' ',regex=True)

In [26]:
df.head()

Unnamed: 0,class,tweet
0,2,RT mayasolovely As a woman you shouldn t comp...
1,1,RT mleew boy dats cold tyga dwn bad for cuffi...
2,1,RT UrKindOfBrand Dawg RT sbaby life You ever ...
3,1,RT C_G_Anderson viva_based she look like a tr...
4,1,RT ShenikaRoberts The shit you hear about me ...


In [27]:
nlp = spacy.load('en_core_web_sm')

In [28]:
def preprocess(text):
  doc = nlp(text)
  lemmalist = [word.lemma_ for word in doc]
  lemma = ' '.join(lemmalist)

  doc = nlp(lemma)
  no_stopwords_list = [word.text for word in doc if not word.is_stop]
  no_stopwords = ' '.join(no_stopwords_list)

  return no_stopwords

In [29]:
df['tweet'] = df['tweet'].apply(preprocess)

In [30]:
vocab_size = 10000
one_hot_representation = [one_hot(words,vocab_size)for words in df['tweet']]

In [31]:
one_hot_representation[0]

[9861, 8731, 7639, 7229, 274, 3025, 2413, 5507, 1961, 4735, 4469]

In [32]:
sentence_length = 20
embedded_tweet = pad_sequences(one_hot_representation,padding='post',maxlen = sentence_length)

In [33]:
df['class'].value_counts()

class
1    19190
2     4163
0     1430
Name: count, dtype: int64

In [34]:
X = np.array(embedded_tweet)
y = np.array(df['class'])

In [35]:
smote = SMOTE(sampling_strategy = 'minority')
X,y = smote.fit_resample(X,y)

In [36]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=42)

In [37]:
model = Sequential()
model.add(Embedding(vocab_size,50,input_length = sentence_length))
model.add(LSTM(100,return_sequences=True))
model.add(LSTM(100,return_sequences=True))
model.add(LSTM(50,return_sequences=False)),
model.add(Dense(3,activation='softmax'))



In [38]:
model.compile(optimizer = 'Adam', loss = 'sparse_categorical_crossentropy',metrics=['accuracy'])

In [39]:
model.summary()

In [40]:
model.fit(X_train,y_train,epochs = 10,batch_size = 32)

Epoch 1/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 28ms/step - accuracy: 0.8063 - loss: 0.4611
Epoch 2/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 28ms/step - accuracy: 0.9459 - loss: 0.1730
Epoch 3/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 25ms/step - accuracy: 0.9591 - loss: 0.1327
Epoch 4/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 24ms/step - accuracy: 0.9701 - loss: 0.1007
Epoch 5/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 24ms/step - accuracy: 0.9766 - loss: 0.0852
Epoch 6/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 24ms/step - accuracy: 0.9818 - loss: 0.0700
Epoch 7/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 25ms/step - accuracy: 0.9868 - loss: 0.0500
Epoch 8/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 25ms/step - accuracy: 0.9897 - loss: 0.0370
Epoch 9

<keras.src.callbacks.history.History at 0x26da72b9720>

In [41]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Model Accuracy : {accuracy * 100}')

[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9032 - loss: 0.4854
Model Accuracy : 89.76377844810486


In [42]:
pred = np.argmax(model.predict(X_test), axis = -1)

[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step


In [43]:
y_test[:5]

array([1, 1, 0, 1, 2], dtype=int64)

In [44]:
pred[:5]

array([1, 1, 0, 1, 2], dtype=int64)

In [45]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.91      0.92      0.91      3812
           1       0.92      0.90      0.91      3807
           2       0.75      0.79      0.77       890

    accuracy                           0.90      8509
   macro avg       0.86      0.87      0.87      8509
weighted avg       0.90      0.90      0.90      8509



In [46]:
print(confusion_matrix(y_test, pred))

[[3514  197  101]
 [ 252 3421  134]
 [ 103   84  703]]


In [47]:
pickle.dump(model,open('model.pkl','wb'))