### Importing Packages

In [243]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, SimpleRNN
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import *

### Downloading the resources

In [10]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Muthu
[nltk_data]     Palaniappan M\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Muthu Palaniappan
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Loading Data

In [2]:
data = pd.read_csv("labeled_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [5]:
data = data[['tweet','class']]

In [6]:
data

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,2
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1
...,...,...
24778,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,1
24779,"you've gone and broke the wrong heart baby, an...",2
24780,young buck wanna eat!!.. dat nigguh like I ain...,1
24781,youu got wild bitches tellin you lies,1


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   24783 non-null  object
 1   class   24783 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 387.4+ KB


### TweetProcessing

```word_tokenize``` has split ```#dummysmiley as '#' and 'dummysmiley'```, while ```TweetTokenizer``` didn't, as ```'#dummysmiley'```. TweetTokenizer is built mainly for analyzing tweets.

In [16]:
stop_words = set(stopwords.words('english'))
len(stop_words)

179

In [17]:
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()

#### Tokenize

In [71]:
tokens = tokenizer.tokenize("Can't believe it! @Muthupa just posted this amazing #welcome to snuc 🤩 https://snu.com #excited")
tokens                            

["Can't",
 'believe',
 'it',
 '!',
 '@Muthupa',
 'just',
 'posted',
 'this',
 'amazing',
 '#welcome',
 'to',
 'snuc',
 '🤩',
 'https://snu.com',
 '#excited']

#### Removing url's and stopwords

In [43]:
tokens = [re.sub(r'https?://\S+|www\.\S+', '', token) for token in tokens]
tokens

["Can't",
 'believe',
 'it',
 '!',
 '@Muthupa',
 'just',
 'posted',
 'this',
 'amazing',
 '#welcome',
 'to',
 'snuc',
 '🤩',
 '',
 '#excited']

In [44]:
tokens = [token for token in tokens if token not in stop_words]
tokens

["Can't",
 'believe',
 '!',
 '@Muthupa',
 'posted',
 'amazing',
 '#welcome',
 'snuc',
 '🤩',
 '',
 '#excited']

#### Lemmatize

In [35]:
tokens = [lemmatizer.lemmatize(token) for token in tokens]
tokens

["Can't",
 'believe',
 '!',
 '@Muthupa',
 'posted',
 'amazing',
 '#welcome',
 'snuc',
 '🤩',
 '',
 '#excited']

#### Emoji Mapping

In [58]:
tokens = [emoji.demojize(token) for token in tokens]
tokens

["Can't",
 'believe',
 '!',
 '@Muthupa',
 'posted',
 'amazing',
 '#welcome',
 'snuc',
 ':star-struck:',
 '',
 '#excited']

#### Removing Userhandles

In [67]:
tokens = [token for token in tokens if not token.startswith('@')]
tokens

["Can't",
 'believe',
 '!',
 'posted',
 'amazing',
 '#welcome',
 'snuc',
 ':star-struck:',
 '',
 '#excited']

In [68]:
processed_tweet = ' '.join(tokens)
processed_tweet

"Can't believe ! posted amazing #welcome snuc :star-struck:  #excited"

In [113]:
def preprocess_tweet(tweet):
    tokens = tokenizer.tokenize(tweet)
    tokens = [token.lower() for token in tokens]
    tokens = [re.sub(r'https?://\S+|www\.\S+', '', token) for token in tokens]
    tokens = [re.sub(r'\W', ' ', token) for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [emoji.demojize(token) for token in tokens]
    contractions = {
        "can't": "cannot",
        "won't": "will not",
    }
    tokens = [contractions[token] if token in contractions else token for token in tokens]
    tokens = [token for token in tokens if not token.startswith('@')]
    processed_tweet = ' '.join(tokens)
    
    return processed_tweet

#### Pre-Processing Function

In [72]:
tweet_pre_process("Can't believe it! @Muthupa just posted this amazing #welcome to snuc 🤩 https://snu.com #excited")

"can't believe ! posted amazing #welcome snuc :star-struck:  #excited"

In [114]:
data['pre_process_tweet'] = data['tweet'].apply(preprocess_tweet)

In [118]:
data

Unnamed: 0,tweet,class,pre_process_tweet
0,!!! RT @mayasolovely: As a woman you shouldn't...,2,rt woman complain cleaning house man always ta...
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,rt boy dat cold tyga dwn bad cuffin dat hoe 1s...
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,rt dawg rt 0sbaby4life ever fuck bitch start c...
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,rt look like tranny
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,rt shit hear might true might faker bitch told ya
...,...,...,...
24778,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,1,muthaf lie right tl trash mine bible scripture...
24779,"you've gone and broke the wrong heart baby, an...",2,gone broke wrong heart baby drove redneck crazy
24780,young buck wanna eat!!.. dat nigguh like I ain...,1,young buck wanna eat dat nigguh like aint fuck...
24781,youu got wild bitches tellin you lies,1,youu got wild bitch tellin lie


## Word2Vec - CBOW

In [212]:
cbow = Word2Vec(data['pre_process_tweet'].values.tolist(), vector_size=100, window=5, min_count=2, sg=0)
vocab = cbow.wv.index_to_key

In [213]:
def get_mean_vector(model, sentence):
    words = [word for word in sentence if word in vocab]
    if len(words) >= 1:
        return np.mean(model.wv[words], axis=0)
    return np.zeros((100,))

In [214]:
cbow_array = []
for sentence in data['pre_process_tweet'].values.tolist():
    cbow_array.append(get_mean_vector(cbow, sentence))

In [216]:
len(cbow_array)

24782

## Word2Vec - Skipgram

In [219]:
sg = Word2Vec(data['pre_process_tweet'].values.tolist(), vector_size=100, window=5, min_count=2, sg=1)
vocab = sg.wv.index_to_key

In [220]:
def get_mean_vector(model, sentence):
    words = [word for word in sentence if word in vocab]
    if len(words) >= 1:
        return np.mean(model.wv[words], axis=0)
    return np.zeros((100,))

In [221]:
sg_array = []
for sentence in data['pre_process_tweet'].values.tolist():
    sg_array.append(get_mean_vector(sg, sentence))

In [223]:
len(sg_array)

24782

## CNN - Representation

### Dataset_Preparation

In [175]:
tweets = data['pre_process_tweet'].values
labels = data['class'].values

In [176]:
tweets

array(['rt woman complain cleaning house man always take trash',
       'rt boy dat cold tyga dwn bad cuffin dat hoe 1st place',
       'rt dawg rt 0sbaby4life ever fuck bitch start cry confused shit',
       ..., 'young buck wanna eat dat nigguh like aint fuckin dis',
       'youu got wild bitch tellin lie',
       'ruffled ntac eileen dahlia beautiful color combination pink orange yellow white coll'],
      dtype=object)

In [177]:
labels

array([2, 1, 1, ..., 1, 1, 2], dtype=int64)

#### Tokenization and Padding

In [178]:
max_len = data['pre_process_tweet'].str.len().max()

In [179]:
max_len

138

In [180]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets)

In [181]:
tokenizer.word_docs

defaultdict(int,
            {'house': 130,
             'rt': 7156,
             'take': 317,
             'man': 529,
             'trash': 1115,
             'woman': 247,
             'always': 279,
             'cleaning': 6,
             'complain': 28,
             'dwn': 2,
             'tyga': 16,
             'boy': 261,
             'hoe': 4001,
             '1st': 37,
             'dat': 292,
             'cuffin': 10,
             'bad': 548,
             'cold': 64,
             'place': 75,
             '0sbaby4life': 29,
             'start': 158,
             'fuck': 1363,
             'confused': 27,
             'shit': 1221,
             'ever': 262,
             'bitch': 10687,
             'cry': 110,
             'dawg': 43,
             'tranny': 49,
             'like': 2595,
             'look': 562,
             'faker': 3,
             'ya': 432,
             'hear': 64,
             'told': 144,
             'true': 65,
             'might': 110,
          

In [182]:
sequences = tokenizer.texts_to_sequences(tweets)
sequences

[[2, 93, 886, 2969, 196, 28, 76, 61, 13],
 [2, 88, 67, 424, 1413, 5837, 27, 2069, 67, 3, 695, 370],
 [2, 618, 2, 860, 90, 10, 1, 169, 239, 925, 11],
 [2, 25, 4, 550],
 [2, 11, 419, 228, 412, 228, 4639, 1, 177, 36],
 [11, 681, 682, 1144, 394, 35, 21, 3],
 [404, 34, 156, 1, 12, 117, 11, 83],
 [69, 425, 107, 1, 395, 9, 573, 19],
 [228, 8, 36, 1, 38, 185],
 [2243, 3333, 1145, 5838, 1],
 [8182, 1, 1571, 302, 14, 1109, 1481, 4, 128],
 [3334, 1035, 1, 1035, 1572],
 [3, 390, 1414, 325, 18, 656],
 [27, 1, 99, 4],
 [1, 8],
 [1, 6, 339],
 [1, 1782, 574],
 [1, 17],
 [1, 8, 234, 575, 165],
 [114, 956, 27, 1],
 [180, 1, 279, 57, 174],
 [2244, 1, 4, 3335],
 [279, 37, 3, 986, 380],
 [10, 1, 70, 46, 200, 53, 4640, 349, 58, 10, 656],
 [12, 36, 1, 987, 4641, 8183, 1079],
 [5, 619, 4, 1671, 529],
 [3, 1036],
 [620, 5, 2245, 1783, 396, 5, 1231],
 [30, 3870, 1, 10, 8184],
 [1037, 79, 20, 1, 555],
 [3871, 123, 1, 72, 1, 5, 3872],
 [26, 39, 38, 123, 123, 1, 51, 8, 1232],
 [98, 58, 1289, 86, 5, 6, 157],
 [988,

In [183]:
tweet_data = pad_sequences(sequences,maxlen=max_len)
tweet_data

array([[    0,     0,     0, ...,    76,    61,    13],
       [    0,     0,     0, ...,     3,   695,   370],
       [    0,     0,     0, ...,   239,   925,    11],
       ...,
       [    0,     0,     0, ...,    98,    95,   230],
       [    0,     0,     0, ...,     1,  1660,   332],
       [    0,     0,     0, ...,    94,    49, 16840]])

In [184]:
tweet_data.shape

(24782, 138)

### Model Building

In [185]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

16841

In [186]:
cnn_representation_dim = 100

In [187]:
model = Sequential()
model.add(Embedding(vocab_size, cnn_representation_dim, input_length=max_len))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(3, activation='softmax'))

In [188]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 138, 100)          1684100   
                                                                 
 conv1d_2 (Conv1D)           (None, 134, 128)          64128     
                                                                 
 global_max_pooling1d_2 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_3 (Dense)             (None, 128)               16512     
                                                                 
 dense_4 (Dense)             (None, 3)                 387       
                                                                 
Total params: 1765127 (6.73 MB)
Trainable params: 1765127 (6.73 MB)
Non-trainable params: 0 (0.00 Byte)
________________

In [189]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

### Training and Get Representation for Down Stream Task

In [192]:
model.fit(tweet_data, labels, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x178ecceca90>

### Getting Represenation From the Model

In [203]:
cnn_embeddings = model.layers[0].get_weights()[0]
cnn_embeddings

array([[-0.02070224, -0.00217342, -0.01971866, ...,  0.00169   ,
        -0.00793593,  0.00185298],
       [ 0.1213275 , -0.11898055, -0.06785338, ..., -0.08902466,
        -0.08240429,  0.14615555],
       [-0.03989461, -0.0157681 ,  0.04168139, ...,  0.01872528,
        -0.02950534, -0.01898918],
       ...,
       [ 0.02673351,  0.02214083, -0.01881211, ..., -0.04175155,
         0.02514503,  0.03081172],
       [-0.01907891, -0.01436325,  0.02728608, ...,  0.02839203,
        -0.00134127, -0.00377864],
       [-0.01459378,  0.0241019 , -0.03598715, ..., -0.04778757,
        -0.00159583, -0.04600607]], dtype=float32)

In [207]:
cnn_embeddings.shape

(16841, 100)

## Downstream Evaluation

In [224]:
def train_and_evaluate_decision_tree(x_train, x_test, y_train, y_test, representation):
    
    dtclassifier = DecisionTreeClassifier(random_state=9,max_depth=5)
    dtclassifier.fit(x_train, y_train)
    y_pred = dtclassifier.predict(x_test)

    print(f"\nMetrics for {representation}:")
    print(f"Model Score: {dtclassifier.score(x_train,y_train)}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

In [228]:
def get_dataset_prep(array,labels):
    x_train,x_test,y_train,y_test = train_test_split(array,labels,test_size=0.2,random_state=2)
    return x_train,x_test,y_train,y_test

### Decision Tree - CBOW

In [229]:
x_train,x_test,y_train,y_test = get_dataset_prep(cbow_array,labels)

In [237]:
train_and_evaluate_decision_tree(x_train, x_test, y_train, y_test, "DT-CBOW")


Metrics for DT:
Model Score: 0.7830517023959647
Accuracy: 0.7732499495662699
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.03      0.05       279
           1       0.78      0.99      0.87      3835
           2       0.45      0.05      0.09       843

    accuracy                           0.77      4957
   macro avg       0.59      0.35      0.34      4957
weighted avg       0.71      0.77      0.69      4957



### Decision Tree - Skipgram

In [238]:
x_train,x_test,y_train,y_test = get_dataset_prep(sg_array,labels)

In [239]:
train_and_evaluate_decision_tree(x_train, x_test, y_train, y_test, "DT-SkipGram")


Metrics for DT-SkipGram:
Model Score: 0.7810844892812105
Accuracy: 0.7758725035303611
Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.03      0.05       279
           1       0.79      0.98      0.87      3835
           2       0.50      0.12      0.19       843

    accuracy                           0.78      4957
   macro avg       0.57      0.37      0.37      4957
weighted avg       0.72      0.78      0.71      4957



## RNN - Representation

In [240]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

16841

In [241]:
rnn_representation_dim = 100

In [244]:
model = Sequential()
model.add(Embedding(vocab_size, rnn_representation_dim, input_length=max_len))
model.add(SimpleRNN(32,return_sequences=False))
model.add(Dense(128, activation='relu'))
model.add(Dense(3, activation='softmax'))

In [245]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 138, 100)          1684100   
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                4256      
                                                                 
 dense_5 (Dense)             (None, 128)               4224      
                                                                 
 dense_6 (Dense)             (None, 3)                 387       
                                                                 
Total params: 1692967 (6.46 MB)
Trainable params: 1692967 (6.46 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [246]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

### Model Training

In [247]:
model.fit(tweet_data, labels, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x178f2f7dd90>

### Getting Representation from Model

In [248]:
rnn_embeddings = model.layers[0].get_weights()[0]
rnn_embeddings

array([[ 0.08894386,  0.05566104,  0.0040675 , ...,  0.00197502,
        -0.05583239, -0.00615073],
       [-0.0493452 , -0.17989728, -0.08420745, ...,  0.27601382,
        -0.00838685,  0.21541148],
       [-0.01406219, -0.05131751, -0.01463335, ...,  0.03848261,
         0.00777662,  0.00761374],
       ...,
       [-0.01450269,  0.02662939, -0.01659643, ..., -0.02704675,
         0.04928735, -0.00095978],
       [ 0.02304536, -0.00220132,  0.02080033, ..., -0.01050389,
        -0.02484881, -0.00413143],
       [-0.04524095,  0.01009665,  0.04885234, ...,  0.03640784,
        -0.01080083, -0.01088579]], dtype=float32)