In [None]:
import tensorflow as tf
import numpy as np

!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
# Fetch the NLTK data packages used by this notebook.
nltk.download('wordnet')    # data behind WordNetLemmatizer
nltk.download('omw-1.4')    # NOTE(review): presumably required alongside wordnet by the lemmatizer — confirm
nltk.download('punkt')      # NOTE(review): no use of punkt is visible in this notebook — confirm it is needed
nltk.download('stopwords')  # data behind stopwords.words('english')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import zipfile
import shutil
import os

root_dir = '/content'
data_dir = '/content/corona'

# Always start from an empty extraction directory so stale files from a
# previous run cannot linger.
if os.path.exists(data_dir):
  shutil.rmtree(data_dir)

# The archive lives on the mounted Drive below root_dir.
zip_path = os.path.join(root_dir,'drive/MyDrive/data/corona_tweets.zip')
with zipfile.ZipFile(zip_path,'r') as archive:
  archive.extractall(data_dir)


In [None]:
import pandas as pd

# Read the training CSV and keep only the tweet text and its sentiment label.
train_csv = os.path.join(data_dir,'Corona_NLP_train.csv')
train_data = pd.read_csv(train_csv,encoding='latin-1')[['OriginalTweet','Sentiment']]


In [None]:
# Peek at the first five rows to sanity-check the two selected columns.
print(train_data.head(5))


                                       OriginalTweet           Sentiment
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...             Neutral
1  advice Talk to your neighbours family to excha...            Positive
2  Coronavirus Australia: Woolworths to give elde...            Positive
3  My food stock is not the only one which is emp...            Positive
4  Me, ready to go at supermarket during the #COV...  Extremely Negative


In [None]:
# (rows, columns) of the training frame.
print(train_data.shape)


(41157, 2)


In [None]:
# Number of tweets per sentiment label, to check class balance.
print(train_data.groupby('Sentiment').count())


                    OriginalTweet
Sentiment                        
Extremely Negative           5481
Extremely Positive           6624
Negative                     9917
Neutral                      7713
Positive                    11422


In [None]:
# Integer encoding of the five sentiment classes, ordered from most positive (0)
# to most negative (4).
sentiment_to_id = {'Extremely Positive': 0,'Positive': 1,
                   'Neutral': 2, 'Negative': 3, 'Extremely Negative': 4}

# Cast explicitly rather than relying on `replace` to downcast the object
# column to integers: that implicit downcast is deprecated in recent pandas,
# and without it the labels would remain `object` dtype. The cast also fails
# loudly on any label string missing from the mapping instead of letting it
# through unnoticed.
train_data['Sentiment'] = train_data['Sentiment'].replace(sentiment_to_id).astype('int64')

print(train_data['Sentiment'].head(5))

0    2
1    1
2    1
3    1
4    4
Name: Sentiment, dtype: int64


In [None]:
# Missing values per column, followed by a blank separator line.
null_counts = train_data.isnull().sum()
print(null_counts,'\n')

# Count rows whose tweet text repeats an earlier row (True) versus not (False).
duplicate_mask = train_data.duplicated(subset = ['OriginalTweet'])
print(duplicate_mask.value_counts())

# Keep only the first occurrence of each tweet text.
train_data = train_data.drop_duplicates(subset = 'OriginalTweet')

OriginalTweet    0
Sentiment        0
dtype: int64 

False    41157
dtype: int64


In [None]:
# Read the test CSV and keep only the tweet text and its sentiment label.
test_data = pd.read_csv(os.path.join(data_dir,'Corona_NLP_test.csv'),encoding='latin-1')
test_data = test_data[['OriginalTweet','Sentiment']]

# Integer encoding of the five sentiment classes, ordered from most positive (0)
# to most negative (4).
sentiment_to_id = {'Extremely Positive': 0,'Positive': 1,
                   'Neutral': 2, 'Negative': 3, 'Extremely Negative': 4}

# Cast explicitly rather than relying on `replace` to downcast the object
# column to integers: that implicit downcast is deprecated in recent pandas,
# and without it the labels would remain `object` dtype. The cast also fails
# loudly on any label string missing from the mapping.
test_data['Sentiment'] = test_data['Sentiment'].replace(sentiment_to_id).astype('int64')

# Keep only the first occurrence of each tweet text.
test_data.drop_duplicates(subset = 'OriginalTweet',inplace = True)


In [None]:
from sklearn.model_selection import train_test_split
# Candidate random seeds; the split below uses the third entry.
rs = [45,35,120,37,434,293]
split_seed = rs[2]

# Hold out 10% of the training tweets for validation, stratified on the label.
train_data,val_data = train_test_split(
    train_data,
    test_size = 0.1,
    random_state = split_seed,
    stratify = train_data['Sentiment'],
)

# Renumber the rows of every split from 0, discarding the old index.
train_data = train_data.reset_index(drop = True)
val_data = val_data.reset_index(drop = True)
test_data = test_data.reset_index(drop = True)

print(train_data.shape,val_data.shape,test_data.shape)

(37041, 2) (4116, 2) (3798, 2)


In [None]:
#preprocess

import re

def preprocessing(text):
  """Strip URLs, @mentions, #hashtags and punctuation from a raw tweet.

  Args:
    text: Raw tweet text (str).

  Returns:
    The tweet with everything removed except word characters (letters,
    digits, underscore), apostrophes, periods and spaces. Spaces around
    removed items are left in place, so consecutive spaces can remain.
  """
  # Turn line breaks and tabs into a space first; otherwise the final
  # character filter would delete them and glue neighbouring words together.
  text = re.sub(r'[^\S ]+', ' ', text)
  # Remove the whole URL (http or https) up to the next whitespace. The
  # previous pattern stopped one character into the path and left the rest
  # of the link in the text.
  text = re.sub(r'https?://\S+', '', text)
  text = re.sub(r'@\S+', '', text)  # @mentions
  text = re.sub(r'#\S+', '', text)  # #hashtags
  # Raw string: the old non-raw pattern relied on invalid escape sequences.
  text = re.sub(r"[^\w'. ]", '', text)

  return text

# Show the raw tweets first so they can be compared with the cleaned ones below.
print(train_data.tail(20),'\n')

# Clean the tweet text of every split with the same function.
for frame in (train_data, val_data, test_data):
  frame['OriginalTweet'] = frame['OriginalTweet'].apply(preprocessing)

print(train_data.tail(20))


                                           OriginalTweet  Sentiment
37021  While there may be short term effect due to lo...          0
37022  MOL, a Hungary based lube manufacturer, has tr...          2
37023  From our COVid19 resource guide, Interim Guida...          1
37024  Critter Sitters is now offering Grocery Shoppi...          1
37025  List of aisles empty at the grocery store: 1) ...          3
37026  #Germany #Coronavirus\r\r\n\r\r\nConsumer advo...          4
37027  So I wiped the cart down but then I touched th...          0
37028  These #charges are never in favour of the cons...          3
37029  Chicago consumers be aware of consumer fraud r...          4
37030  ?? Weaker consumer confidence \r\r\n?? High un...          4
37031  sigh RIP online shopping for the next few mont...          2
37032  Covid-19 lockdown sucks. But these gas prices,...          3
37033  @Bogs4NY They're escaping NY and invading thei...          1
37034  President Trump just said, ÂWe donÂt ha

In [None]:
from transformers.pipelines import text_classification
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from transformers import BertTokenizer

# Candidate pretrained checkpoints; index 0 is the one loaded below.
pre_train = ['bert-base-uncased','bert-large-uncased']

# Shared, module-level helpers used by the functions in this cell.
Tokenizer = BertTokenizer.from_pretrained(pre_train[0])
LemMatizer = WordNetLemmatizer()
# NOTE: this must run before `def stopwords` further down, which rebinds the
# name `stopwords` from the imported NLTK corpus to a function.
Stopword = stopwords.words('english')

def bertTokenizer(text):
  """Split `text` into tokens with the module-level BERT `Tokenizer`."""
  return Tokenizer.tokenize(text)

def LemMatize(text):
  """Return a new list with each token of `text` lemmatized by `LemMatizer`."""
  return [LemMatizer.lemmatize(token) for token in text]

def clear_noise(text):
  """Drop tokens that are exactly one ASCII letter, digit or period.

  All other tokens (longer ones, or other single characters) are kept in
  their original order.
  """
  single_char = re.compile('[a-zA-Z0-9.]')
  return [token for token in text if single_char.fullmatch(token) is None]

def stopwords(text):
  """Return the tokens of `text` that are not in the module-level `Stopword` list."""
  # Note: this function shares its name with the NLTK `stopwords` corpus
  # imported at the top of the cell and replaces that binding once defined.
  return [token for token in text if token not in Stopword]

def preprocess(text):
  """Tokenize one tweet with `bertTokenizer` and return the token list.

  The remaining steps are switched off and kept only for reference:
    #text = LemMatize(text)
    #text = clear_noise(text)
    #text = stopwords(text)
  """
  tokens = bertTokenizer(text)
  return tokens

def make_sentence(text):
  """Join an iterable of string tokens into one space-separated string."""
  separator = ' '
  return separator.join(text)


In [None]:

# Replace each tweet string with its token list, in all three splits.
for frame in (train_data, val_data, test_data):
  frame['OriginalTweet'] = frame['OriginalTweet'].apply(preprocess)

print(train_data['OriginalTweet'].head(20),'\n')


In [None]:
def to_id(text):
  """Map a list of tokens to their vocabulary ids using the module-level `Tokenizer`."""
  return Tokenizer.convert_tokens_to_ids(text)

# Replace each token list with the matching list of vocabulary ids, in all three splits.
for frame in (train_data, val_data, test_data):
  frame['OriginalTweet'] = frame['OriginalTweet'].apply(to_id)

print(train_data['OriginalTweet'])

In [None]:
#padding

from tensorflow.keras.utils import pad_sequences

# Number of token ids in every training tweet.
applen = train_data.OriginalTweet.map(len)
max_len = applen.max()
avg_len = applen.mean()
print(max_len)
print(avg_len)

# Pad every split at the front to the length of the longest training tweet.
pad_options = dict(maxlen = max_len, padding = 'pre')
x_train = pad_sequences(train_data.OriginalTweet, **pad_options)
x_val = pad_sequences(val_data.OriginalTweet, **pad_options)
x_test = pad_sequences(test_data.OriginalTweet, **pad_options)

print(x_train)

In [None]:

# Integer class labels of each split, in the same row order as the padded inputs.
y_train = train_data['Sentiment']
y_val = val_data['Sentiment']
y_test = test_data['Sentiment']


In [None]:

from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.layers import Dense, Activation, LSTM, Embedding, GRU

# The embedding table is sized from the tokenizer vocabulary; the output layer
# gets one unit per distinct sentiment label.
word_count = Tokenizer.vocab_size
label_count = len(train_data['Sentiment'].unique())
print(word_count)
print(label_count)

# Embedding -> single recurrent layer -> softmax classifier.
model = Sequential()
model.add(Embedding(word_count+1,64,input_length = max_len,embeddings_regularizer = regularizers.L2(0.002)))
# Alternative recurrent layer, currently disabled:
#model.add(LSTM(140,dropout = 0.2,activation='tanh'))
model.add(GRU(140,dropout = 0.2,activation='tanh'))
model.add(Dense(label_count,activation = 'softmax'))


In [None]:

# Labels are integer class ids, hence the sparse cross-entropy loss.
adam = tf.keras.optimizers.Adam(learning_rate = 1e-4)
model.compile(optimizer = adam,
              loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy'])

model.summary()


In [None]:

from tensorflow.keras.callbacks import EarlyStopping , ReduceLROnPlateau

# Both callbacks watch the validation loss.
earlystop = EarlyStopping(
    monitor = 'val_loss',
    patience = 4,
    restore_best_weights = True,
)
reduce_lr = ReduceLROnPlateau(
    monitor = 'val_loss',
    patience = 2,
    factor = 0.2,
    cooldown = 5,
)

training_callbacks = [earlystop,reduce_lr]
model.fit(
    x_train,
    y_train,
    validation_data = (x_val,y_val),
    epochs = 20,
    batch_size = 32,
    callbacks = training_callbacks,
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f76314d3950>

In [None]:

# Score the trained model on the test split; with the compile settings above
# the result is [loss, accuracy].
model.evaluate(x_test,y_test)




[0.954431414604187, 0.711690366268158]