<a href="https://colab.research.google.com/github/Gaurav-phatkare/BERT_Model-NLP/blob/main/Twitter_Sentiment_Analysis_Using_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
!pip install kaggle



In [3]:
!mkdir ~/.kaggle

In [4]:
! cp kaggle.json ~/.kaggle/

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
!kaggle datasets download -d kazanova/sentiment140

Downloading sentiment140.zip to /content
100% 80.9M/80.9M [00:03<00:00, 30.7MB/s]
100% 80.9M/80.9M [00:03<00:00, 27.7MB/s]


In [7]:
!unzip /content/sentiment140.zip

Archive:  /content/sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [54]:
df = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding='latin-1', header = None)

In [55]:
df.columns=['Sentiment', 'id', 'Date', 'Query', 'User', 'Tweet']

In [56]:
# Discard the metadata columns; only the sentiment label and the tweet text
# are used from here on.
unused_columns = ['id', 'Date', 'Query', 'User']
df = df.drop(columns=unused_columns)

In [57]:
df.head()

Unnamed: 0,Sentiment,Tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [58]:
# df = df.rename(columns={df.columns[0]: 'target', df.columns[1]: 'id', df.columns[2]: 'date', df.columns[3]: 'query', df.columns[4]: 'username', df.columns[5]: 'content'})

In [59]:
df.shape

(1600000, 2)

In [60]:
# df = df[['target','content']]

In [61]:
# Human-readable label for each sentiment code. The raw data encodes positive
# as 4; a later cell recodes it to 1, so 1 is mapped as well to keep this cell
# correct if it is re-run after that recoding.
df['labels'] = df['Sentiment'].replace([0, 1, 4], ['Negative', 'Positive', 'Positive'])

In [62]:
df.Sentiment.value_counts()

0    800000
4    800000
Name: Sentiment, dtype: int64

In [63]:
df['Sentiment'] = df.Sentiment.replace(4,1)

In [64]:
df.Sentiment.value_counts()

0    800000
1    800000
Name: Sentiment, dtype: int64

# Data Cleaning

In [65]:
import re

hashtags = re.compile(r"^#\S+|\s#\S+")
# `mentions` and `urls` are not used by process_text below; they are kept
# because other cells may reference them.
mentions = re.compile(r"^@\S+|\s@\S+")
urls = re.compile(r"https?://\S+")

# Patterns applied by process_text, compiled once instead of on every call.
_HTTP_TOKEN = re.compile(r"http\S+")
_AT_TOKEN = re.compile(r"@\S+")


def process_text(text):
    """Clean one tweet for modelling.

    Steps, in order:
      1. Remove every run of non-space characters that starts with ``http``.
      2. Replace each hashtag (``#`` at the start of the text or right after
         whitespace) with the literal token `` hashtag``.
      3. Remove every ``@`` together with the non-space characters following
         it. This also strips the part of an e-mail address from ``@`` onward.
      4. Strip surrounding whitespace and lower-case the result.

    Interior whitespace left behind by removed tokens is not collapsed.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned, lower-cased tweet.
    """
    text = _HTTP_TOKEN.sub('', text)
    text = hashtags.sub(' hashtag', text)
    # A separate "https?://" pass is unnecessary here: step 1 already removed
    # every token beginning with "http".
    text = _AT_TOKEN.sub('', text)
    return text.strip().lower()

In [66]:
df['Tweet'] = df.Tweet.apply(process_text)

In [67]:
df.head()

Unnamed: 0,Sentiment,Tweet,labels
0,0,"- awww, that's a bummer. you shoulda got davi...",Negative
1,0,is upset that he can't update his facebook by ...,Negative
2,0,i dived many times for the ball. managed to sa...,Negative
3,0,my whole body feels itchy and like its on fire,Negative
4,0,"no, it's not behaving at all. i'm mad. why am ...",Negative


In [68]:
df['Tweet'][-5:].values

array(['just woke up. having no school is the best feeling ever',
       'thewdb.com - very cool to hear old walt interviews!  â\x99«',
       'are you ready for your mojo makeover? ask me for details',
       'happy 38th birthday to my boo of alll time!!! tupac amaru shakur',
       'happy hashtag'], dtype=object)

In [69]:
# Keep only the first row for each distinct cleaned tweet text.
df = df.drop_duplicates(subset='Tweet')

In [70]:
df.shape

(1541247, 3)

In [71]:
# Number of whitespace-separated words in each cleaned tweet.
text_len = [len(tweet.split()) for tweet in df.Tweet]

In [72]:
df['text_len'] = text_len

In [73]:
df.head()

Unnamed: 0,Sentiment,Tweet,labels,text_len
0,0,"- awww, that's a bummer. you shoulda got davi...",Negative,17
1,0,is upset that he can't update his facebook by ...,Negative,21
2,0,i dived many times for the ball. managed to sa...,Negative,17
3,0,my whole body feels itchy and like its on fire,Negative,10
4,0,"no, it's not behaving at all. i'm mad. why am ...",Negative,20


In [77]:
# Keep only tweets with more than four whitespace-separated words
# (text_len holds the per-tweet word count).
df = df[df['text_len'] > 4]

In [78]:
df.shape

(1394620, 4)

In [79]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
training_data, testing_data = train_test_split(df, test_size=0.2, random_state=25)

In [80]:
training_data.shape, testing_data.shape

((1115696, 4), (278924, 4))

In [85]:
import tensorflow as tf
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.3 MB/s[0m eta [36m0:00:0

In [86]:
from transformers import BertTokenizerFast
from transformers import TFBertModel

In [87]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [88]:
# Length of the encoded token-id sequence for every training tweet
# (truncation caps each sequence at 512 ids).
train_tweets = training_data['Tweet'].values
token_lens = [
    len(tokenizer.encode(tweet, max_length=512, truncation=True))
    for tweet in train_tweets
]

# Longest encoded training tweet.
max_len = np.max(token_lens)

In [89]:
max_len

212

In [90]:
# training_data is a row selection of df produced by train_test_split; adding
# a column to such a frame with item assignment can raise pandas'
# SettingWithCopyWarning. assign() returns a new frame with the column instead.
training_data = training_data.assign(token_lens=token_lens)

In [104]:
training_df = training_data.sort_values(by='token_lens', ascending=False)

In [114]:
training_df[:1000]

Unnamed: 0,Sentiment,Tweet,labels,text_len,token_lens
709142,0,à¶¶à¶½à·?à¶à·à¶± à·à¶»à· à¶ºà¶±à·à¶±à· à...,Negative,20,212
1188208,1,à¤à¤¾à¤¨ à¤à¤¤à¤¾ à¥¨ à¤¦à¤¿à¤µà¤¸ à¤¸à¥?à¤...,Positive,25,175
443960,0,firefly ñ?ñð¾ ðºð°ðºð°ñ?-ñð¾ ñ?ð¾ð²ðµññðµð...,Negative,19,173
325649,0,"ð´ð¸ñ?ðº ð½ðµ ð¿ñð¾ð²ðµññ?ð», ð½ðµ ð·ð½ð°ñ....",Negative,21,170
368825,0,ð¥ð¼ð¼... ð¯ ð±ñ ñð¾ð¶ðµ ð½ðµ ð¾ñðºð°ð·ð°ð»...,Negative,20,162
...,...,...,...,...,...
738105,0,gd owes me hugs..... ..... ugh no its okay ...,Negative,25,59
969284,1,"secondly, i saw the funniest musical ever...&q...",Positive,25,59
1579263,1,cheaper comics from today. $3.99 comics now â£...,Positive,23,59
696353,0,just cut my hair!omg!so shortt.haha.so ur not ...,Negative,22,59


In [115]:
# Drop the 1,500 training tweets with the most tokens. The slice is taken from
# a fresh sort of training_data rather than from training_df itself, so
# re-running the cell does not remove a further 1,500 rows each time.
training_df = training_data.sort_values(by='token_lens', ascending=False).iloc[1500:]

In [117]:
training_df.shape

(1114196, 5)

In [118]:
# Shuffle the rows (they are sorted by token count at this point) and renumber
# the index. The fixed random_state makes the shuffle reproducible.
training_df = training_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [121]:
# Length of the encoded token-id sequence for every test tweet
# (truncation caps each sequence at 512 ids).
test_tweets = testing_data['Tweet'].values
token_lens_test = [
    len(tokenizer.encode(tweet, max_length=512, truncation=True))
    for tweet in test_tweets
]

# NOTE(review): this rebinds max_len, replacing the value computed from the
# training tweets - confirm later cells expect the test-set maximum.
max_len = np.max(token_lens_test)

In [122]:
max_len

230

In [127]:
# testing_data is a row selection of df produced by train_test_split; adding a
# column to such a frame with item assignment can raise pandas'
# SettingWithCopyWarning. assign() returns a new frame with the column instead.
testing_data = testing_data.assign(token_lens=token_lens_test)

In [130]:
testing_df = testing_data.sort_values(by='token_lens', ascending=False)
testing_df.head(1500)

Unnamed: 0,Sentiment,Tweet,labels,text_len,token_lens
1582941,1,5 days till new top gear î?î?î?î?î?î?î?...,Positive,7,230
1324105,1,ð?ðµ ð¼ð¾ð³ñ ñ?ðºð°ð·ð°ññ ð¾ ñð¾ð¿-ð¼ðµð½ð...,Positive,21,172
1325631,1,ññ ñ?ðºð¾ñðµðµ ð¿ð¾ñðµññ?ðµññ ð¸ð»ð¸ ð¿...,Positive,18,167
1484492,1,ð´ð°ð¶ðµ ð¸ ð½ðµ ð¿ð¾ð´ð¾ð·ñðµð²ð°ð¹ ð¼ðµð½ñ?...,Positive,20,163
248973,0,"ð?ññ, openoffice ð²ñð±ðµñ?ð¸ð» ñ?ð²ð¾ðµð¹ ð...",Negative,17,159
...,...,...,...,...,...
880751,1,me: &quot;oh hey we are having a laptop party!...,Positive,21,48
374573,0,woohoo-ing over the good news i just got and b...,Negative,26,48
976412,1,"oui oui and i saw you, and i said to bec 'hey...",Positive,29,48
1068392,1,"omg, i saw the &quot;trailer&quot; of new moon...",Positive,27,48


In [133]:
# Drop the 1,500 test tweets with the most tokens. The slice is taken from a
# fresh sort of testing_data rather than from testing_df itself, so re-running
# the cell does not remove a further 1,500 rows each time.
testing_df = testing_data.sort_values(by='token_lens', ascending=False).iloc[1500:]
testing_df.head(10)

Unnamed: 0,Sentiment,Tweet,labels,text_len,token_lens
503464,0,finnaly back home but still got school tommoro...,Negative,29,44
40700,0,i can almost see it.. that dream i'm dreaming....,Negative,21,44
864955,1,prom was crackin'! thanks bubba!!! ria/brina +...,Positive,21,44
418257,0,lol i wish i could tell ya'll wat was wrong..b...,Negative,28,44
1519351,1,"yay, the ide 2.5/3.5/sata -&gt; usb device i g...",Positive,25,44
120354,0,i think sumbody got in my twitter shit...an ha...,Negative,26,44
101592,0,"slept 10 hours,but still feel tired...lost 8 p...",Negative,23,44
1199151,1,you hot stuff i don't blame l.a. for loving s...,Positive,21,44
545985,0,i miss rene.. damn! 4 fucking hours from mende...,Negative,25,44
1431657,1,ohh i knoww i'm not on my computerr in on my p...,Positive,33,44


In [134]:
# Shuffle the rows (they are sorted by token count at this point) and renumber
# the index. The fixed random_state makes the shuffle reproducible.
testing_df = testing_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [136]:
testing_df.head(10)

Unnamed: 0,Sentiment,Tweet,labels,text_len,token_lens
0,0,(shh..) as much as i hate where the exchange r...,Negative,24,34
1,0,i'm bummed i'm gonna miss you rocking out with...,Negative,10,19
2,0,"shakin', sore throat, feel like vomiting..... ...",Negative,8,23
3,0,wants to kick all my assignments in the face.....,Negative,16,23
4,0,doesn't want this night to end it means i hav...,Negative,13,17
5,0,you're gonna hate my guts for the email i'm ab...,Negative,14,23
6,1,awww baby you know that's the sacrifice for fa...,Positive,18,32
7,1,it's two thousand and *what*? omgomgomgomg i'm...,Positive,11,25
8,1,yes. very inspiring indeed. left or right eye?,Positive,8,13
9,1,wait....lol good luck today hahaha love you h...,Positive,8,20


In [137]:
training_df.Sentiment.value_counts()

0    568761
1    545435
Name: Sentiment, dtype: int64

In [138]:
X = training_df['Tweet'].values
y = training_df['Sentiment'].values

In [139]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

In [140]:
X_test = testing_df['Tweet'].values
y_test = testing_df['Sentiment'].values

In [145]:
X_train.shape, X_val.shape, X_test.shape

((1002776,), (111420,), (275919,))

# One-Hot Encoding

In [143]:
y_train_le = y_train.copy()
y_valid_le = y_val.copy()
y_test_le = y_test.copy()

In [144]:
from sklearn import preprocessing

# One-hot encode the integer labels.
# The encoder is fitted on the training labels only and then reused for the
# validation and test labels, so all three arrays share one column layout.
# Inputs are the untouched *_le copies, which keeps the cell re-runnable:
# y_train and y_test are no longer re-encoded from their own previous output.
OneHot = preprocessing.OneHotEncoder()
y_train = OneHot.fit_transform(np.asarray(y_train_le).reshape(-1, 1)).toarray()
y_valid = OneHot.transform(np.asarray(y_valid_le).reshape(-1, 1)).toarray()
y_test = OneHot.transform(np.asarray(y_test_le).reshape(-1, 1)).toarray()

# Baseline Model With Naive Bayes and TF-IDF

In [146]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


# Baseline pipeline: TF-IDF features ('tfidf') feeding a multinomial
# Naive Bayes classifier ('clf'), both with default settings.

model_0 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf',MultinomialNB())
])

# Fit on the training tweets using the integer labels (y_train_le), not the
# one-hot encoded y_train.

model_0.fit(X_train, y_train_le)

In [147]:
# Validation accuracy of the baseline. The model was fitted on integer labels
# (y_train_le), so it is scored against the matching integer copy
# (y_valid_le) rather than relying on y_val not having been one-hot encoded.
baseline_score = model_0.score(X_val, y_valid_le)

In [148]:
baseline_score

0.7642523783880811

In [149]:
test_pred = model_0.predict(X_test)

In [152]:
y_test_le

array([0, 0, 0, ..., 0, 1, 1])

In [154]:
# make helper function for accuracy precision recall and f1 score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score

print('\tClassification Report for Naive Bayes:\n\n',classification_report(y_test_le, test_pred, target_names = ['Negative', 'Positive']))




	Classification Report for Naive Bayes:

               precision    recall  f1-score   support

    Negative       0.76      0.80      0.78    141235
    Positive       0.78      0.73      0.75    134684

    accuracy                           0.77    275919
   macro avg       0.77      0.76      0.77    275919
weighted avg       0.77      0.77      0.77    275919

