# Importing Python Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import string
import re
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
plt.style.use('ggplot')

## loading the data

In [None]:
data = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding = "ISO-8859-1", engine="python")
data.columns = ["label", "time", "date", "query", "username", "text"]

## Exploratory data analysis

In [None]:
data.head()

Unnamed: 0,label,time,date,query,username,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [None]:
data.tail()

Unnamed: 0,label,time,date,query,username,text
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599998,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [None]:
data.columns

Index(['label', 'time', 'date', 'query', 'username', 'text'], dtype='object')

In [None]:
# length of data
print('lenght of data is', len(data))


lenght of data is 1599999


In [None]:
#shape of data
data.shape

(1599999, 6)

In [None]:
#data information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   label     1599999 non-null  int64 
 1   time      1599999 non-null  int64 
 2   date      1599999 non-null  object
 3   query     1599999 non-null  object
 4   username  1599999 non-null  object
 5   text      1599999 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [None]:
#checking null values
np.sum(data.isnull().any(axis=1))

0

# Data preprossing

In [None]:
#Selecting the text and label coloumn

data=data[['text','label']]

In [None]:
#Assigning 1 to Positive sentment 4
data['label'][data['label']==4]=1

In [None]:
data["label"].unique()

array([0, 1], dtype=int64)

In [None]:
#Separating positive and negative tweets
data_pos = data[data['label'] == 1]
data_neg = data[data['label'] == 0]

In [None]:
#taking one fourth data so we can run on our machine easily

data_pos = data_pos.iloc[:int(20000)]
data_neg = data_neg.iloc[:int(20000)]



In [None]:
#Combining positive and negative tweets

data = pd.concat([data_pos, data_neg])



In [None]:
#Making statement text in lower case

data['text']=data['text'].str.lower()



In [None]:
data['text']

799999         i love @health4uandpets u guys r the best!! 
800000    im meeting up with one of my besties tonight! ...
800001    @darealsunisakim thanks for the twitter add, s...
800002    being sick can be really cheap when it hurts t...
800003      @lovesbrooklyn2 he has that effect on everyone 
                                ...                        
19995                             one more day of holidays 
19996     feeling so down right now .. i hate you damn h...
19997     geez,i hv to read the whole book of personalit...
19998     i threw my sign at donnie and he bent over to ...
19999     @heather2711 good thing i didn't find any then...
Name: text, Length: 40000, dtype: object

In [None]:
#Cleaning and removing Stop words of english

stopwords_list = stopwords.words('english')



In [None]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [None]:
#Cleaning and removing the above stop words list from the tweet text

STOPWORDS = set(stopwords.words('english'))
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
data['text'] = data['text'].apply(lambda text: cleaning_stopwords(text))
data['text'].head()



799999                love @health4uandpets u guys r best!!
800000    im meeting one besties tonight! cant wait!! - ...
800001    @darealsunisakim thanks twitter add, sunisa! g...
800002    sick really cheap hurts much eat real food plu...
800003                      @lovesbrooklyn2 effect everyone
Name: text, dtype: object

In [None]:
#Cleaning and removing punctuations


english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

In [None]:
data['text']= data['text'].apply(lambda x: cleaning_punctuations(x))
data['text']



799999                   love health4uandpets u guys r best
800000    im meeting one besties tonight cant wait  girl...
800001    darealsunisakim thanks twitter add sunisa got ...
800002    sick really cheap hurts much eat real food plu...
800003                       lovesbrooklyn2 effect everyone
                                ...                        
19995                                      one day holidays
19996                      feeling right  hate damn humprey
19997     geezi hv read whole book personality types emb...
19998      threw sign donnie bent get thingee made sad face
19999     heather2711 good thing find none ones like com...
Name: text, Length: 40000, dtype: object

In [None]:
#leaning and removing repeating characters

def cleaning_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1', text)

In [None]:
data['text'] = data['text'].apply(lambda x: cleaning_repeating_char(x))
data['text']

799999                   love health4uandpets u guys r best
800000    im meting one besties tonight cant wait girl talk
800001    darealsunisakim thanks twiter ad sunisa got me...
800002    sick realy cheap hurts much eat real fod plus ...
800003                         lovesbroklyn2 efect everyone
                                ...                        
19995                                      one day holidays
19996                        feling right hate damn humprey
19997     gezi hv read whole bok personality types embar...
19998        threw sign donie bent get thinge made sad face
19999     heather271 god thing find none ones like come ...
Name: text, Length: 40000, dtype: object

In [None]:
#Cleaning and removing email

def cleaning_email(data):
    return re.sub('@[^\s]+', ' ', data)



In [None]:
data['text']= data['text'].apply(lambda x: cleaning_email(x))
data['text']

799999                   love health4uandpets u guys r best
800000    im meting one besties tonight cant wait girl talk
800001    darealsunisakim thanks twiter ad sunisa got me...
800002    sick realy cheap hurts much eat real fod plus ...
800003                         lovesbroklyn2 efect everyone
                                ...                        
19995                                      one day holidays
19996                        feling right hate damn humprey
19997     gezi hv read whole bok personality types embar...
19998        threw sign donie bent get thinge made sad face
19999     heather271 god thing find none ones like come ...
Name: text, Length: 40000, dtype: object

In [None]:
#Cleaning and removing URL's

def cleaning_URLs(data):
    return re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',data)


In [None]:
data['text'] = data['text'].apply(lambda x: cleaning_URLs(x))
data['text']

799999                   love health4uandpets u guys r best
800000    im meting one besties tonight cant wait girl talk
800001    darealsunisakim thanks twiter ad sunisa got me...
800002    sick realy cheap hurts much eat real fod plus ...
800003                         lovesbroklyn2 efect everyone
                                ...                        
19995                                      one day holidays
19996                        feling right hate damn humprey
19997     gezi hv read whole bok personality types embar...
19998        threw sign donie bent get thinge made sad face
19999     heather271 god thing find none ones like come ...
Name: text, Length: 40000, dtype: object

In [None]:
#Cleaning and removing Numeric numbers

def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)



In [None]:
data['text'] = data['text'].apply(lambda x: cleaning_numbers(x))
data['text'].tail()

19995                                     one day holidays
19996                       feling right hate damn humprey
19997    gezi hv read whole bok personality types embar...
19998       threw sign donie bent get thinge made sad face
19999    heather god thing find none ones like come siz...
Name: text, dtype: object

In [None]:
#Getting tokenization of tweet text

tokenizer = RegexpTokenizer(r'\w+')
data['text'] = data['text'].apply(tokenizer.tokenize)



In [None]:
data['text']

799999             [love, healthuandpets, u, guys, r, best]
800000    [im, meting, one, besties, tonight, cant, wait...
800001    [darealsunisakim, thanks, twiter, ad, sunisa, ...
800002    [sick, realy, cheap, hurts, much, eat, real, f...
800003                      [lovesbroklyn, efect, everyone]
                                ...                        
19995                                  [one, day, holidays]
19996                  [feling, right, hate, damn, humprey]
19997     [gezi, hv, read, whole, bok, personality, type...
19998     [threw, sign, donie, bent, get, thinge, made, ...
19999     [heather, god, thing, find, none, ones, like, ...
Name: text, Length: 40000, dtype: object

In [None]:
# Initialize Lemmatizer
lm = nltk.WordNetLemmatizer()

# Function to apply Lemmatization
def lemmatizer_on_text(data):
    return [lm.lemmatize(word) for word in data]

# Applying Lemmatization to the text
data['text'] = data['text'].apply(lambda x: lemmatizer_on_text(x))

# Display the first 5 results after applying Lemmatization
print(data['text'].head())




799999              [love, healthuandpets, u, guy, r, best]
800000    [im, meting, one, besties, tonight, cant, wait...
800001    [darealsunisakim, thanks, twiter, ad, sunisa, ...
800002    [sick, realy, cheap, hurt, much, eat, real, fo...
800003                      [lovesbroklyn, efect, everyone]
Name: text, dtype: object


In [None]:
#Separating input feature and label

X=data.text
y=data.label

