# Assignment project 6 : Sentiment analysis on 1.6 million tweets.
This is the sentiment140 dataset. It contains 1,600,000 tweets extracted using the twitter api . The tweets have been annotated (0 = negative, 4 = positive) and they can be used to detect sentiment.

It contains the following 6 fields:

target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

ids: The id of the tweet ( 2087)

date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)

flag: The query (lyx). If there is no query, then this value is NO_QUERY. user: the user that tweeted (robotickilldozr)

text: the text of the tweet (Lyx is cool)

Dataset link : https://www.kaggle.com/kazanova/sentiment140

In [1]:
# lets first import all the neccessary libraries
import pandas as pd
import numpy as np
import re           # re= regular expression
import string

# methods and stop words text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')


# English stopwords

In [2]:
#  creating a stopwords set and store them to a variable 
nltk.download('stopwords')
stop_words= set(stopwords.words('english') )
stop_words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Manjula\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

# Load the dataset 

In [42]:
# reading dataset 
tweet_df= pd.read_csv('training.1600000.processed.noemoticon.csv', encoding= 'latin1',header= None )
tweet_df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [43]:
tweet_df.shape

(1600000, 6)

In [44]:
tweet_df[0].unique()

array([0, 4], dtype=int64)

In [45]:
tweet_df.drop(columns= [1,2,3,4], axis=1, inplace= True)

In [46]:
tweet_df.columns= ['Label', 'Tweets']
tweet_df.head()

Unnamed: 0,Label,Tweets
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [47]:
tweet_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   Label   1600000 non-null  int64 
 1   Tweets  1600000 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [48]:
tweet_df.Label.value_counts()              # here we have 80k negative and 80 postive tweets

0    800000
4    800000
Name: Label, dtype: int64

In [49]:
len(tweet_df)                  # this is too much of data and im not sure if my machine will be able to process it 

1600000

In [50]:
# i will first segregate the positive and negative tweets 
df_pos= tweet_df[tweet_df['Label'] == 4]
df_neg= tweet_df[tweet_df['Label'] == 0]
print(len(df_pos), len(df_neg))

800000 800000


In [51]:
# Only retaining 1/8th of our data from each output group coz im unsure about my machines power
df_pos = df_pos.iloc[:int(len(df_pos)/80)]
df_neg = df_neg.iloc[:int(len(df_neg)/80)]
print(len(df_pos), len(df_neg))


10000 10000


In [52]:
# concate both positive and negative tweets and store tehm back to one df
tweet_df1= pd.concat([df_pos, df_neg])
len(tweet_df1)

20000

In [53]:
tweet_df1.head(2)

Unnamed: 0,Label,Tweets
800000,4,I LOVE @Health4UandPets u guys r the best!!
800001,4,im meeting up with one of my besties tonight! ...


In [54]:
#  lets first separate our features and labels to x, y
x= tweet_df1['Tweets'].to_list()       # to_list will get teh text in list and not in df 
y= tweet_df1['Label'].to_list()

In [55]:
x[:5], y[:5]

(['I LOVE @Health4UandPets u guys r the best!! ',
  'im meeting up with one of my besties tonight! Cant wait!!  - GIRL TALK!!',
  '@DaRealSunisaKim Thanks for the Twitter add, Sunisa! I got to meet you once at a HIN show here in the DC area and you were a sweetheart. ',
  'Being sick can be really cheap when it hurts too much to eat real food  Plus, your friends make you soup',
  '@LovesBrooklyn2 he has that effect on everyone '],
 [4, 4, 4, 4, 4])

# Preprocessing the Tweet text 
1. Casing - handeling conversion of uppercase/lowercase
2. Noise Removal - unwanted charectors such as htmltags, punctuation marks, special char,white space etc
3. Tokenization - to convert all texts/tweets into tokens(all tokens would be words separated by spaces   
4. Stopwords removal - some words don't contribute much to ML model so remove them(they dont contain any important significance)
5. Text Normalization (Stemming/lemmatization)- this is based on stemming and lemmatization


In [56]:
# writing a function for preprocessing

def preprocess_tweet_text(tweet):
    #convert all text to lower case
    tweet= tweet.lower()
    
    #remove any urls
    tweet= re.sub(r"www\S+ |http\S+ |https\S+", "", tweet, flags=re.MULTILINE)
    
    #remove user @ reference and # from the tweet
    tweet= re.sub(r"\@\w+ |\#", "", tweet)
    
    #remove punctuations
    tweet= tweet.translate(str.maketrans("", "", string.punctuation))  # translate method will create a mapping table for punctuations with space
    
    #remove stopwords ; before removing stopwords we need to convert tweets to tokens
    tweet_tokens= word_tokenize(tweet)
    filterted_words= [word for word in tweet_tokens if word not in stop_words]
    
    #stemming 
    ps= PorterStemmer()
    stemmed_words= [ps.stem(word) for word in filterted_words]
    
    #lemmatizing 
    lammatizer= WordNetLemmatizer()
    lemma_words= [lammatizer.lemmatize(word, pos= 'a') for word in stemmed_words]
    
    return " ".join(lemma_words)
    
    
    

In [57]:
# the preprocessing is done lets see how the example works on it
preprocess_tweet_text("Hi arti, how are you preparing for mock interview?")


'hi arti prepar mock interview'

In [58]:
x[1]

'im meeting up with one of my besties tonight! Cant wait!!  - GIRL TALK!!'

In [59]:
preprocess_tweet_text(x[1])            # This is what i get after preprocess is done

'im meet one besti tonight cant wait girl talk'

In [67]:
tweet_df1['Cleaned_tweets']= tweet_df1['Tweets'].apply(preprocess_tweet_text)

# show the cleaned text 
tweet_df1.head()


Unnamed: 0,Label,Tweets,Cleaned_tweets
800000,4,I LOVE @Health4UandPets u guys r the best!!,love u guy r best
800001,4,im meeting up with one of my besties tonight! ...,im meet one besti tonight cant wait girl talk
800002,4,"@DaRealSunisaKim Thanks for the Twitter add, S...",thank twitter add sunisa got meet hin show dc ...
800003,4,Being sick can be really cheap when it hurts t...,sick realli cheap hurt much eat real food plu ...
800004,4,@LovesBrooklyn2 he has that effect on everyone,effect everyon


# Vectorization Tokens

In [68]:
# we need to convert the tokens to vectors coz machine can only read num data

from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
text_counts=cv.fit_transform(tweet_df1['Cleaned_tweets'])
text_counts



<20000x19375 sparse matrix of type '<class 'numpy.int64'>'
	with 141234 stored elements in Compressed Sparse Row format>

In [70]:
# text_counts[0].toarray()            # to_array will give the matrix 

<1x19375 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [69]:
tweet_df1['Label'].value_counts()

4    10000
0    10000
Name: Label, dtype: int64

In [86]:
# here X will be the features we extracted using vectorization

X= text_counts
Y= tweet_df1['Label']

Y=Y.map({0:0,4:1})                    # mapping 0-> 0 (negative), 4-> 1 (positive)
Y

800000    1
800001    1
800002    1
800003    1
800004    1
         ..
9995      0
9996      0
9997      0
9998      0
9999      0
Name: Label, Length: 20000, dtype: int64

In [87]:
Y.value_counts()

1    10000
0    10000
Name: Label, dtype: int64

In [88]:
# lets split X and Y using train test split. 

x_train, x_test, y_train, y_test= train_test_split(X, Y, test_size=0.2, random_state=50)

In [89]:
# x_train[:5], y_train[:5]         # just to compare the train and its result/target

In [107]:
x_test[:5], y_test[:5]

(<5x19375 sparse matrix of type '<class 'numpy.int64'>'
 	with 42 stored elements in Compressed Sparse Row format>,
 807808    1
 804702    1
 808693    1
 809085    1
 6689      0
 Name: Label, dtype: int64)

# Model Training 

In [92]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score

# traning 
model= RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
model.fit(x_train, y_train)



RandomForestClassifier(max_depth=2, random_state=0)

In [93]:
# testing 
y_pred= model.predict(x_test)
f1_score(y_test, y_pred)

0.6880072137060416

In [94]:
# lets check accuracy score
accuracy_score(y_test, y_pred)         


0.654

In [99]:
y_pred[0]  # this is positive tweet  ; 0-negative, 1-postive

1

In [101]:
y_pred[100]  # this is negative tweet

0

In [109]:
y_pred[:10] 

array([1, 1, 0, 0, 1, 0, 0, 0, 0, 1], dtype=int64)

In [108]:
y_test[:10]

807808    1
804702    1
808693    1
809085    1
6689      0
4093      0
2799      0
809439    1
7783      0
933       0
Name: Label, dtype: int64

In [None]:
# Note- I have just taken very small amount of data for minimizing the computational power. 
# One can use the entire dataset to and train it to increase the accuracy; can even chose a differnt 
# classifier algorithms