# Twitter Sentiment Analysis



## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Loading and Viewing the dataset

### Load

In [2]:
colnames=['score', 'id', 'date', 'flag', 'user', 'tweet']
tweets_df = pd.read_csv("data/twitter.csv", encoding="latin-1", names=colnames)

### View

Top

In [3]:
tweets_df.head(20)

Unnamed: 0,score,id,date,flag,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


Bottom

In [4]:
tweets_df.tail(20)

Unnamed: 0,score,id,date,flag,user,tweet
1599980,4,2193578679,Tue Jun 16 08:38:56 PDT 2009,NO_QUERY,puchal_ek,@myheartandmind jo jen by nemuselo zrovna tÃ© ...
1599981,4,2193578716,Tue Jun 16 08:38:57 PDT 2009,NO_QUERY,youtubelatest,Another Commenting Contest! [;: Yay!!! http:/...
1599982,4,2193578739,Tue Jun 16 08:38:57 PDT 2009,NO_QUERY,Mandi_Davenport,@thrillmesoon i figured out how to see my twee...
1599983,4,2193578758,Tue Jun 16 08:38:57 PDT 2009,NO_QUERY,xoAurixo,"@oxhot theri tomorrow, drinking coffee, talkin..."
1599984,4,2193578847,Tue Jun 16 08:38:57 PDT 2009,NO_QUERY,RobFoxKerr,You heard it here first -- We're having a girl...
1599985,4,2193578982,Tue Jun 16 08:38:58 PDT 2009,NO_QUERY,LISKFEST,"if ur the lead singer in a band, beware fallin..."
1599986,4,2193579087,Tue Jun 16 08:38:58 PDT 2009,NO_QUERY,marhgil,@tarayqueen too much ads on my blog.
1599987,4,2193579092,Tue Jun 16 08:38:58 PDT 2009,NO_QUERY,cathriiin,@La_r_a NEVEER I think that you both will get...
1599988,4,2193579191,Tue Jun 16 08:38:59 PDT 2009,NO_QUERY,tellman,@Roy_Everitt ha- good job. that's right - we g...
1599989,4,2193579211,Tue Jun 16 08:38:59 PDT 2009,NO_QUERY,jazzstixx,@Ms_Hip_Hop im glad ur doing well


In [5]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   score   1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   tweet   1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [6]:
tweets_df.describe()

Unnamed: 0,score,id
count,1600000.0,1600000.0
mean,2.0,1998818000.0
std,2.000001,193576100.0
min,0.0,1467810000.0
25%,0.0,1956916000.0
50%,2.0,2002102000.0
75%,4.0,2177059000.0
max,4.0,2329206000.0


In [7]:
tweets_df.isnull().sum()

score    0
id       0
date     0
flag     0
user     0
tweet    0
dtype: int64

## Pre-Processing the data

We need to select which columns are more useful to get the results. After analysing the data, it was decided that the following columns are going to be used: **score** and **tweet**.

In [8]:
tweets_df = tweets_df.drop(['id', 'date', 'flag', 'user'], axis=1)
tweets_df = tweets_df[['tweet', 'score']]

In [10]:
tweets_df.head(20)

Unnamed: 0,tweet,score
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0
5,@Kwesidei not the whole crew,0
6,Need a hug,0
7,@LOLTrish hey long time no see! Yes.. Rains a...,0
8,@Tatiana_K nope they didn't have it,0
9,@twittera que me muera ?,0


In [11]:
tweets_df.tail(20)

Unnamed: 0,tweet,score
1599980,@myheartandmind jo jen by nemuselo zrovna tÃ© ...,4
1599981,Another Commenting Contest! [;: Yay!!! http:/...,4
1599982,@thrillmesoon i figured out how to see my twee...,4
1599983,"@oxhot theri tomorrow, drinking coffee, talkin...",4
1599984,You heard it here first -- We're having a girl...,4
1599985,"if ur the lead singer in a band, beware fallin...",4
1599986,@tarayqueen too much ads on my blog.,4
1599987,@La_r_a NEVEER I think that you both will get...,4
1599988,@Roy_Everitt ha- good job. that's right - we g...,4
1599989,@Ms_Hip_Hop im glad ur doing well,4


## Visualizing the data

In [12]:
positive = tweets_df[tweets_df['score'] == 0]
neutral = tweets_df[tweets_df['score'] == 2]
negative = tweets_df[tweets_df['score'] == 4]

### WordCloud

In [13]:
sentences = tweets_df['tweet'].tolist()
sentences_as_one_string = " ".join(sentences)

In [11]:
from wordcloud import WordCloud

plt.figure(figsize=(20,30))
plt.imshow((WordCloud().generate(sentences_as_one_string)))

### Countplot

In [18]:
sns.countplot(tweets_df['score'], label = "Count")

## Cleaning the data

### Punctuation

In [14]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Stopwords

In [15]:
import nltk # Natural Language tool kit 
nltk.download('stopwords')

# You have to download stopwords Package to execute this command
from nltk.corpus import stopwords
stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mobanju/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

### Pipeline for removing the punctuation and stopwords

In [16]:
def tweet_cleaning(tweet):
    tweet_punc_removed = [char for char in tweet if char not in string.punctuation]
    tweet_punc_removed_join = "".join(tweet_punc_removed)
    
    tweet_cleaned = [word for word in tweet_punc_removed_join.split() if word.lower() not in stopwords.words('english')]

    return tweet_cleaned

In [17]:
tweets_df_clean = tweets_df["tweet"].apply(tweet_cleaning)

In [18]:
print(tweets_df_clean[5])

['Kwesidei', 'whole', 'crew']


### Performing Count Vectorization

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer = tweet_cleaning, dtype = np.uint8)
tweets_countvectorizer = vectorizer.fit_transform(tweets_df['tweet'])

In [20]:
print(vectorizer.get_feature_names_out())

['0' '00' '000' ... 'ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½'
 'ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½2'
 'ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½xxxxxxxxx2123']


In [29]:
#vectorizer = CountVectorizer(analyzer=tweet_cleaning, dtype=np.uint8, max_features=500)
#tweets_countvectorizer = vectorizer.fit_transform(tweets_df['tweet'])
print(tweets_countvectorizer.toarray())

MemoryError: Unable to allocate 1.41 TiB for an array with shape (1600000, 969103) and data type uint8

In [27]:
tweets_countvectorizer.shape

(1600000, 969103)

In [26]:
desired_memory_size = 1e+3

X = np.empty((1600000, int(desired_memory_size)), dtype=np.uint8)


In [28]:
y = tweets_df['score']