In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
# nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Working with text

In [2]:
twitter_data = pd.read_csv("data/text/twitter_training.csv", header = None)
twitter_data.columns = ["tweet_id","entity","sentiment","content"]

In [3]:
twitter_data.describe()

Unnamed: 0,tweet_id
count,74682.0
mean,6432.586165
std,3740.42787
min,1.0
25%,3195.0
50%,6422.0
75%,9601.0
max,13200.0


In [4]:
twitter_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   74682 non-null  int64 
 1   entity     74682 non-null  object
 2   sentiment  74682 non-null  object
 3   content    73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [5]:
twitter_data.tweet_id.unique()

array([2401, 2402, 2403, ..., 9198, 9199, 9200], shape=(12447,))

In [6]:
twitter_data.drop_duplicates()

Unnamed: 0,tweet_id,entity,sentiment,content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [7]:
with open("data/text/twitter_training.csv", "r") as file:
    print(file.read()[:1000])

2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
2401,Borderlands,Positive,"I am coming to the borders and I will kill you all,"
2401,Borderlands,Positive,"im getting on borderlands and i will kill you all,"
2401,Borderlands,Positive,"im coming on borderlands and i will murder you all,"
2401,Borderlands,Positive,"im getting on borderlands 2 and i will murder you me all,"
2401,Borderlands,Positive,"im getting into borderlands and i can murder you all,"
2402,Borderlands,Positive,So I spent a few hours making something for fun. . . If you don't know I am a HUGE @Borderlands fan and Maya is one of my favorite characters. So I decided to make myself a wallpaper for my PC. . Here is the original image versus the creation I made :) Enjoy! pic.twitter.com/mLsI5wf9Jg
2402,Borderlands,Positive,"So I spent a couple of hours doing something for fun... If you don't know that I'm a huge @ Borderlands fan and Maya is one of my favorite characters, I decided to make a 

In [8]:
twitter_data.content.str.len().sort_values(ascending = False)

70940    957.0
28994    727.0
18128    692.0
3098     692.0
36308    692.0
         ...  
73972      NaN
73973      NaN
74421      NaN
74422      NaN
74423      NaN
Name: content, Length: 74682, dtype: float64

In [9]:
twitter_data.loc[70940, "content"]

'The event dedicated to Victory Day in the Great Patriotic War was held as part of the celebration of the 70th anniversary of Victory in the Great Patriotic War of 1941-1945, which was attended by veterans of the Great Patriotic War, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home front workers, home'

In [10]:
twitter_data.loc[73972, "content"]

nan

In [11]:
twitter_data = twitter_data.dropna()

In [12]:
twitter_data.content.str.len().mean()

np.float64(108.78365046759285)

In [13]:
twitter_data.content.str.len().median()

np.float64(91.0)

In [14]:
for sentiment, group_data in twitter_data.groupby("sentiment"):
    print(sentiment, group_data.content.str.len().median())

Irrelevant 93.0
Negative 91.0
Neutral 105.0
Positive 74.0


In [15]:
twitter_data.loc[0]

tweet_id                                                  2401
entity                                             Borderlands
sentiment                                             Positive
content      im getting on borderlands and i will murder yo...
Name: 0, dtype: object

In [16]:
Counter(twitter_data.loc[0].content)

Counter({' ': 10,
         'l': 5,
         'i': 4,
         'n': 4,
         'r': 4,
         'd': 4,
         'e': 3,
         'o': 3,
         'a': 3,
         'm': 2,
         'g': 2,
         't': 2,
         'u': 2,
         'b': 1,
         's': 1,
         'w': 1,
         'y': 1,
         ',': 1})

In [17]:
all_text = " ".join(twitter_data.content.ravel())

  all_text = " ".join(twitter_data.content.ravel())


In [18]:
Counter(all_text.lower())

Counter({' ': 1435996,
         'e': 673351,
         't': 536088,
         'o': 500274,
         'a': 499912,
         'i': 435070,
         'n': 405716,
         's': 388728,
         'r': 334275,
         'l': 270617,
         'h': 264166,
         'd': 222901,
         'c': 190216,
         'm': 188691,
         '.': 181075,
         'u': 181008,
         'g': 165566,
         'y': 153245,
         'p': 142483,
         'f': 132779,
         'w': 128290,
         'b': 114378,
         'k': 80140,
         'v': 76885,
         '/': 34279,
         ',': 32037,
         '!': 29426,
         '@': 28914,
         'x': 24664,
         '2': 22155,
         "'": 21852,
         '0': 21412,
         'j': 21285,
         'z': 16576,
         '1': 15421,
         '-': 14175,
         '?': 13284,
         ':': 10528,
         '3': 9948,
         '’': 9342,
         '5': 9092,
         '4': 8177,
         'q': 7180,
         '6': 6845,
         '7': 6486,
         '9': 6074,
         '"': 5489,

In [19]:
words = all_text.split(" ")

In [20]:
Counter(words)

Counter({'the': 38242,
         'to': 27711,
         'I': 25184,
         'and': 24646,
         'a': 21849,
         'of': 18621,
         'is': 16472,
         'for': 14708,
         '/': 14655,
         'in': 14081,
         '': 13526,
         '@': 11081,
         'on': 11068,
         'this': 10534,
         '.': 10193,
         'my': 10120,
         'it': 9649,
         'you': 9466,
         'that': 9023,
         'with': 8286,
         'have': 6250,
         'so': 5993,
         'be': 5721,
         'game': 5696,
         'just': 5679,
         'are': 5526,
         'but': 5524,
         'not': 5382,
         'me': 5360,
         'was': 5138,
         'The': 5005,
         'at': 4699,
         '-': 4650,
         'all': 4561,
         'like': 4318,
         'from': 4067,
         'your': 3786,
         'out': 3747,
         'i': 3731,
         'get': 3517,
         'has': 3514,
         'as': 3410,
         'about': 3292,
         '2': 3097,
         'an': 3050,
         '&': 3

In [21]:
stopwords.words("english")

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [22]:
twitter_data.content.str.split("\s+") # tokenization

  twitter_data.content.str.split("\s+") # tokenization


0        [im, getting, on, borderlands, and, i, will, m...
1        [I, am, coming, to, the, borders, and, I, will...
2        [im, getting, on, borderlands, and, i, will, k...
3        [im, coming, on, borderlands, and, i, will, mu...
4        [im, getting, on, borderlands, 2, and, i, will...
                               ...                        
74677    [Just, realized, that, the, Windows, partition...
74678    [Just, realized, that, my, Mac, window, partit...
74679    [Just, realized, the, windows, partition, of, ...
74680    [Just, realized, between, the, windows, partit...
74681    [Just, like, the, windows, partition, of, my, ...
Name: content, Length: 73996, dtype: object

In [23]:
twitter_data["words"] = twitter_data.content.str.split("\s+")

  twitter_data["words"] = twitter_data.content.str.split("\s+")


In [24]:
twitter_data.words.apply(lambda x: len(x)).sort_values(ascending = False)

1826     198
43712    198
32186    198
10454    198
68624    194
        ... 
30611      1
42245      1
42275      1
42215      1
9761       1
Name: words, Length: 73996, dtype: int64

In [26]:
twitter_data[twitter_data.content.str.contains("happy", case = False)].sentiment.value_counts()

sentiment
Positive      531
Neutral       281
Irrelevant    252
Negative      121
Name: count, dtype: int64

In [27]:
words

['im',
 'getting',
 'on',
 'borderlands',
 'and',
 'i',
 'will',
 'murder',
 'you',
 'all',
 ',',
 'I',
 'am',
 'coming',
 'to',
 'the',
 'borders',
 'and',
 'I',
 'will',
 'kill',
 'you',
 'all,',
 'im',
 'getting',
 'on',
 'borderlands',
 'and',
 'i',
 'will',
 'kill',
 'you',
 'all,',
 'im',
 'coming',
 'on',
 'borderlands',
 'and',
 'i',
 'will',
 'murder',
 'you',
 'all,',
 'im',
 'getting',
 'on',
 'borderlands',
 '2',
 'and',
 'i',
 'will',
 'murder',
 'you',
 'me',
 'all,',
 'im',
 'getting',
 'into',
 'borderlands',
 'and',
 'i',
 'can',
 'murder',
 'you',
 'all,',
 'So',
 'I',
 'spent',
 'a',
 'few',
 'hours',
 'making',
 'something',
 'for',
 'fun.',
 '.',
 '.',
 'If',
 'you',
 "don't",
 'know',
 'I',
 'am',
 'a',
 'HUGE',
 '@Borderlands',
 'fan',
 'and',
 'Maya',
 'is',
 'one',
 'of',
 'my',
 'favorite',
 'characters.',
 'So',
 'I',
 'decided',
 'to',
 'make',
 'myself',
 'a',
 'wallpaper',
 'for',
 'my',
 'PC.',
 '.',
 'Here',
 'is',
 'the',
 'original',
 'image',
 'versus

In [29]:
twitter_data.words.sample(20).apply(lambda words: [w.lower() for w in words if w not in stopwords.words("english")])

73684    [source, improving, valorant, hehe, (p.s, sorr...
52008    [great, list,, disappointed, one, even, mentio...
24694    [@nicolledwallace., this, stuff, really, disgu...
69189    [@cyberpunkgame., free, song, ., (bet, reply)....
9452     [at, time,, despite, fact, currently, 100, mil...
22541                                                   []
9182     [lets, go!!!, overwatch, good, vibes!!!, come,...
30791                                                   []
47145    [deal, of, the, day:, back, order, the, home, ...
47202    [anyways, guy, home, depot, winked, i’m, prett...
55934    [i, @, cipi, _, _, _, switch, finished, match,...
14768                                   [noob, indog, way]
3440     [sun:, rocking., -, co-stream, @, emgoood?, li...
25360    [everytime, @google, +, 2, sued,, i..., litera...
42362    [@, narendramodi, please, sir, request, please...
67958    [to, pass, time,, grab, exciting...., dark,, s...
70285                                     [literally, no

In [32]:
stem = PorterStemmer()

In [33]:
twitter_data.words.sample(20)

27717                       [Itching, will, assassinate..]
44157    [., ., ., @ImranKhanPTI., @ArifAlvi, /, @PTAof...
16467    [I, guess, we, haven’t, seen, the, price, of, ...
45592    [this, tour, is, lifeeeee, for, @aliciakeys, a...
5342     [@, CuriMax24, Thanks, for, entering, Grand, S...
21677    [TOP, Swedish, esports, card, Fnatic, just, si...
69584    ["Punk-2077", is, planning, to, release, an, a...
32688    [It, is, ABSOLUTELY, RIDICULOUS, that, prop, h...
54167    [Umm, @Activision, @CallofDuty, oh, @InfinityW...
51483    [Heads, up!, Going, to, be, streamed, live, to...
16788    [Well,, clear-transparent, panel, will, be, my...
31903    [I, don't, have, a, shortstack, problem,, I, c...
74613    [Nvidia, should, cancel, any, order, that, is,...
59496    [Facebook,, Twitter,, Google, threaten, to, su...
7622                  [I'll, be, on, air, in, 5, minutes.]
11258                             [About, fucking, time!!]
28186    [That, beam, pattern, was, disgusting.., more,.

In [51]:
twitter_data.words.loc[52]

['for',
 'Blaming',
 'Sight',
 'for',
 'Tardiness!',
 'A',
 'little',
 'bit',
 'of',
 'borderlands.',
 'I',
 'got',
 'called',
 'in',
 'early',
 'for',
 'work',
 'tomorrow',
 'so',
 'I',
 "can't",
 'make',
 'up',
 'time.',
 'Sorry',
 'my',
 'loves',
 '.',
 'twitch.tv/punnisenpai']

In [52]:
[stem.stem(w) for w in twitter_data.words.loc[52]]

['for',
 'blame',
 'sight',
 'for',
 'tardiness!',
 'a',
 'littl',
 'bit',
 'of',
 'borderlands.',
 'i',
 'got',
 'call',
 'in',
 'earli',
 'for',
 'work',
 'tomorrow',
 'so',
 'i',
 "can't",
 'make',
 'up',
 'time.',
 'sorri',
 'my',
 'love',
 '.',
 'twitch.tv/punnisenpai']

In [64]:
words_with_no_stopwords = twitter_data.words.apply(lambda words: [w.lower() for w in words if w not in stopwords.words("english")])


In [70]:
words_stammed_lower_no_stopwords = words_with_no_stopwords.apply(lambda words: [stem.stem(w) for w in words])

In [71]:
words_stammed_lower_no_stopwords

0                         [im, get, borderland, murder, ,]
1                         [i, come, border, i, kill, all,]
2                        [im, get, borderland, kill, all,]
3                     [im, come, borderland, murder, all,]
4                   [im, get, borderland, 2, murder, all,]
                               ...                        
74677    [just, realiz, window, partit, mac, like, 6, y...
74678    [just, realiz, mac, window, partit, 6, year, b...
74679    [just, realiz, window, partit, mac, 6, year, b...
74680    [just, realiz, window, partit, mac, like, 6, y...
74681    [just, like, window, partit, mac, like, 6, yea...
Name: words, Length: 73996, dtype: object

In [72]:
words_stammed_lower_no_stopwords.apply(lambda x: Counter(x))

0        {'im': 1, 'get': 1, 'borderland': 1, 'murder':...
1        {'i': 2, 'come': 1, 'border': 1, 'kill': 1, 'a...
2        {'im': 1, 'get': 1, 'borderland': 1, 'kill': 1...
3        {'im': 1, 'come': 1, 'borderland': 1, 'murder'...
4        {'im': 1, 'get': 1, 'borderland': 1, '2': 1, '...
                               ...                        
74677    {'just': 1, 'realiz': 1, 'window': 1, 'partit'...
74678    {'just': 1, 'realiz': 1, 'mac': 1, 'window': 1...
74679    {'just': 1, 'realiz': 1, 'window': 1, 'partit'...
74680    {'just': 1, 'realiz': 1, 'window': 1, 'partit'...
74681    {'just': 1, 'like': 2, 'window': 1, 'partit': ...
Name: words, Length: 73996, dtype: object

In [74]:
count_vectorizer = CountVectorizer()

In [76]:
count_vectorizer.fit(twitter_data.content)

In [78]:
count_vectorizer.vocabulary_

{'im': 14087,
 'getting': 11958,
 'on': 19552,
 'borderlands': 4660,
 'and': 2629,
 'will': 29921,
 'murder': 18421,
 'you': 30664,
 'all': 2439,
 'am': 2527,
 'coming': 6463,
 'to': 27558,
 'the': 27084,
 'borders': 4665,
 'kill': 15676,
 'me': 17490,
 'into': 14619,
 'can': 5317,
 'so': 25235,
 'spent': 25547,
 'few': 10681,
 'hours': 13706,
 'making': 17120,
 'something': 25331,
 'for': 11135,
 'fun': 11516,
 'if': 14010,
 'don': 8782,
 'know': 15795,
 'huge': 13779,
 'fan': 10409,
 'maya': 17432,
 'is': 14763,
 'one': 19561,
 'of': 19420,
 'my': 18498,
 'favorite': 10506,
 'characters': 5768,
 'decided': 7891,
 'make': 17112,
 'myself': 18520,
 'wallpaper': 29444,
 'pc': 20340,
 'here': 13309,
 'original': 19764,
 'image': 14090,
 'versus': 29058,
 'creation': 7176,
 'made': 17026,
 'enjoy': 9694,
 'pic': 20643,
 'twitter': 28190,
 'com': 6428,
 'mlsi5wf9jg': 18021,
 'couple': 7037,
 'doing': 8756,
 'that': 27072,
 'picture': 20656,
 'compared': 6534,
 'have': 13052,
 'rhandlerr': 

In [80]:
vectors = count_vectorizer.transform(twitter_data.content)

In [82]:
model = MultinomialNB()

In [85]:
model.fit(vectors, twitter_data.sentiment)

In [88]:
tfidf = TfidfVectorizer()

In [92]:
tfidf.fit(twitter_data.content)

In [93]:
tfidf.vocabulary_

{'im': 14087,
 'getting': 11958,
 'on': 19552,
 'borderlands': 4660,
 'and': 2629,
 'will': 29921,
 'murder': 18421,
 'you': 30664,
 'all': 2439,
 'am': 2527,
 'coming': 6463,
 'to': 27558,
 'the': 27084,
 'borders': 4665,
 'kill': 15676,
 'me': 17490,
 'into': 14619,
 'can': 5317,
 'so': 25235,
 'spent': 25547,
 'few': 10681,
 'hours': 13706,
 'making': 17120,
 'something': 25331,
 'for': 11135,
 'fun': 11516,
 'if': 14010,
 'don': 8782,
 'know': 15795,
 'huge': 13779,
 'fan': 10409,
 'maya': 17432,
 'is': 14763,
 'one': 19561,
 'of': 19420,
 'my': 18498,
 'favorite': 10506,
 'characters': 5768,
 'decided': 7891,
 'make': 17112,
 'myself': 18520,
 'wallpaper': 29444,
 'pc': 20340,
 'here': 13309,
 'original': 19764,
 'image': 14090,
 'versus': 29058,
 'creation': 7176,
 'made': 17026,
 'enjoy': 9694,
 'pic': 20643,
 'twitter': 28190,
 'com': 6428,
 'mlsi5wf9jg': 18021,
 'couple': 7037,
 'doing': 8756,
 'that': 27072,
 'picture': 20656,
 'compared': 6534,
 'have': 13052,
 'rhandlerr': 