In [1]:
!pip install nltk



In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

import os
os.chdir('gdrive/My Drive/NNDL - Spring 00/Mini Project 2')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import numpy as np
import pandas as pd

## Part 1. Resolving the Unbalancedness

In [4]:
dataset = pd.read_csv('sentiment_final.csv')
print(dataset['sentiment'].value_counts())

Negative    8493
Positive    2236
Name: sentiment, dtype: int64


In [5]:
dataset_copy = dataset[dataset['sentiment']=='Positive']
print(dataset_copy)

                                                    text sentiment
0      RT @ScottWalker: Didn't catch the full #GOPdeb...  Positive
1      RT @RobGeorge: That Carly Fiorina is trending ...  Positive
2      RT @DanScavino: #GOPDebate w/ @realDonaldTrump...  Positive
3      RT @GregAbbott_TX: @TedCruz: "On my first day ...  Positive
8      RT @WayneDupreeShow: Just woke up to tweet thi...  Positive
...                                                  ...       ...
10717  Best line of #GOPDebate was "Immigration witho...  Positive
10720  RT @RWSurferGirl: Trump has got it right, nobo...  Positive
10725  RT @georgehenryw: Who thought Huckabee exceede...  Positive
10726  RT @Lrihendry: #TedCruz As President, I will a...  Positive
10728  RT @Lrihendry: #TedCruz headed into the Presid...  Positive

[2236 rows x 2 columns]


In [6]:
os_list = []
for i in range(len(dataset_copy)):
  inst_loc = np.random.randint(len(dataset_copy), size=1)
  os_list.append([dataset_copy['text'].iloc[inst_loc[0]], dataset_copy['sentiment'].iloc[inst_loc[0]]])

os_list = pd.DataFrame(os_list, columns=['text', 'sentiment'])

In [7]:
dataset_neg_us = dataset[dataset['sentiment']=='Negative']
print(dataset_neg_us)

                                                    text sentiment
4      RT @warriorwoman91: I liked her and was happy ...  Negative
5      Deer in the headlights RT @lizzwinstead: Ben C...  Negative
6      RT @NancyOsborne180: Last night's debate prove...  Negative
7      @JGreenDC @realDonaldTrump In all fairness #Bi...  Negative
9      Me reading my family's comments about how grea...  Negative
...                                                  ...       ...
10721  So trans soldiers can die for you Huckabee but...  Negative
10722  RT @RWSurferGirl: Is it just me or does anyone...  Negative
10723  RT @RWSurferGirl: Fox is cherry picking the ca...  Negative
10724  RT @cappy_yarbrough: Love to see men who will ...  Negative
10727  RT @JRehling: #GOPDebate Donald Trump says tha...  Negative

[8493 rows x 2 columns]


In [8]:
for i in range(int(0.5*len(dataset_neg_us))):
  inst_loc = np.random.randint(len(dataset_neg_us), size=1)
  dataset_neg_us.drop(index=dataset_neg_us.index[inst_loc[0]], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [9]:
dataset_copy = dataset_copy.append(os_list)
dataset_copy = dataset_copy.append(dataset_neg_us)

In [10]:
print(dataset_copy)

                                                    text sentiment
0      RT @ScottWalker: Didn't catch the full #GOPdeb...  Positive
1      RT @RobGeorge: That Carly Fiorina is trending ...  Positive
2      RT @DanScavino: #GOPDebate w/ @realDonaldTrump...  Positive
3      RT @GregAbbott_TX: @TedCruz: "On my first day ...  Positive
8      RT @WayneDupreeShow: Just woke up to tweet thi...  Positive
...                                                  ...       ...
10713  while pro-life nonsense @ the #GOPDebate was n...  Negative
10714  GOP respects life....just not black ones. #GOP...  Negative
10719  RT @RWSurferGirl: Why should @realDonaldTrump ...  Negative
10721  So trans soldiers can die for you Huckabee but...  Negative
10724  RT @cappy_yarbrough: Love to see men who will ...  Negative

[8719 rows x 2 columns]


## Part 2. Pre-processing

In [11]:
import nltk
import string
import re

nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('punkt')
from nltk.tokenize import word_tokenize

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
def tokenize(sentences, stop_words):
  lowercases = sentences.lower() #setting all words to lowercase
  nopunc = "".join([word for word in lowercases if word not in string.punctuation]) #Removing punctuations
  alphanum = re.sub('\W+', ' ', nopunc) #Removing non-alphaumeric characters
  tokens = nltk.word_tokenize(alphanum) # tokenizing
  tokens = [word for word in tokens if word not in stop_words] #Removing stopwords
  tokens = [word for word in tokens if '@' not in word] # Removing the '@'s
  tokens = [word for word in tokens if 'http' not in word] #Removing URLs
  tokens = [word for word in tokens if word not in ['rt', 'gopdebate'] and len(word)>1]

  return tokens

In [13]:
 dataset_copy['tokens'] = dataset_copy['text'].map(lambda x: tokenize(x,stop_words))
 clean_dataset = dataset_copy.loc[dataset_copy.tokens.map(lambda x: len(x)>0),['text', 'sentiment', 'tokens']]

In [14]:
print(clean_dataset)

                                                    text  ...                                             tokens
0      RT @ScottWalker: Didn't catch the full #GOPdeb...  ...  [scottwalker, didnt, catch, full, last, night,...
1      RT @RobGeorge: That Carly Fiorina is trending ...  ...  [robgeorge, carly, fiorina, trending, hours, d...
2      RT @DanScavino: #GOPDebate w/ @realDonaldTrump...  ...  [danscavino, realdonaldtrump, delivered, highe...
3      RT @GregAbbott_TX: @TedCruz: "On my first day ...  ...  [gregabbotttx, tedcruz, first, day, rescind, e...
8      RT @WayneDupreeShow: Just woke up to tweet thi...  ...  [waynedupreeshow, woke, tweet, best, line, nig...
...                                                  ...  ...                                                ...
10713  while pro-life nonsense @ the #GOPDebate was n...  ...  [prolife, nonsense, amusing, notion, jizzisape...
10714  GOP respects life....just not black ones. #GOP...  ...  [gop, respects, lifejust, black, 

In [15]:
clean_dataset['sentences'] = clean_dataset.tokens.map(lambda x: " ".join(word for word in x))
print(clean_dataset['sentences'])

0        scottwalker didnt catch full last night scotts...
1        robgeorge carly fiorina trending hours debate ...
2        danscavino realdonaldtrump delivered highest r...
3        gregabbotttx tedcruz first day rescind every i...
8        waynedupreeshow woke tweet best line night via...
                               ...                        
10713    prolife nonsense amusing notion jizzisaperson ...
10714          gop respects lifejust black ones gopdebates
10719    rwsurfergirl realdonaldtrump pledge support go...
10721    trans soldiers die huckabee cant foot bill mak...
10724    cappyyarbrough love see men never faced pregna...
Name: sentences, Length: 8719, dtype: object


In [16]:
print(clean_dataset['sentences'].iloc[0])

scottwalker didnt catch full last night scotts best lines 90 seconds walker16


## Part 3. Using Word Embedding

In [17]:
clean_dataset['sentiment'] = clean_dataset['sentiment'].astype('category')
clean_dataset['sentiment_enc'] = clean_dataset['sentiment'].cat.codes
clean_dataset.head()

Unnamed: 0,text,sentiment,tokens,sentences,sentiment_enc
0,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive,"[scottwalker, didnt, catch, full, last, night,...",scottwalker didnt catch full last night scotts...,1
1,RT @RobGeorge: That Carly Fiorina is trending ...,Positive,"[robgeorge, carly, fiorina, trending, hours, d...",robgeorge carly fiorina trending hours debate ...,1
2,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive,"[danscavino, realdonaldtrump, delivered, highe...",danscavino realdonaldtrump delivered highest r...,1
3,"RT @GregAbbott_TX: @TedCruz: ""On my first day ...",Positive,"[gregabbotttx, tedcruz, first, day, rescind, e...",gregabbotttx tedcruz first day rescind every i...,1
8,RT @WayneDupreeShow: Just woke up to tweet thi...,Positive,"[waynedupreeshow, woke, tweet, best, line, nig...",waynedupreeshow woke tweet best line night via...,1


In [31]:
def pad_words(review, seq_len):
  review_len = len(review)
  if review_len <= seq_len:
    return np.append(review, seq_len-review_len *np.zeros((64,64)))
  
  elif review_len < seq_len:
    return review[0:seq_len]
  

In [32]:
clean_dataset['padvec'] = clean_dataset['tokens'].map(lambda x: pad_words(x, 10))

In [33]:
from gensim.models import Word2Vec
tokenized_docs = clean_dataset['tokens']
model = Word2Vec(sentences=tokenized_docs, size=64, seed=42, workers=1)

In [34]:
def vectorize(docs, model):

  zerovec = np.zeros(model.vector_size)
  sentence_words = []
  for word in docs:
    if word in model.wv:
      try:
        sentence_words.append(model.wv[word])
      except KeyError:
        continue
    
  if sentence_words:
    return sentence_words
  else:
    return zerovec

In [None]:
clean_dataset['vectors'] = clean_dataset['padvec'].map(lambda x: vectorize(x, model))

In [35]:
print(clean_dataset.padvec.iloc[0])

None


In [26]:
print(len(clean_dataset.padvec.iloc[0]))

TypeError: ignored