***Step 1: Preprocessing***

In [51]:
# Import libraries necessary for this step
import numpy as np
import pandas as pd
import json
import unicodedata
import sys
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [52]:
# Open the json file and save the data line by line in a list
data = []
with open("controversial-comments.jsonl", 'r') as f:
    for line in f:
       data.append(json.loads(line))

In [53]:
# Convert the list into a dataframe 
df = pd.DataFrame(data)
df.head()

Unnamed: 0,con,txt
0,0,Well it's great that he did something about th...
1,0,You are right Mr. President.
2,0,You have given no input apart from saying I am...
3,0,I get the frustration but the reason they want...
4,0,I am far from an expert on TPP and I would ten...


In [64]:
# Convert all text to lowercase
df['txt'] = df['txt'].str.lower()
df.head()

Unnamed: 0,con,txt
0,0,well it's great that he did something about th...
1,0,you are right mr. president.
2,0,you have given no input apart from saying i am...
3,0,i get the frustration but the reason they want...
4,0,i am far from an expert on tpp and i would ten...


In [65]:
# Create a dictionary of punctuation characters
punctuation = dict.fromkeys(i for i in range(sys.maxunicode)
                            if unicodedata.category(chr(i)).startswith('P'))

In [66]:
# Use the punctuation dictionary to remove all punctuation from text column
df['txt'] = [string.translate(punctuation) for string in df['txt']]
df.head()

Unnamed: 0,con,txt
0,0,well its great that he did something about tho...
1,0,you are right mr president
2,0,you have given no input apart from saying i am...
3,0,i get the frustration but the reason they want...
4,0,i am far from an expert on tpp and i would ten...


In [28]:
# Due to an error I received when trying to tokenize words I had to download punkt
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\myraw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [67]:
# To prepare for removing stopwords, I have to first tokenize the data
df['txt_tokenized'] = df['txt'].apply(word_tokenize)
df.head()

Unnamed: 0,con,txt,txt_tokenized
0,0,well its great that he did something about tho...,"[well, its, great, that, he, did, something, a..."
1,0,you are right mr president,"[you, are, right, mr, president]"
2,0,you have given no input apart from saying i am...,"[you, have, given, no, input, apart, from, say..."
3,0,i get the frustration but the reason they want...,"[i, get, the, frustration, but, the, reason, t..."
4,0,i am far from an expert on tpp and i would ten...,"[i, am, far, from, an, expert, on, tpp, and, i..."


In [31]:
# Now I have to download the set of stopwords since it is my first time using it
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\myraw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [68]:
# Load stopwords
stop_words = stopwords.words('english')

In [69]:
# Remove stop words
df['txt_tokenized'] = df['txt_tokenized'].apply(lambda x: [item for item in x if item not in stop_words])
df.head()

Unnamed: 0,con,txt,txt_tokenized
0,0,well its great that he did something about tho...,"[well, great, something, beliefs, office, doub..."
1,0,you are right mr president,"[right, mr, president]"
2,0,you have given no input apart from saying i am...,"[given, input, apart, saying, wrong, argument,..."
3,0,i get the frustration but the reason they want...,"[get, frustration, reason, want, way, foundati..."
4,0,i am far from an expert on tpp and i would ten...,"[far, expert, tpp, would, tend, agree, lot, pr..."


In [70]:
# To apply NLTK’s PorterStemmer, I must first create a stemmer
porter = PorterStemmer()

In [71]:
# Apply the stemmer to the tokenized text
df['txt_tokenized'] = df['txt_tokenized'].apply(lambda x: [porter.stem(word) for word in x])
df.head()

Unnamed: 0,con,txt,txt_tokenized
0,0,well its great that he did something about tho...,"[well, great, someth, belief, offic, doubt, tr..."
1,0,you are right mr president,"[right, mr, presid]"
2,0,you have given no input apart from saying i am...,"[given, input, apart, say, wrong, argument, cl..."
3,0,i get the frustration but the reason they want...,"[get, frustrat, reason, want, way, foundat, co..."
4,0,i am far from an expert on tpp and i would ten...,"[far, expert, tpp, would, tend, agre, lot, pro..."


In [20]:
# Save the preprocessed dataframe to a csv file for easy recall
df.to_csv('wk2_clean.csv')

***Step 2A: Word Count Vector***

In [1]:
# Import libraries necessary for this step
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load up preprocessed data
test = pd.read_csv('wk2_clean.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,con,txt,txt_tokenized
0,0,0,well its great that he did something about tho...,"['well', 'great', 'someth', 'belief', 'offic',..."
1,1,0,you are right mr president,"['right', 'mr', 'presid']"
2,2,0,you have given no input apart from saying i am...,"['given', 'input', 'apart', 'say', 'wrong', 'a..."
3,3,0,i get the frustration but the reason they want...,"['get', 'frustrat', 'reason', 'want', 'way', '..."
4,4,0,i am far from an expert on tpp and i would ten...,"['far', 'expert', 'tpp', 'would', 'tend', 'agr..."


In [3]:
# Create count vectorizer function
count = CountVectorizer()

In [4]:
# Change the tokenized text column into a string
test['txt_tokenized'] = test['txt_tokenized'].astype('str')

In [5]:
# Take a sample of rows
test_df = test.sample(frac = 0.2)

In [6]:
# Call wordCount function on the tokenized text
bag_of_words = count.fit_transform(test_df['txt_tokenized'])
bag_of_words

<190000x74709 sparse matrix of type '<class 'numpy.int64'>'
	with 3079001 stored elements in Compressed Sparse Row format>

In [9]:
# Below is from a previous run through were I only sampled 5 rows, so I could make sure everything worked correctly
# by displaying the dense matrix before processing a larger number of sample rows. 
arr = bag_of_words.toarray()
arr

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [11]:
# Also from the 5 row sample: Change into a pandas dataframe to view the pretty version with the column names
pd.DataFrame(arr, columns=count.get_feature_names())

Unnamed: 0,allow,candid,citizen,countri,dumb,go,last,legal,link,millennia,...,remov,republican,say,someth,these,trump,two,want,whole,wors
0,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,1,0,1,1,0,...,0,0,1,1,0,1,0,1,0,0
3,1,1,0,1,1,0,1,0,0,0,...,0,0,0,0,1,0,1,0,1,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


***Step 2B: Part of Speech Tag Vector***

In [49]:
# Import libraries necessary for this step
import numpy as np
import pandas as pd
import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer

In [18]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\myraw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [27]:
# Load up preprocessed data
test = pd.read_csv('wk2_clean.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,con,txt,txt_tokenized
0,0,0,well its great that he did something about tho...,"['well', 'great', 'someth', 'belief', 'offic',..."
1,1,0,you are right mr president,"['right', 'mr', 'presid']"
2,2,0,you have given no input apart from saying i am...,"['given', 'input', 'apart', 'say', 'wrong', 'a..."
3,3,0,i get the frustration but the reason they want...,"['get', 'frustrat', 'reason', 'want', 'way', '..."
4,4,0,i am far from an expert on tpp and i would ten...,"['far', 'expert', 'tpp', 'would', 'tend', 'agr..."


In [56]:
# Take a sample of rows
test_df = test.sample(frac = 0.000005)

In [4]:
# Use pretrained parts of speech tagger
text_tagged = pos_tag(test_df['txt_tokenized'])
text_tagged

[("['hit', 'nail', 'head', 'girlfriend', 'tell', 'dad', 'trump', 'support', 'peopl', 'give', 'cabinet', 'posit', 'totransit', 'team', 'posit', 'big', 'swamp', 'list', 'essenti', 'told', 'bias', 'news', 'cant', 'argu', 'anyth', 'someon', 'doesnt', 'like', 'untrustworthi', 'never', 'mind', 'opinion', 'even', 'given', 'list', 'peopl', 'appoint', 'jobscredentialsexperi']",
  'JJ'),
 ("['cnn', 'say', 'air', 'right', 'thousand', 'email', 'word', 'thousand', 'email']",
  'NNP'),
 ("['exactli']", 'NNP'),
 ("['see', 'comment', 'httpwwwredditcomrpoliticscomments5bxqrpdistrictvotersoverwhelminglyapprovereferendumd9sk707', 'thorough', 'argument', 'care', 'simpli', 'put', 'character', 'issu', 'wrong', 'dc', 'resid', 'arent', 'demand', 'vote', 'power', 'demand', 'recognit', 'right', 'note', 'power', 'pay', 'feder', 'tax', 'like', 'everi', 'american', 'dont', 'get', 'vote', 'power', 'congress', 'represent', 'senat', 'didnt', 'get', 'vote', 'presid', 'sixti', 'arent', 'even', 'grant', 'right', 'govern

In [5]:
# Not sure why that didn't tag each word in the row, so I'm going to try this tip from a classmate.
# Create function for the tagger
def tokenizer(arr):
    tokens = [pos_tag(word_tokenize(str(i))) for i in arr.split()]
    return tokens

In [32]:
# Create a list
tagged_text = []

In [33]:
# Use a for loop to tag each word by row
for row in test_df.iterrows():
    tags = test_df['txt_tokenized'].apply(tokenizer)
    tagged_text.append(tags)

In [34]:
tagged_text

[56775     [[([, NN), ('hit, ''), (', ''), (,, ,)], [('na...
 713370    [[([, JJ), ('cnn, NNP), (', POS), (,, ,)], [('...
 671952      [[([, JJ), ('exactli, NNP), (', POS), (], NN)]]
 294102    [[([, JJ), ('see, NNP), (', POS), (,, ,)], [('...
 947269    [[([, JJ), ('everi, NNP), (', POS), (,, ,)], [...
 Name: txt_tokenized, dtype: object,
 56775     [[([, NN), ('hit, ''), (', ''), (,, ,)], [('na...
 713370    [[([, JJ), ('cnn, NNP), (', POS), (,, ,)], [('...
 671952      [[([, JJ), ('exactli, NNP), (', POS), (], NN)]]
 294102    [[([, JJ), ('see, NNP), (', POS), (,, ,)], [('...
 947269    [[([, JJ), ('everi, NNP), (', POS), (,, ,)], [...
 Name: txt_tokenized, dtype: object,
 56775     [[([, NN), ('hit, ''), (', ''), (,, ,)], [('na...
 713370    [[([, JJ), ('cnn, NNP), (', POS), (,, ,)], [('...
 671952      [[([, JJ), ('exactli, NNP), (', POS), (], NN)]]
 294102    [[([, JJ), ('see, NNP), (', POS), (,, ,)], [('...
 947269    [[([, JJ), ('everi, NNP), (', POS), (,, ,)], [...
 Name: txt_

In [12]:
# I don't know why that assigns so many empty spaces and why it repeats five times. 
# I tried a different method, but that ended up breaking it out by letter.
x = test_df['txt_tokenized'].apply(lambda x: [pos_tag(word) for word in x])
x

56775     [[([, NN)], [(', '')], [(h, NN)], [(i, NN)], [...
713370    [[([, NN)], [(', '')], [(c, NNS)], [(n, NN)], ...
671952    [[([, NN)], [(', '')], [(e, NN)], [(x, NN)], [...
294102    [[([, NN)], [(', '')], [(s, NN)], [(e, NN)], [...
947269    [[([, NN)], [(', '')], [(e, NN)], [(v, NN)], [...
Name: txt_tokenized, dtype: object

In [None]:
# Here's another way i tried that also broke it down to letters
test_df['POS'] = [pos_tag(sent) for sent in test_df['txt_tokenized']]

In [43]:
# view the new column
test_df

Unnamed: 0.1,Unnamed: 0,con,txt,txt_tokenized,POS
549694,549694,0,this would absolutely not shock me,"['would', 'absolut', 'shock']","[([, NN), (', ''), (w, JJ), (o, IN), (u, JJ), ..."
165376,165376,0,because i got in an argument with you months a...,"['got', 'argument', 'month', 'ago', 'pretend',...","[([, NN), (', ''), (g, JJ), (o, IN), (t, NN), ..."
338095,338095,0,politics by definition is the shift of power,"['polit', 'definit', 'shift', 'power']","[([, NN), (', ''), (p, JJ), (o, IN), (l, NN), ..."
51031,51031,0,didnt the left basically tell joe the plumber ...,"['didnt', 'left', 'basic', 'tell', 'joe', 'plu...","[([, NN), (', ''), (d, NN), (i, NN), (d, VBP),..."
373650,373650,0,im not calling him a white supremacist merely ...,"['im', 'call', 'white', 'supremacist', 'mere',...","[([, NN), (', ''), (i, JJ), (m, NN), (', ''), ..."


In [57]:
# After 8 hours and 8,000 failed attempts at this, i read a post on stack overflow that said to use pos_tag_sents
# and I liked how they solved their problem, so I copied their code and altered it to fit my assignment
texts = test_df['txt_tokenized'].tolist()
tagged_texts = pos_tag_sents(map(word_tokenize, texts))
tagged_texts

[[('[', 'RB'),
  ("'like", 'MD'),
  ("'", "''"),
  (',', ','),
  ("'bush", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'obama", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'war", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'wage", 'NN'),
  ("'", "''"),
  (',', ','),
  ("'account", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'budget", 'NNP'),
  ("'", 'POS'),
  (']', 'NN')],
 [('[', 'RB'),
  ("'yike", 'MD'),
  ("'", "''"),
  (',', ','),
  ("'that", 'WP'),
  ("'", "''"),
  (',', ','),
  ("'question", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'im", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'afraid", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'your", "''"),
  ("'", "''"),
  (',', ','),
  ("'beyond", "''"),
  ("'", "''"),
  (',', ','),
  ("'help", "''"),
  ("'", 'POS'),
  (']', 'NN')],
 [('[', 'NN'),
  ("'word", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'mean", "''"),
  ("'", "''"),
  (',', ','),
  ("'staff", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  ("'member", 'NNP'),
  ("'", 'POS'),
  (',', ','),
  

In [58]:
# That of course tagged all the commas and apostrophes, so I'm just going to abort using the preprocessed text and 
# try it out with the original text.
texts = test_df['txt'].tolist()
tagged_texts = pos_tag_sents(map(word_tokenize, texts))
tagged_texts

[[('like', 'IN'),
  ('bush', 'NN'),
  ('did', 'VBD'),
  ('to', 'TO'),
  ('obama', 'VB'),
  ('with', 'IN'),
  ('wars', 'NNS'),
  ('being', 'VBG'),
  ('waged', 'VBD'),
  ('that', 'WDT'),
  ('were', 'VBD'),
  ('not', 'RB'),
  ('being', 'VBG'),
  ('accounted', 'VBN'),
  ('for', 'IN'),
  ('in', 'IN'),
  ('the', 'DT'),
  ('budgets', 'NNS')],
 [('yikes', 'NNS'),
  ('if', 'IN'),
  ('thats', 'NNS'),
  ('your', 'PRP$'),
  ('question', 'NN'),
  ('then', 'RB'),
  ('im', 'VBZ'),
  ('afraid', 'JJ'),
  ('youre', 'NN'),
  ('beyond', 'IN'),
  ('help', 'NN')],
 [('they', 'PRP'),
  ('do', 'VBP'),
  ('and', 'CC'),
  ('these', 'DT'),
  ('words', 'NNS'),
  ('mean', 'VBP'),
  ('a', 'DT'),
  ('staff', 'NN'),
  ('member', 'NN'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('campaign', 'NN'),
  ('was', 'VBD'),
  ('found', 'VBN'),
  ('guilty', 'JJ'),
  ('of', 'IN'),
  ('fraud', 'NN'),
  ('just', 'RB'),
  ('like', 'IN'),
  ('it', 'PRP'),
  ('says', 'VBZ')],
 [('clearly', 'RB'),
  ('you', 'PRP'),
  ('are', 'VBP'),
  ('not'

In [59]:
# Finally it worked (i know it's not on the preprocessed data, but by golly it finally worked, so I'm going to load 
# that info back into the dataframe. There must have been something with preprocessing, saving to a csv, then loading
# the data that disrupted the flow. I did notice that when I loaded the preprocessed data from the csv it had apostrophes
# that were not there before, but I couldn't keep running the preprocessing steps and using the data from that because it
# was eating up my time, especially since I kept making my kernel die and having to restart a lot.
test_df['POS'] = tagged_texts
test_df

Unnamed: 0.1,Unnamed: 0,con,txt,txt_tokenized,POS
17195,17195,0,like bush did to obama with wars being waged t...,"['like', 'bush', 'obama', 'war', 'wage', 'acco...","[(like, IN), (bush, NN), (did, VBD), (to, TO),..."
216649,216649,0,yikes if thats your question then im afraid yo...,"['yike', 'that', 'question', 'im', 'afraid', '...","[(yikes, NNS), (if, IN), (thats, NNS), (your, ..."
2812,2812,0,they do and these words mean a staff member of...,"['word', 'mean', 'staff', 'member', 'campaign'...","[(they, PRP), (do, VBP), (and, CC), (these, DT..."
345482,345482,0,clearly you are not an understander of the sys...,"['clearli', 'understand', 'system', 'that', 'e...","[(clearly, RB), (you, PRP), (are, VBP), (not, ..."
947835,947835,0,good points but lets dispel with this fiction ...,"['good', 'point', 'let', 'dispel', 'fiction', ...","[(good, JJ), (points, NNS), (but, CC), (lets, ..."


***Step 2C: Term frequency-inverse document frequency (tfidf) vector***

In [8]:
# Load necessary libraries
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Load up preprocessed data
test = pd.read_csv('wk2_clean.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,con,txt,txt_tokenized
0,0,0,well its great that he did something about tho...,"['well', 'great', 'someth', 'belief', 'offic',..."
1,1,0,you are right mr president,"['right', 'mr', 'presid']"
2,2,0,you have given no input apart from saying i am...,"['given', 'input', 'apart', 'say', 'wrong', 'a..."
3,3,0,i get the frustration but the reason they want...,"['get', 'frustrat', 'reason', 'want', 'way', '..."
4,4,0,i am far from an expert on tpp and i would ten...,"['far', 'expert', 'tpp', 'would', 'tend', 'agr..."


In [20]:
# Take a sample of rows to work with
tf = test.sample(frac = 0.2)

In [21]:
# Using the small sample that, I will run the Tfid Vectorizer to create a tf-idf feature matrix.
v = TfidfVectorizer()
x = v.fit_transform(tf['txt_tokenized'])

In [22]:
# Show the tf-idf feature matrix
x

<190000x74557 sparse matrix of type '<class 'numpy.float64'>'
	with 3086740 stored elements in Compressed Sparse Row format>

In [14]:
# Using the smaller sample of 5, I am able to show the tf-idf feature matrix as a dense matrix
x.toarray()

array([[0.37796447, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.75592895, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.37796447, 0.        ],
       [0.        , 0.47140452, 0.23570226, 0.23570226, 0.        ,
        0.23570226, 0.23570226, 0.        , 0.        , 0.        ,
        0.        , 0.23570226, 0.23570226, 0.        , 0.23570226,
        0.        , 0.        , 0.        , 0.23570226, 0.23570226,
        0.23570226, 0.23570226, 0.23570226, 0.23570226, 0.        ,
        0.        , 0.        , 0.23570226, 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.33333333,
        0.        , 0.    

***Follow Up Question***

For the three techniques in problem (2) above, give an example where each would be useful.

Word Count Vector: Gives us a numeric representation of text data that can then be used to categorize data or conduct clustering analysis on data. 

Part of Speech Tag Vector: Are useful in building parse trees and extracting relationships between words.

Tf-idf Vector: provides a way to associate each words importance in a document and is used in information retrieval or summarization.
