In [1]:
import codecs, nltk
import numpy as np
import pandas as pd

# Load dataset

In [86]:
# Path of the metadata CSV to process; use the commented path below to run
# the notebook on the fake-news corpus instead.
meta_csv_path = "Hyperpartisan meta/hyper_news_meta.csv"
#meta_csv_path = "Fake news meta/Fake_news_meta.csv"
df = pd.read_csv(meta_csv_path)
df.head()

Unnamed: 0,doc_id,title,mainText,hyperpartisan
0,0,Jerry Springer Just Summed Up Trump’s Debate W...,Millions of people tuned in Monday night to wa...,True
1,1,Clinton Campaign Charges College Students $500...,The Clintons understand the average American. ...,True
2,2,Migrant Reveals Sick Reason He Left…’to F*ck t...,Harassment is known in Arabic as ‘taharrush’. ...,True
3,3,Trump Said Obama Gave Iran “$150 Billion.” He ...,Democratic President Barack Obama pulled off a...,True
4,4,Hillary Accused Trump Of Calling Climate Chang...,Democratic nominee Hillary Clinton knows her f...,True


In [87]:
# Combined "title + body" field, so headline and article text can be
# preprocessed as a single document.
df['title_mainText'] = df['title'].add(' ').add(df['mainText'])

In [88]:
# Missing values per column
df.isna().sum()

doc_id            0
title             0
mainText          0
hyperpartisan     0
title_mainText    0
dtype: int64

In [76]:
# Number of rows currently in the frame
df.shape[0]

2268

# Drop Missing data 

In [21]:
# Keep only rows that have both a body and a title, then renumber from 0
df_no_missing = (
    df
    .dropna(subset=['mainText', 'title'])
    .reset_index(drop=True)
)

In [32]:
# Rows left after dropping missing title/body
df_no_missing.shape[0]

1590

In [33]:
# Remaining missing values per column
df_no_missing.isna().sum()

doc_id           0
author         466
title            0
mainText         0
portal           0
orientation      0
veracity         0
dtype: int64

# Label preprocessing

In [34]:
# Collapse the fine-grained veracity labels into a binary true/false target;
# labels not listed here are left untouched.
veracity_to_binary = {
    'mixture of true and false': 'false',
    'mostly false': 'false',
    'mostly true': 'true',
}
df_no_missing['veracity'] = df_no_missing['veracity'].replace(veracity_to_binary)

In [35]:
# Relabelling must not change the number of rows
df_no_missing.shape[0]

1590

In [36]:
# Discard articles labelled as having no factual content, then renumber rows.
# Note that this rebinds `df` to the filtered frame.
has_factual_content = df_no_missing['veracity'] != 'no factual content'
df = df_no_missing.loc[has_factual_content].reset_index(drop=True)

In [37]:
# Rows left after removing "no factual content" articles
df.shape[0]

1527

In [38]:
# Class balance of the binary veracity label
df.veracity.value_counts()

true     1243
false     284
Name: veracity, dtype: int64

In [39]:
# Combined "title + body" field on the filtered frame, so headline and
# article text can be preprocessed as a single document.
df['title_mainText'] = df['title'].add(' ').add(df['mainText'])

# Text Preprocessing Pipeline

## General Preprocessing

In [89]:
# Shared resources for the preprocessing functions: punctuation tokens,
# English stop words and a WordNet lemmatizer.
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Every ASCII punctuation character plus the '' and `` quote tokens
punctuation = [*string.punctuation, "''", "``"]
stop_word_list = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()

def nlp_pipeline_Alltext(text):
    """Normalize raw text into a space-separated string of lemmas.

    Steps, in order: tokenize with ``nltk.word_tokenize``; drop tokens found
    in the notebook-level ``punctuation`` collection; POS-tag the remaining
    tokens; lowercase and lemmatize each token (as a verb when its tag starts
    with ``"V"``, otherwise with the lemmatizer's default part of speech);
    keep only purely alphabetic lemmas; drop lemmas found in the
    notebook-level ``stop_word_list``.

    Parameters
    ----------
    text : str
        Raw text to preprocess.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ("" if none survive).
    """
    # Membership is tested once per token, so hash the notebook-level
    # collections once per call instead of scanning a list for every token.
    punctuation_set = set(punctuation)
    stop_word_set = set(stop_word_list)

    # Tokenize, dropping punctuation tokens before tagging
    tokens = [tok for tok in nltk.word_tokenize(text) if tok not in punctuation_set]

    # The tagger sees the original casing; lowercasing happens only when
    # the token is handed to the lemmatizer.
    tagged_tokens = nltk.pos_tag(tokens)

    lemmas = []
    for token, tag in tagged_tokens:
        lowered = token.lower()
        if tag[0] == "V":
            lemmas.append(wordnet_lemmatizer.lemmatize(lowered, "v"))
        else:
            lemmas.append(wordnet_lemmatizer.lemmatize(lowered))

    # Remove numbers / leftover punctuation, then stop words. Filtering after
    # lemmatization means inflected forms that reduce to a stop word also go.
    kept = [
        lemma for lemma in lemmas
        if lemma.isalpha() and lemma not in stop_word_set
    ]

    return " ".join(kept)


## Extract only Noun/Verb 

In [93]:
# Re-creates the same punctuation list, stop-word list and lemmatizer as the
# earlier preprocessing setup cell.
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
stop_word_list = stopwords.words('english')
# Every ASCII punctuation character plus the '' and `` quote tokens
punctuation = list(string.punctuation) + ["''", "``"]


def nlp_pipeline(text, pos="noun"):
    """Extract the lemmatized nouns (default) or verbs of a text.

    The text is tokenized and POS-tagged with NLTK; only tokens whose tag
    belongs to the requested group are kept. Punctuation tokens,
    non-alphabetic tokens and stop words are then removed, and the remaining
    lowercased tokens are lemmatized with the matching WordNet part of speech.

    The word class used to be chosen by commenting lines in and out, which
    made the result depend on the edit state of the notebook. It is now an
    explicit argument, e.g. ``df['mainText'].apply(nlp_pipeline, pos="verb")``.

    Parameters
    ----------
    text : str
        Raw text to process.
    pos : {"noun", "verb"}, default "noun"
        Word class to extract. "noun" keeps NN/NNP/NNS/NNPS tags, "verb" keeps
        VB/VBD/VBG/VBN/VBP/VBZ tags.

    Returns
    -------
    list of str
        Lowercased lemmas in order of appearance (duplicates preserved).

    Raises
    ------
    ValueError
        If ``pos`` is neither "noun" nor "verb".
    """
    # tag group to keep -> WordNet part of speech used for lemmatization
    pos_groups = {
        "noun": (("NN", "NNP", "NNS", "NNPS"), "n"),
        "verb": (("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"), "v"),
    }
    if pos not in pos_groups:
        raise ValueError("pos must be 'noun' or 'verb', got %r" % (pos,))
    wanted_tags, lemma_pos = pos_groups[pos]

    # Hash the notebook-level collections once per call for fast membership
    punctuation_set = set(punctuation)
    stop_word_set = set(stop_word_list)

    # Tokenize and tag on the original casing (the tagger relies on it)
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    # Keep only the requested word class
    tokens = [token for token, tag in tagged_tokens if tag in wanted_tags]

    # Drop punctuation tokens and anything that is not purely alphabetic
    tokens = [
        token for token in tokens
        if token not in punctuation_set and token.isalpha()
    ]

    # Lowercase BEFORE the stop-word check: the stop-word list is lowercase,
    # so capitalised stop words (e.g. a sentence-initial "Do" or "The")
    # previously slipped through the filter.
    lowered = [token.lower() for token in tokens]
    lowered = [token for token in lowered if token not in stop_word_set]

    return [wordnet_lemmatizer.lemmatize(token, lemma_pos) for token in lowered]

In [91]:
# Run the full-text pipeline on the title, the body and their concatenation;
# each result goes into a "<column>_preprocessed" column.
for source_col in ('title', 'mainText', 'title_mainText'):
    df[source_col + '_preprocessed'] = df[source_col].apply(nlp_pipeline_Alltext)

In [92]:
# Intended to hold the verbs of each article body.
# NOTE(review): this is the very same call as the mainText_nouns cell, so the
# two columns differ only if nlp_pipeline is switched to its verb variant
# before this cell runs; on a straight top-to-bottom run both columns get the
# same content. Confirm which variant is active before trusting this column.
df['mainText_verbs'] = df['mainText'].apply(nlp_pipeline)

In [94]:
# List of lemmatized nouns per article body (nlp_pipeline, as written, keeps
# the noun-tagged tokens).
df['mainText_nouns'] = df['mainText'].apply(nlp_pipeline)

In [95]:
# Preview the frame with the newly added preprocessed columns
df.head(5)

Unnamed: 0,doc_id,title,mainText,hyperpartisan,title_mainText,title_preprocessed,mainText_preprocessed,title_mainText_preprocessed,mainText_verbs,mainText_nouns
0,0,Jerry Springer Just Summed Up Trump’s Debate W...,Millions of people tuned in Monday night to wa...,True,Jerry Springer Just Summed Up Trump’s Debate W...,jerry springer summed trump debate perfect tweet,million people tune monday night watch coverag...,jerry springer summed trump debate perfect twe...,"[tune, watch, go, come, report, broadcast, fol...","[million, people, monday, night, coverage, deb..."
1,1,Clinton Campaign Charges College Students $500...,The Clintons understand the average American. ...,True,Clinton Campaign Charges College Students $500...,clinton campaign charge college students atten...,clinton understand average american know like ...,clinton campaign charge college students atten...,"[understand, know, make, suffer, lie, get, pay...","[clinton, broke, hillary, clinton, woman, peop..."
2,2,Migrant Reveals Sick Reason He Left…’to F*ck t...,Harassment is known in Arabic as ‘taharrush’. ...,True,Migrant Reveals Sick Reason He Left…’to F*ck t...,migrant reveals sick reason woman,harassment know arabic taharrush mass sexual a...,migrant reveals sick reason woman harassment k...,"[know, document, begin, call, hide, feel, leav...","[harassment, arabic, taharrush, mass, assault,..."
3,3,Trump Said Obama Gave Iran “$150 Billion.” He ...,Democratic President Barack Obama pulled off a...,True,Trump Said Obama Gave Iran “$150 Billion.” He ...,trump said obama gave iran give nothing,democratic president barack obama pull histori...,trump said obama gave iran give nothing democr...,"[pull, engineer, freeze, lift, strangle, engin...","[president, barack, obama, coup, accord, iran,..."
4,4,Hillary Accused Trump Of Calling Climate Chang...,Democratic nominee Hillary Clinton knows her f...,True,Hillary Accused Trump Of Calling Climate Chang...,hillary accuse trump calling climate change ch...,democratic nominee hillary clinton know fact t...,hillary accuse trump calling climate change ch...,"[know, bust, call, rely, slam, rat, call, repe...","[nominee, hillary, clinton, fact, debate, trum..."


In [96]:
# Number of preprocessed rows
df.shape[0]

2234

In [52]:
# Keep only the listed columns (every column not named here is dropped).
# NOTE(review): the list names 'orientation' and 'veracity', while the frame
# loaded from hyper_news_meta.csv at the top of the notebook shows a
# 'hyperpartisan' label column instead. Presumably this cell belongs to the
# fake-news variant of the run -- confirm before executing it on the
# hyperpartisan data, where selecting the missing columns would raise.
df = df[['doc_id','title', 'mainText', 'orientation', 'veracity', 'title_mainText', 'title_preprocessed', 
        'mainText_preprocessed','title_mainText_preprocessed','mainText_nouns','mainText_verbs']]

In [98]:
# Persist the preprocessed frame; the row index is not written.
preprocessed_csv_path = "hyper_preprocessed.csv"
df.to_csv(preprocessed_csv_path, index=False)

# Back up

In [3]:
import xml.etree.ElementTree as ET

# Pull the body of one BuzzFeed-Webis XML file to use as sample text
tree = ET.parse('BuzzFeed-Webis/0002.xml')
root = tree.getroot()
main_text_element = root.find("mainText")
article = main_text_element.text
print(article)

One day after explosive devices were discovered in the Manhattan neighborhood of Chelsea and in Seaside Park and Elizabeth in New Jersey, Republican nominee Donald Trump repeated his calls to implement police profiling to stop more attacks in the United States. "Our local police, they know who a lot of these people are. They are afraid to do anything about it because they don't want to be accused of profiling and they don't want to be accused of all sorts of things," Trump said on "Fox and Friends" when asked what policies he would implement as president to "get tough" on terrorism. He argued that the country had no other choice but to follow the lead of Israel. "Israel has done an unbelievable job, and they will profile. They profile. They see somebody that's suspicious," he said, "they will profile. They will take that person in and check out. Do we have a choice? Look what's going on. Do we really have a choice? We're trying to be so politically correct in our country, and this is o

In [4]:
# Break the article into sentences ...
sentences = nltk.sent_tokenize(article)

# ... and show the second one as an example
example_index = 1
sentence = sentences[example_index]
print(sentence)

"Our local police, they know who a lot of these people are.


In [5]:
# Word-tokenize the whole article (despite the variable name, this is not
# limited to a single sentence)
tokenized_sentence = nltk.word_tokenize(article)
print(tokenized_sentence)

['One', 'day', 'after', 'explosive', 'devices', 'were', 'discovered', 'in', 'the', 'Manhattan', 'neighborhood', 'of', 'Chelsea', 'and', 'in', 'Seaside', 'Park', 'and', 'Elizabeth', 'in', 'New', 'Jersey', ',', 'Republican', 'nominee', 'Donald', 'Trump', 'repeated', 'his', 'calls', 'to', 'implement', 'police', 'profiling', 'to', 'stop', 'more', 'attacks', 'in', 'the', 'United', 'States', '.', '``', 'Our', 'local', 'police', ',', 'they', 'know', 'who', 'a', 'lot', 'of', 'these', 'people', 'are', '.', 'They', 'are', 'afraid', 'to', 'do', 'anything', 'about', 'it', 'because', 'they', 'do', "n't", 'want', 'to', 'be', 'accused', 'of', 'profiling', 'and', 'they', 'do', "n't", 'want', 'to', 'be', 'accused', 'of', 'all', 'sorts', 'of', 'things', ',', "''", 'Trump', 'said', 'on', '``', 'Fox', 'and', 'Friends', "''", 'when', 'asked', 'what', 'policies', 'he', 'would', 'implement', 'as', 'president', 'to', '``', 'get', 'tough', "''", 'on', 'terrorism', '.', 'He', 'argued', 'that', 'the', 'country',

In [6]:
# One-time setup: uncomment to download the tagger model if it is missing.
#nltk.download('averaged_perceptron_tagger')
# POS-tag the article tokens; the result is a list of (word, tag) tuples.
pos_sentence = nltk.pos_tag(tokenized_sentence)

print (pos_sentence)

[('One', 'CD'), ('day', 'NN'), ('after', 'IN'), ('explosive', 'JJ'), ('devices', 'NNS'), ('were', 'VBD'), ('discovered', 'VBN'), ('in', 'IN'), ('the', 'DT'), ('Manhattan', 'NNP'), ('neighborhood', 'NN'), ('of', 'IN'), ('Chelsea', 'NNP'), ('and', 'CC'), ('in', 'IN'), ('Seaside', 'NNP'), ('Park', 'NNP'), ('and', 'CC'), ('Elizabeth', 'NNP'), ('in', 'IN'), ('New', 'NNP'), ('Jersey', 'NNP'), (',', ','), ('Republican', 'NNP'), ('nominee', 'NN'), ('Donald', 'NNP'), ('Trump', 'NNP'), ('repeated', 'VBD'), ('his', 'PRP$'), ('calls', 'NNS'), ('to', 'TO'), ('implement', 'VB'), ('police', 'NN'), ('profiling', 'VBG'), ('to', 'TO'), ('stop', 'VB'), ('more', 'JJR'), ('attacks', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('.', '.'), ('``', '``'), ('Our', 'PRP$'), ('local', 'JJ'), ('police', 'NN'), (',', ','), ('they', 'PRP'), ('know', 'VBP'), ('who', 'WP'), ('a', 'DT'), ('lot', 'NN'), ('of', 'IN'), ('these', 'DT'), ('people', 'NNS'), ('are', 'VBP'), ('.', '.'), ('They',

In [7]:
from nltk.stem.wordnet import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

# Lowercase and lemmatize every tagged token: verb-tagged tokens (tag starts
# with "V") are lemmatized as verbs, the rest with the default part of speech.
text = [
    wordnet_lemmatizer.lemmatize(word.lower(), "v") if tag[0] == "V"
    else wordnet_lemmatizer.lemmatize(word.lower())
    for word, tag in pos_sentence
]

In [10]:
import string

# Tokens treated as punctuation: every ASCII punctuation character plus the
# '' and `` quote tokens
punctuation = list(string.punctuation) + ["''", "``"]
print(punctuation)

# Drop punctuation tokens from the lemmatized text
without_punct_sentence = list(filter(lambda tok: tok not in punctuation, text))
print(without_punct_sentence)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', "''", '``']
['one', 'day', 'after', 'explosive', 'device', 'be', 'discover', 'in', 'the', 'manhattan', 'neighborhood', 'of', 'chelsea', 'and', 'in', 'seaside', 'park', 'and', 'elizabeth', 'in', 'new', 'jersey', 'republican', 'nominee', 'donald', 'trump', 'repeat', 'his', 'call', 'to', 'implement', 'police', 'profile', 'to', 'stop', 'more', 'attack', 'in', 'the', 'united', 'state', 'our', 'local', 'police', 'they', 'know', 'who', 'a', 'lot', 'of', 'these', 'people', 'be', 'they', 'be', 'afraid', 'to', 'do', 'anything', 'about', 'it', 'because', 'they', 'do', "n't", 'want', 'to', 'be', 'accuse', 'of', 'profile', 'and', 'they', 'do', "n't", 'want', 'to', 'be', 'accuse', 'of', 'all', 'sort', 'of', 'thing', 'trump', 'say', 'on', 'fox', 'and', 'friend', 'when', 'ask', 'what', 'policy', 'he', 'would', 'implement', 'a', 'president', '

In [11]:
# Keep purely alphabetic tokens only (drops numbers and fragments like "n't")
text = list(filter(str.isalpha, without_punct_sentence))
print(text)

['one', 'day', 'after', 'explosive', 'device', 'be', 'discover', 'in', 'the', 'manhattan', 'neighborhood', 'of', 'chelsea', 'and', 'in', 'seaside', 'park', 'and', 'elizabeth', 'in', 'new', 'jersey', 'republican', 'nominee', 'donald', 'trump', 'repeat', 'his', 'call', 'to', 'implement', 'police', 'profile', 'to', 'stop', 'more', 'attack', 'in', 'the', 'united', 'state', 'our', 'local', 'police', 'they', 'know', 'who', 'a', 'lot', 'of', 'these', 'people', 'be', 'they', 'be', 'afraid', 'to', 'do', 'anything', 'about', 'it', 'because', 'they', 'do', 'want', 'to', 'be', 'accuse', 'of', 'profile', 'and', 'they', 'do', 'want', 'to', 'be', 'accuse', 'of', 'all', 'sort', 'of', 'thing', 'trump', 'say', 'on', 'fox', 'and', 'friend', 'when', 'ask', 'what', 'policy', 'he', 'would', 'implement', 'a', 'president', 'to', 'get', 'tough', 'on', 'terrorism', 'he', 'argue', 'that', 'the', 'country', 'have', 'no', 'other', 'choice', 'but', 'to', 'follow', 'the', 'lead', 'of', 'israel', 'israel', 'have', 'd

In [15]:
from nltk.corpus import stopwords

stop_word_list = stopwords.words('english')

# Drop English stop words from the lemmatized, alphabetic tokens
without_stopwords_sentence = list(
    filter(lambda tok: tok not in stop_word_list, text)
)
print(without_stopwords_sentence)

['one', 'day', 'explosive', 'device', 'discover', 'manhattan', 'neighborhood', 'chelsea', 'seaside', 'park', 'elizabeth', 'new', 'jersey', 'republican', 'nominee', 'donald', 'trump', 'repeat', 'call', 'implement', 'police', 'profile', 'stop', 'attack', 'united', 'state', 'local', 'police', 'know', 'lot', 'people', 'afraid', 'anything', 'want', 'accuse', 'profile', 'want', 'accuse', 'sort', 'thing', 'trump', 'say', 'fox', 'friend', 'ask', 'policy', 'would', 'implement', 'president', 'get', 'tough', 'terrorism', 'argue', 'country', 'choice', 'follow', 'lead', 'israel', 'israel', 'unbelievable', 'job', 'profile', 'profile', 'see', 'somebody', 'suspicious', 'say', 'profile', 'take', 'person', 'check', 'choice', 'look', 'go', 'really', 'choice', 'try', 'politically', 'correct', 'country', 'go', 'get', 'worse', 'trump', 'previously', 'make', 'similar', 'comment', 'orlando', 'nightclub', 'shoot', 'june', 'say', 'interview', 'face', 'nation', 'something', 'need', 'seriously', 'consider']


In [24]:
# Lowercase every token of the tokenized article
lowercased_sentence = list(map(str.lower, tokenized_sentence))
print(lowercased_sentence)

['one', 'day', 'after', 'explosive', 'devices', 'were', 'discovered', 'in', 'the', 'manhattan', 'neighborhood', 'of', 'chelsea', 'and', 'in', 'seaside', 'park', 'and', 'elizabeth', 'in', 'new', 'jersey', ',', 'republican', 'nominee', 'donald', 'trump', 'repeated', 'his', 'calls', 'to', 'implement', 'police', 'profiling', 'to', 'stop', 'more', 'attacks', 'in', 'the', 'united', 'states', '.', '``', 'our', 'local', 'police', ',', 'they', 'know', 'who', 'a', 'lot', 'of', 'these', 'people', 'are', '.', 'they', 'are', 'afraid', 'to', 'do', 'anything', 'about', 'it', 'because', 'they', 'do', "n't", 'want', 'to', 'be', 'accused', 'of', 'profiling', 'and', 'they', 'do', "n't", 'want', 'to', 'be', 'accused', 'of', 'all', 'sorts', 'of', 'things', ',', "''", 'trump', 'said', 'on', '``', 'fox', 'and', 'friends', "''", 'when', 'asked', 'what', 'policies', 'he', 'would', 'implement', 'as', 'president', 'to', '``', 'get', 'tough', "''", 'on', 'terrorism', '.', 'he', 'argued', 'that', 'the', 'country',

In [9]:
import string

# Tokens treated as punctuation: every ASCII punctuation character plus the
# '' and `` quote tokens
punctuation = list(string.punctuation) + ["''", "``"]
print(punctuation)

# Drop punctuation tokens. `lowercased_sentence` comes from the lowercasing
# cell, which must have been run first (otherwise this raises NameError).
without_punct_sentence = list(
    filter(lambda tok: tok not in punctuation, lowercased_sentence)
)
print(without_punct_sentence)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', "''", '``']


NameError: name 'lowercased_sentence' is not defined

In [26]:
from nltk.corpus import stopwords

stop_word_list = stopwords.words('english')

# Drop English stop words from the punctuation-free tokens
without_stopwords_sentence = list(
    filter(lambda tok: tok not in stop_word_list, without_punct_sentence)
)
print(without_stopwords_sentence)

['one', 'day', 'explosive', 'devices', 'discovered', 'manhattan', 'neighborhood', 'chelsea', 'seaside', 'park', 'elizabeth', 'new', 'jersey', 'republican', 'nominee', 'donald', 'trump', 'repeated', 'calls', 'implement', 'police', 'profiling', 'stop', 'attacks', 'united', 'states', 'local', 'police', 'know', 'lot', 'people', 'afraid', 'anything', "n't", 'want', 'accused', 'profiling', "n't", 'want', 'accused', 'sorts', 'things', 'trump', 'said', 'fox', 'friends', 'asked', 'policies', 'would', 'implement', 'president', 'get', 'tough', 'terrorism', 'argued', 'country', 'choice', 'follow', 'lead', 'israel', 'israel', 'done', 'unbelievable', 'job', 'profile', 'profile', 'see', 'somebody', "'s", 'suspicious', 'said', 'profile', 'take', 'person', 'check', 'choice', 'look', "'s", 'going', 'really', 'choice', "'re", 'trying', 'politically', 'correct', 'country', 'going', 'get', 'worse', 'trump', 'previously', 'made', 'similar', 'comments', 'orlando', 'nightclub', 'shooting', 'june', 'said', 'in

In [27]:
# Keep purely alphabetic tokens only (numbers and fragments such as "n't"
# or "'s" are dropped)
only_words_sentence = list(filter(str.isalpha, without_stopwords_sentence))
print(only_words_sentence)

['one', 'day', 'explosive', 'devices', 'discovered', 'manhattan', 'neighborhood', 'chelsea', 'seaside', 'park', 'elizabeth', 'new', 'jersey', 'republican', 'nominee', 'donald', 'trump', 'repeated', 'calls', 'implement', 'police', 'profiling', 'stop', 'attacks', 'united', 'states', 'local', 'police', 'know', 'lot', 'people', 'afraid', 'anything', 'want', 'accused', 'profiling', 'want', 'accused', 'sorts', 'things', 'trump', 'said', 'fox', 'friends', 'asked', 'policies', 'would', 'implement', 'president', 'get', 'tough', 'terrorism', 'argued', 'country', 'choice', 'follow', 'lead', 'israel', 'israel', 'done', 'unbelievable', 'job', 'profile', 'profile', 'see', 'somebody', 'suspicious', 'said', 'profile', 'take', 'person', 'check', 'choice', 'look', 'going', 'really', 'choice', 'trying', 'politically', 'correct', 'country', 'going', 'get', 'worse', 'trump', 'previously', 'made', 'similar', 'comments', 'orlando', 'nightclub', 'shooting', 'june', 'said', 'interview', 'face', 'nation', 'some

In [28]:
# Stemming with the English Snowball stemmer
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer("english")

# Reduce every remaining word to its stem
stem_sentence = list(map(snowball_stemmer.stem, only_words_sentence))
print(stem_sentence)

['one', 'day', 'explos', 'devic', 'discov', 'manhattan', 'neighborhood', 'chelsea', 'seasid', 'park', 'elizabeth', 'new', 'jersey', 'republican', 'nomine', 'donald', 'trump', 'repeat', 'call', 'implement', 'polic', 'profil', 'stop', 'attack', 'unit', 'state', 'local', 'polic', 'know', 'lot', 'peopl', 'afraid', 'anyth', 'want', 'accus', 'profil', 'want', 'accus', 'sort', 'thing', 'trump', 'said', 'fox', 'friend', 'ask', 'polici', 'would', 'implement', 'presid', 'get', 'tough', 'terror', 'argu', 'countri', 'choic', 'follow', 'lead', 'israel', 'israel', 'done', 'unbeliev', 'job', 'profil', 'profil', 'see', 'somebodi', 'suspici', 'said', 'profil', 'take', 'person', 'check', 'choic', 'look', 'go', 'realli', 'choic', 'tri', 'polit', 'correct', 'countri', 'go', 'get', 'wors', 'trump', 'previous', 'made', 'similar', 'comment', 'orlando', 'nightclub', 'shoot', 'june', 'said', 'interview', 'face', 'nation', 'someth', 'need', 'serious', 'consid']


In [29]:
# Lemmatization with WordNet (default part of speech for every word)
from nltk.stem.wordnet import WordNetLemmatizer
#nltk.download('wordnet')

wordnet_lemmatizer = WordNetLemmatizer()
lemma_sent = list(map(wordnet_lemmatizer.lemmatize, only_words_sentence))
print(lemma_sent)

['one', 'day', 'explosive', 'device', 'discovered', 'manhattan', 'neighborhood', 'chelsea', 'seaside', 'park', 'elizabeth', 'new', 'jersey', 'republican', 'nominee', 'donald', 'trump', 'repeated', 'call', 'implement', 'police', 'profiling', 'stop', 'attack', 'united', 'state', 'local', 'police', 'know', 'lot', 'people', 'afraid', 'anything', 'want', 'accused', 'profiling', 'want', 'accused', 'sort', 'thing', 'trump', 'said', 'fox', 'friend', 'asked', 'policy', 'would', 'implement', 'president', 'get', 'tough', 'terrorism', 'argued', 'country', 'choice', 'follow', 'lead', 'israel', 'israel', 'done', 'unbelievable', 'job', 'profile', 'profile', 'see', 'somebody', 'suspicious', 'said', 'profile', 'take', 'person', 'check', 'choice', 'look', 'going', 'really', 'choice', 'trying', 'politically', 'correct', 'country', 'going', 'get', 'worse', 'trump', 'previously', 'made', 'similar', 'comment', 'orlando', 'nightclub', 'shooting', 'june', 'said', 'interview', 'face', 'nation', 'something', 'n

In [30]:
# One-time setup: uncomment to download the tagger model if it is missing.
#nltk.download('averaged_perceptron_tagger')
# POS-tag the lemmas; the result is a list of (word, tag) tuples.
# NOTE(review): the input here is already lowercased and stripped of stop
# words, and the printed tags look unreliable (e.g. 'elizabeth' -> VBZ,
# 'manhattan' -> JJ) -- presumably tagging should happen on the raw tokens.
pos_sentence = nltk.pos_tag(lemma_sent)

print (pos_sentence)

[('one', 'CD'), ('day', 'NN'), ('explosive', 'JJ'), ('device', 'NN'), ('discovered', 'VBD'), ('manhattan', 'JJ'), ('neighborhood', 'NN'), ('chelsea', 'NN'), ('seaside', 'NN'), ('park', 'NN'), ('elizabeth', 'VBZ'), ('new', 'JJ'), ('jersey', 'JJ'), ('republican', 'JJ'), ('nominee', 'NN'), ('donald', 'NN'), ('trump', 'NN'), ('repeated', 'VBD'), ('call', 'JJ'), ('implement', 'JJ'), ('police', 'NN'), ('profiling', 'VBG'), ('stop', 'JJ'), ('attack', 'NN'), ('united', 'JJ'), ('state', 'NN'), ('local', 'JJ'), ('police', 'NN'), ('know', 'VBD'), ('lot', 'NN'), ('people', 'NNS'), ('afraid', 'VBP'), ('anything', 'NN'), ('want', 'VBP'), ('accused', 'VBN'), ('profiling', 'VBG'), ('want', 'NN'), ('accused', 'VBD'), ('sort', 'JJ'), ('thing', 'NN'), ('trump', 'NN'), ('said', 'VBD'), ('fox', 'JJ'), ('friend', 'NN'), ('asked', 'VBD'), ('policy', 'NN'), ('would', 'MD'), ('implement', 'VB'), ('president', 'NN'), ('get', 'VB'), ('tough', 'JJ'), ('terrorism', 'NN'), ('argued', 'VBD'), ('country', 'NN'), ('ch