# Sentiment Classification - Data Preprocessing

In [1]:
# Dependencies
import re

import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

#nltk.download('wordnet')
stop_words = stopwords.words('english')
stop_words.append('us')
lemmatizer = WordNetLemmatizer()

## Combine Datasets

In [2]:
# Load the formatted article dataset and key it by article URL
df1 = pd.read_json('../data_formatting/data_formatted.json').set_index('url')
df1.head()

Unnamed: 0_level_0,keyword,source,author,title,published,compound_score,negative_score,positive_score,neutral_score,text_excerpt,text_complete
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
https://www.nbcnews.com/think/opinion/biden-s-immigration-plan-must-reform-daca-cover-dreamers-whose-ncna1248885,immigration,NBC News,Dip Patel,I could be deported because my parents came he...,2020-12-04T09:32:20Z,0.1027,0.0,0.104,0.896,"As an inpatient hospital pharmacist, I am part...","As an inpatient hospital pharmacist, I am part..."
https://www.nbcnews.com/news/latino/first-latino-tapped-head-dhs-signals-shift-trump-s-hard-n1248716,immigration,NBC News,Suzanne Gamboa,First Latino tapped to head DHS signals shift ...,2020-11-23T22:41:00Z,0.0,0.0,0.0,1.0,"Alejandro Mayorkas, the first Latino chosen fo...","Alejandro Mayorkas, the first Latino chosen fo..."
https://www.nbcnews.com/news/asian-america/tony-pham-interim-director-immigration-customs-enforcement-step-down-n1251159,immigration,NBC News,Cynthia Silva,"Tony Pham, interim director of Immigration and...",2020-12-14T22:28:21Z,0.0,0.0,0.0,1.0,Tony Pham will be stepping down as the acting ...,Tony Pham will be stepping down as the acting ...
https://www.nbcnews.com/politics/immigration/sixty-nine-percent-undocumented-immigrant-workers-have-jobs-essential-fighting-n1251390,immigration,NBC News,Julia Ainsley and Didi Martinez,More than two-thirds of undocumented immigrant...,2020-12-16T22:46:58Z,-0.3818,0.157,0.0,0.843,WASHINGTON More than two-thirds of undocumente...,WASHINGTON — More than two-thirds of undocumen...
https://www.nbcnews.com/politics/politics-news/biden-meet-workers-small-business-owners-economic-crisis-n1249668,immigration,NBC News,"Rebecca Shabad, Adam Edelman","Biden to meet with struggling workers, small-b...",2020-12-02T21:56:00Z,-0.7845,0.365,0.0,0.635,President-elect Joe Biden on Wednesday used a ...,President-elect Joe Biden on Wednesday used a ...


In [3]:
# Load the human sentiment labels as a Series keyed by article URL
human_ratings = pd.read_csv('../data_formatting/urls_human_sentiment.csv')
df2 = human_ratings.set_index('url')['sentiment_human']
df2.head()

url
https://www.nbcnews.com/think/opinion/biden-s-immigration-plan-must-reform-daca-cover-dreamers-whose-ncna1248885                       -1
https://www.nbcnews.com/news/latino/first-latino-tapped-head-dhs-signals-shift-trump-s-hard-n1248716                                    0
https://www.nbcnews.com/news/asian-america/tony-pham-interim-director-immigration-customs-enforcement-step-down-n1251159                0
https://www.nbcnews.com/politics/immigration/sixty-nine-percent-undocumented-immigrant-workers-have-jobs-essential-fighting-n1251390    0
https://www.nbcnews.com/politics/politics-news/biden-meet-workers-small-business-owners-economic-crisis-n1249668                       -1
Name: sentiment_human, dtype: int64

In [4]:
# Attach the human rating to each article. Both inputs are indexed by URL,
# and the inner join keeps only URLs present in both; `url` is then turned
# back into an ordinary column.
rated_df = df1.join(df2, how='inner').reset_index()
rated_df.head()

Unnamed: 0,url,keyword,source,author,title,published,compound_score,negative_score,positive_score,neutral_score,text_excerpt,text_complete,sentiment_human
0,https://www.nbcnews.com/think/opinion/biden-s-...,immigration,NBC News,Dip Patel,I could be deported because my parents came he...,2020-12-04T09:32:20Z,0.1027,0.0,0.104,0.896,"As an inpatient hospital pharmacist, I am part...","As an inpatient hospital pharmacist, I am part...",-1
1,https://www.nbcnews.com/news/latino/first-lati...,immigration,NBC News,Suzanne Gamboa,First Latino tapped to head DHS signals shift ...,2020-11-23T22:41:00Z,0.0,0.0,0.0,1.0,"Alejandro Mayorkas, the first Latino chosen fo...","Alejandro Mayorkas, the first Latino chosen fo...",0
2,https://www.nbcnews.com/news/asian-america/ton...,immigration,NBC News,Cynthia Silva,"Tony Pham, interim director of Immigration and...",2020-12-14T22:28:21Z,0.0,0.0,0.0,1.0,Tony Pham will be stepping down as the acting ...,Tony Pham will be stepping down as the acting ...,0
3,https://www.nbcnews.com/politics/immigration/s...,immigration,NBC News,Julia Ainsley and Didi Martinez,More than two-thirds of undocumented immigrant...,2020-12-16T22:46:58Z,-0.3818,0.157,0.0,0.843,WASHINGTON More than two-thirds of undocumente...,WASHINGTON — More than two-thirds of undocumen...,0
4,https://www.nbcnews.com/politics/politics-news...,immigration,NBC News,"Rebecca Shabad, Adam Edelman","Biden to meet with struggling workers, small-b...",2020-12-02T21:56:00Z,-0.7845,0.365,0.0,0.635,President-elect Joe Biden on Wednesday used a ...,President-elect Joe Biden on Wednesday used a ...,-1


In [5]:
# Make sure no ratings are missing.
# The assertion makes a Run All stop here instead of relying on someone
# reading the counts below.
non_null_counts = rated_df.count()
missing_ratings = len(rated_df) - non_null_counts['sentiment_human']
assert missing_ratings == 0, f"{missing_ratings} rows have no human sentiment rating"
non_null_counts

url                3093
keyword            3093
source             3093
author             3080
title              3093
published          3093
compound_score     3093
negative_score     3093
positive_score     3093
neutral_score      3093
text_excerpt       3093
text_complete      3093
sentiment_human    3093
dtype: int64

In [6]:
# Check that ratings are correct values.
# Labels are expected to be -1, 0 or 1; anything else (including NaN) fails
# the assertion so bad labels cannot flow into the later cells unnoticed.
VALID_RATINGS = {-1, 0, 1}
observed_ratings = rated_df['sentiment_human'].unique()
unexpected_ratings = set(observed_ratings) - VALID_RATINGS
assert not unexpected_ratings, f"Unexpected sentiment_human values: {unexpected_ratings}"
observed_ratings

array([-1,  0,  1], dtype=int64)

## Text Cleaning
* Tokenize
* Remove stopwords
* Lemmatize

##### To Try Later
* Create bigrams and/or other n-grams

In [7]:
def tokenize_headline(title):
    """Split a headline into lowercase word tokens, dropping punctuation.

    Parameters
    ----------
    title : str
        Raw headline text. Any non-string value (for example ``None`` or the
        float NaN pandas uses for a missing cell) is treated as an empty
        headline.

    Returns
    -------
    list of str
        The lowercased runs of word characters, in their original order.
        Stop words are kept here.
    """
    # A missing title has nothing to tokenize; return an empty list rather
    # than letting the regex call raise TypeError part-way through an apply().
    if not isinstance(title, str):
        return []

    # `\w+` keeps letters, digits and underscores only, so punctuation is
    # discarded and apostrophes/hyphens split words ("can't" -> "can", "t").
    # This is the same matching RegexpTokenizer(r"\w+") performed, without
    # constructing a tokenizer object on every call.
    return [token.lower() for token in re.findall(r"\w+", title)]

In [8]:
# Tokenize every headline; the function is passed directly, no lambda needed
rated_df['tokens'] = rated_df['title'].apply(tokenize_headline)
rated_df['tokens']

0       [i, could, be, deported, because, my, parents,...
1       [first, latino, tapped, to, head, dhs, signals...
2       [tony, pham, interim, director, of, immigratio...
3       [more, than, two, thirds, of, undocumented, im...
4       [biden, to, meet, with, struggling, workers, s...
                              ...                        
3088    [democrats, aim, to, flip, senate, after, 6, y...
3089    [america, delivers, its, verdict, on, donald, ...
3090    [federal, judge, says, us, can, t, turn, away,...
3091    [5, things, to, know, for, november, 13, covid...
3092    [at, least, three, dead, including, children, ...
Name: tokens, Length: 3093, dtype: object

In [9]:
# Source https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own (a one-item list is handed to
    ``nltk.pos_tag``) and only the first letter of the resulting tag is used:
    J, N, V and R select adjective, noun, verb and adverb respectively, and
    any other letter falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    leading_letter = tagged[0][1][0].upper()

    if leading_letter == "J":
        return wordnet.ADJ
    if leading_letter == "V":
        return wordnet.VERB
    if leading_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised letter both resolve to noun
    return wordnet.NOUN


In [10]:
# Lemmatize tokens and remove stopwords

# Membership is tested once per token, so look words up in a set instead of
# scanning the `stop_words` list each time. The result is unchanged.
stop_word_set = set(stop_words)


def lemmatize_tokens(tokens):
    """Drop stop words from ``tokens`` and lemmatize the remaining words.

    Parameters
    ----------
    tokens : list of str
        Lowercased word tokens for one headline.

    Returns
    -------
    list of str
        One lemma per surviving token, in the original order. Each token is
        lemmatized with the part of speech reported by ``get_wordnet_pos``.
    """
    # NOTE(review): get_wordnet_pos tags each word in isolation. The recorded
    # output still contains forms such as "tapped" and "delivers", which
    # suggests tagging the whole headline in context may lemmatize better.
    return [
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in tokens
        if token not in stop_word_set
    ]


rated_df['tokens_lems'] = rated_df['tokens'].apply(lemmatize_tokens)
rated_df[['tokens', 'tokens_lems']]

Unnamed: 0,tokens,tokens_lems
0,"[i, could, be, deported, because, my, parents,...","[could, deport, parent, come, legally, biden, ..."
1,"[first, latino, tapped, to, head, dhs, signals...","[first, latino, tapped, head, dhs, signal, shi..."
2,"[tony, pham, interim, director, of, immigratio...","[tony, pham, interim, director, immigration, c..."
3,"[more, than, two, thirds, of, undocumented, im...","[two, third, undocumented, immigrant, worker, ..."
4,"[biden, to, meet, with, struggling, workers, s...","[biden, meet, struggle, worker, small, busines..."
...,...,...
3088,"[democrats, aim, to, flip, senate, after, 6, y...","[democrat, aim, flip, senate, 6, year, republi..."
3089,"[america, delivers, its, verdict, on, donald, ...","[america, delivers, verdict, donald, trump]"
3090,"[federal, judge, says, us, can, t, turn, away,...","[federal, judge, say, turn, away, unaccompanie..."
3091,"[5, things, to, know, for, november, 13, covid...","[5, thing, know, november, 13, covid, transiti..."


## TF-IDF (using TfidfVectorizer's count-based vocabulary rather than feature hashing)

In [11]:
# Store all words in a list.
# Iterating the column directly avoids building a full row Series per
# headline (as iterrows() does) just to read one field.
all_words = [word for words in rated_df['tokens_lems'] for word in words]

# Determine number of unique words
unique_words = set(all_words)
len(unique_words)

5372

In [12]:
# Rebuild one space-separated string per headline from its lemmatized tokens
rated_df['cleaned_headlines'] = rated_df['tokens_lems'].apply(" ".join)
rated_df.head()

Unnamed: 0,url,keyword,source,author,title,published,compound_score,negative_score,positive_score,neutral_score,text_excerpt,text_complete,sentiment_human,tokens,tokens_lems,cleaned_headlines
0,https://www.nbcnews.com/think/opinion/biden-s-...,immigration,NBC News,Dip Patel,I could be deported because my parents came he...,2020-12-04T09:32:20Z,0.1027,0.0,0.104,0.896,"As an inpatient hospital pharmacist, I am part...","As an inpatient hospital pharmacist, I am part...",-1,"[i, could, be, deported, because, my, parents,...","[could, deport, parent, come, legally, biden, ...",could deport parent come legally biden change
1,https://www.nbcnews.com/news/latino/first-lati...,immigration,NBC News,Suzanne Gamboa,First Latino tapped to head DHS signals shift ...,2020-11-23T22:41:00Z,0.0,0.0,0.0,1.0,"Alejandro Mayorkas, the first Latino chosen fo...","Alejandro Mayorkas, the first Latino chosen fo...",0,"[first, latino, tapped, to, head, dhs, signals...","[first, latino, tapped, head, dhs, signal, shi...",first latino tapped head dhs signal shift trum...
2,https://www.nbcnews.com/news/asian-america/ton...,immigration,NBC News,Cynthia Silva,"Tony Pham, interim director of Immigration and...",2020-12-14T22:28:21Z,0.0,0.0,0.0,1.0,Tony Pham will be stepping down as the acting ...,Tony Pham will be stepping down as the acting ...,0,"[tony, pham, interim, director, of, immigratio...","[tony, pham, interim, director, immigration, c...",tony pham interim director immigration custom ...
3,https://www.nbcnews.com/politics/immigration/s...,immigration,NBC News,Julia Ainsley and Didi Martinez,More than two-thirds of undocumented immigrant...,2020-12-16T22:46:58Z,-0.3818,0.157,0.0,0.843,WASHINGTON More than two-thirds of undocumente...,WASHINGTON — More than two-thirds of undocumen...,0,"[more, than, two, thirds, of, undocumented, im...","[two, third, undocumented, immigrant, worker, ...",two third undocumented immigrant worker job es...
4,https://www.nbcnews.com/politics/politics-news...,immigration,NBC News,"Rebecca Shabad, Adam Edelman","Biden to meet with struggling workers, small-b...",2020-12-02T21:56:00Z,-0.7845,0.365,0.0,0.635,President-elect Joe Biden on Wednesday used a ...,President-elect Joe Biden on Wednesday used a ...,-1,"[biden, to, meet, with, struggling, workers, s...","[biden, meet, struggle, worker, small, busines...",biden meet struggle worker small business owne...


In [13]:
# Generate TF-IDF representation of the cleaned headlines

# Upper bound on the vocabulary size kept by the vectorizer.
# NOTE(review): the recorded run shows 5372 unique lemmas, which is under
# this cap, so it likely had no effect - confirm.
MAX_TFIDF_FEATURES = 6000

# NOTE(review): the vectorizer is fit on every rated headline, before the
# train/test split planned at the end of this notebook - confirm that is
# intended, or fit on the training rows only.
headline_corpus = rated_df['cleaned_headlines']
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
tfidf = vectorizer.fit_transform(headline_corpus)

In [15]:
# Convert the TF-IDF matrix to a dense array, then to one Python list of
# weights per headline, and wrap those lists in a Series named 'tfidf'.
tfidf_array = tfidf.toarray()
tf_list = tfidf_array.tolist()
tfidf_series = pd.Series(tf_list, name='tfidf')

In [23]:
# Attach the per-headline TF-IDF lists as a new column next to the rest of
# the data (joined on the row index)
tfidf_df = rated_df.join(tfidf_series)
tfidf_df.head()

Unnamed: 0,url,keyword,source,author,title,published,compound_score,negative_score,positive_score,neutral_score,text_excerpt,text_complete,sentiment_human,tokens,tokens_lems,cleaned_headlines,tfidf
0,https://www.nbcnews.com/think/opinion/biden-s-...,immigration,NBC News,Dip Patel,I could be deported because my parents came he...,2020-12-04T09:32:20Z,0.1027,0.0,0.104,0.896,"As an inpatient hospital pharmacist, I am part...","As an inpatient hospital pharmacist, I am part...",-1,"[i, could, be, deported, because, my, parents,...","[could, deport, parent, come, legally, biden, ...",could deport parent come legally biden change,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,https://www.nbcnews.com/news/latino/first-lati...,immigration,NBC News,Suzanne Gamboa,First Latino tapped to head DHS signals shift ...,2020-11-23T22:41:00Z,0.0,0.0,0.0,1.0,"Alejandro Mayorkas, the first Latino chosen fo...","Alejandro Mayorkas, the first Latino chosen fo...",0,"[first, latino, tapped, to, head, dhs, signals...","[first, latino, tapped, head, dhs, signal, shi...",first latino tapped head dhs signal shift trum...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,https://www.nbcnews.com/news/asian-america/ton...,immigration,NBC News,Cynthia Silva,"Tony Pham, interim director of Immigration and...",2020-12-14T22:28:21Z,0.0,0.0,0.0,1.0,Tony Pham will be stepping down as the acting ...,Tony Pham will be stepping down as the acting ...,0,"[tony, pham, interim, director, of, immigratio...","[tony, pham, interim, director, immigration, c...",tony pham interim director immigration custom ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,https://www.nbcnews.com/politics/immigration/s...,immigration,NBC News,Julia Ainsley and Didi Martinez,More than two-thirds of undocumented immigrant...,2020-12-16T22:46:58Z,-0.3818,0.157,0.0,0.843,WASHINGTON More than two-thirds of undocumente...,WASHINGTON — More than two-thirds of undocumen...,0,"[more, than, two, thirds, of, undocumented, im...","[two, third, undocumented, immigrant, worker, ...",two third undocumented immigrant worker job es...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,https://www.nbcnews.com/politics/politics-news...,immigration,NBC News,"Rebecca Shabad, Adam Edelman","Biden to meet with struggling workers, small-b...",2020-12-02T21:56:00Z,-0.7845,0.365,0.0,0.635,President-elect Joe Biden on Wednesday used a ...,President-elect Joe Biden on Wednesday used a ...,-1,"[biden, to, meet, with, struggling, workers, s...","[biden, meet, struggle, worker, small, busines...",biden meet struggle worker small business owne...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Add Other Features
* Sentiment values (customize dictionary - negative words include "nazi", "hitler", and "slam")
* Newspaper bias
* Newspaper

## Split Dataset into Testing/Training
Save these to CSV files