# Imports

In [148]:
import pandas as pd
import string
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Data Reading

In [149]:
data_frame = pd.read_csv("../Dataset/articles.csv")
data_frame.head()

# data_frame = data_frame.head(100)

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


# Data Preprocessing
## 1- Handel Missing values
### a- Handel columns with missing values

In [150]:
# Get total number of nulls in each column
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0       0.000
id               0.000
title            0.000
publication      0.000
author          12.612
date             0.000
year             0.000
month            0.000
url            100.000
content          0.000
dtype: float64

In [151]:
# Remove url column because it's empty(100% null values)
data_frame.drop(columns=['url'], inplace=True)
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0      0.000
id              0.000
title           0.000
publication     0.000
author         12.612
date            0.000
year            0.000
month           0.000
content         0.000
dtype: float64

### b- Handel rows with missing values

In [152]:
# Get percentage of number of rows contain missing values
mask = data_frame.isnull().any(axis=1)
rows_with_missing_values = mask.sum() / len(data_frame)*100
print("Number of rows contain null values =", rows_with_missing_values, "%")

Number of rows contain null values = 12.612000000000002 %


In [153]:
# Drop rows contain null values
data_frame.dropna(inplace=True)
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0     0.0
id             0.0
title          0.0
publication    0.0
author         0.0
date           0.0
year           0.0
month          0.0
content        0.0
dtype: float64

## 2- Handel columns data types

In [154]:
data_frame.dtypes

Unnamed: 0       int64
id               int64
title           object
publication     object
author          object
date            object
year           float64
month          float64
content         object
dtype: object

In [155]:
data_frame= data_frame.astype({"title": "string",
                               "publication": "string",
                               "author": "string",
                               "date": "datetime64",
                               "year": "int64",
                               "month": "int64",
                               "content": "string"})

data_frame.dtypes

Unnamed: 0              int64
id                      int64
title                  string
publication            string
author                 string
date           datetime64[ns]
year                    int64
month                   int64
content                string
dtype: object

## 3- Dealing with unnecessary columns

In [156]:
# Drop id column because it contain unnecessary unique values
data_frame.drop(columns=['id'], inplace=True)
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,date,year,month,content
0,0,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016,12,WASHINGTON — Congressional Republicans have...
1,1,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017,6,"After the bullet shells get counted, the blood..."
2,2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017,1,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017,4,"Death may be the great equalizer, but it isn’t..."
4,4,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017,1,"SEOUL, South Korea — North Korea’s leader, ..."


In [157]:
# extract day from date column and rename column to day
data_frame['date'] = data_frame['date'].astype('string')
data_frame['date'] = data_frame['date'].str.replace("-", "/")
data_frame[["year", "month", "day"]] = data_frame["date"].str.split("/", expand = True)
data_frame['date'] = data_frame['day']
data_frame.drop(columns=["day"], inplace=True)
data_frame.rename(columns={"date": "day"}, inplace=True)
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,day,year,month,content
0,0,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,31,2016,12,WASHINGTON — Congressional Republicans have...
1,1,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,19,2017,6,"After the bullet shells get counted, the blood..."
2,2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,6,2017,1,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,10,2017,4,"Death may be the great equalizer, but it isn’t..."
4,4,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2,2017,1,"SEOUL, South Korea — North Korea’s leader, ..."


# Text Preprocessing NLP Pipeline
## 1- Convert to lowercase

In [158]:
columns = ["title", "publication", "author", "content"]
for column in columns:
    data_frame[column] = data_frame[column].str.lower()

data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,day,year,month,content
0,0,house republicans fret about winning their hea...,new york times,carl hulse,31,2016,12,washington — congressional republicans have...
1,1,rift between officers and residents as killing...,new york times,benjamin mueller and al baker,19,2017,6,"after the bullet shells get counted, the blood..."
2,2,"tyrus wong, ‘bambi’ artist thwarted by racial ...",new york times,margalit fox,6,2017,1,"when walt disney’s “bambi” opened in 1942, cri..."
3,3,"among deaths in 2016, a heavy toll in pop musi...",new york times,william mcdonald,10,2017,4,"death may be the great equalizer, but it isn’t..."
4,4,kim jong-un says north korea is preparing to t...,new york times,choe sang-hun,2,2017,1,"seoul, south korea — north korea’s leader, ..."


## 2- Remove HTML Tags

In [159]:
def remove_HTML_tags(text):
    return re.sub(r'<.*?>', "", text)

columns = ["title", "author", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: remove_HTML_tags(x))

## 3- Remove URLs

In [160]:
def remove_URLs(text):
    return re.sub(r'https?://\S+www\.\S+', "", text)

columns = ["title", "author", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: remove_URLs(x))

## 4- Remove Punctuations and Special Characters

In [161]:
# columns = ["title", "author", "content"]
columns = ["title", "content"]
for column in columns:
    data_frame[column] = data_frame[column].str.replace('[^a-zA-Z0-9]', " ", regex=True)

data_frame[['title', 'content']]

Unnamed: 0,title,content
0,house republicans fret about winning their hea...,washington congressional republicans have...
1,rift between officers and residents as killing...,after the bullet shells get counted the blood...
2,tyrus wong bambi artist thwarted by racial ...,when walt disney s bambi opened in 1942 cri...
3,among deaths in 2016 a heavy toll in pop musi...,death may be the great equalizer but it isn t...
4,kim jong un says north korea is preparing to t...,seoul south korea north korea s leader ...
...,...,...
49995,rex tillerson says climate change is real but,as chairman and ceo of exxonmobil rex tillers...
49996,the biggest intelligence questions raised by t...,i ve spent nearly 20 years looking at intellig...
49997,trump announces plan that does little to resol...,donald trump will not be taking necessary st...
49998,dozens of for profit colleges could soon close,dozens of colleges could be forced to close ...


## 5- Remove unnecessary spaces and words

In [162]:
# remove unnecessary spaces
columns = ["title", "author", "content"]
for column in columns:
    data_frame[column] = data_frame[column].str.replace('\s\s+', " ", regex=True)

data_frame[['title', 'content']]

Unnamed: 0,title,content
0,house republicans fret about winning their hea...,washington congressional republicans have a ne...
1,rift between officers and residents as killing...,after the bullet shells get counted the blood ...
2,tyrus wong bambi artist thwarted by racial bia...,when walt disney s bambi opened in 1942 critic...
3,among deaths in 2016 a heavy toll in pop music...,death may be the great equalizer but it isn t ...
4,kim jong un says north korea is preparing to t...,seoul south korea north korea s leader kim sai...
...,...,...
49995,rex tillerson says climate change is real but,as chairman and ceo of exxonmobil rex tillerso...
49996,the biggest intelligence questions raised by t...,i ve spent nearly 20 years looking at intellig...
49997,trump announces plan that does little to resol...,donald trump will not be taking necessary ste...
49998,dozens of for profit colleges could soon close,dozens of colleges could be forced to close in...


In [163]:
# remove unnecessary words
# remove publication from title
publication_unique_values = data_frame["publication"].unique()
for publication in publication_unique_values:
    data_frame["title"] = data_frame['title'].str.replace("." + publication + "$", "", regex=True)

data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,day,year,month,content
0,0,house republicans fret about winning their hea...,new york times,carl hulse,31,2016,12,washington congressional republicans have a ne...
1,1,rift between officers and residents as killing...,new york times,benjamin mueller and al baker,19,2017,6,after the bullet shells get counted the blood ...
2,2,tyrus wong bambi artist thwarted by racial bia...,new york times,margalit fox,6,2017,1,when walt disney s bambi opened in 1942 critic...
3,3,among deaths in 2016 a heavy toll in pop music...,new york times,william mcdonald,10,2017,4,death may be the great equalizer but it isn t ...
4,4,kim jong un says north korea is preparing to t...,new york times,choe sang-hun,2,2017,1,seoul south korea north korea s leader kim sai...


## 6- Apply Tokenization
### a- Apply sentence tokenization

In [164]:
# def sentence_tokenizer(text):
#     return sent_tokenize(text)
#
# columns = ["title", "content"]
# for column in columns:
#     data_frame[column] = data_frame[column].apply(lambda x: sentence_tokenizer(x))
#
# data_frame[['title', 'content']]

### b- Apply word tokenization

In [165]:
# def word_tokenizer(sentences):
#     tokenized_words = []
#     for x in sentences:
#         tokenized_words = tokenized_words + word_tokenize(x)
#     return tokenized_words

def word_tokenizer(text):
    return word_tokenize(text)
columns = ["title", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: word_tokenizer(x))

data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,day,year,month,content
0,0,"[house, republicans, fret, about, winning, the...",new york times,carl hulse,31,2016,12,"[washington, congressional, republicans, have,..."
1,1,"[rift, between, officers, and, residents, as, ...",new york times,benjamin mueller and al baker,19,2017,6,"[after, the, bullet, shells, get, counted, the..."
2,2,"[tyrus, wong, bambi, artist, thwarted, by, rac...",new york times,margalit fox,6,2017,1,"[when, walt, disney, s, bambi, opened, in, 194..."
3,3,"[among, deaths, in, 2016, a, heavy, toll, in, ...",new york times,william mcdonald,10,2017,4,"[death, may, be, the, great, equalizer, but, i..."
4,4,"[kim, jong, un, says, north, korea, is, prepar...",new york times,choe sang-hun,2,2017,1,"[seoul, south, korea, north, korea, s, leader,..."


## 7- Remove Stop Words

In [166]:
STOPWORDS = set(stopwords.words("english"))
def remove_stop_words(text):
    return [word for word in text if word not in STOPWORDS]

columns = ["title", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: remove_stop_words(x))

data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,day,year,month,content
0,0,"[house, republicans, fret, winning, health, ca...",new york times,carl hulse,31,2016,12,"[washington, congressional, republicans, new, ..."
1,1,"[rift, officers, residents, killings, persist,...",new york times,benjamin mueller and al baker,19,2017,6,"[bullet, shells, get, counted, blood, dries, v..."
2,2,"[tyrus, wong, bambi, artist, thwarted, racial,...",new york times,margalit fox,6,2017,1,"[walt, disney, bambi, opened, 1942, critics, p..."
3,3,"[among, deaths, 2016, heavy, toll, pop, music]",new york times,william mcdonald,10,2017,4,"[death, may, great, equalizer, necessarily, ev..."
4,4,"[kim, jong, un, says, north, korea, preparing,...",new york times,choe sang-hun,2,2017,1,"[seoul, south, korea, north, korea, leader, ki..."


## 8- Remove frequently words

In [167]:
word_counter_title = Counter()
word_counter_content = Counter()
def get_most_frequently_words(column_name, word_counter):
    for text in data_frame[column_name]:
        for word in text:
            word_counter[word] += 1
    return word_counter

# get the most common frequently words in [title] more than 1000
word_frequency_title = get_most_frequently_words("title", word_counter_title).most_common(10)

# get the most common frequently words in [content]
word_frequency_content = get_most_frequently_words("content", word_counter_content).most_common(10)

In [168]:
FREQUENT_WORDS_TITLE = set(word for (word, word_count) in get_most_frequently_words("title", word_counter_title).most_common(10))
FREQUENT_WORDS_CONTENT = set(word for (word, word_count) in get_most_frequently_words("content", word_counter_content).most_common(10))

def remove_frequent_words(string_text, frequent_words):
    return [word for word in string_text if word not in frequent_words]

columns = ["title", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: remove_frequent_words(x, FREQUENT_WORDS_TITLE))

data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,day,year,month,content
0,0,"[house, republicans, fret, winning, health, ca...",new york times,carl hulse,31,2016,12,"[washington, congressional, republicans, fear,..."
1,1,"[rift, officers, residents, killings, persist,...",new york times,benjamin mueller and al baker,19,2017,6,"[bullet, shells, get, counted, blood, dries, v..."
2,2,"[tyrus, wong, bambi, artist, thwarted, racial,...",new york times,margalit fox,6,2017,1,"[walt, disney, bambi, opened, 1942, critics, p..."
3,3,"[among, deaths, 2016, heavy, toll, pop, music]",new york times,william mcdonald,10,2017,4,"[death, may, great, equalizer, necessarily, ev..."
4,4,"[kim, jong, un, north, korea, preparing, test,...",new york times,choe sang-hun,2,2017,1,"[seoul, south, korea, north, korea, leader, ki..."


## 9- Remove rare words

In [169]:
RARE_WORDS_TITLE = set(word for (word, word_count) in get_most_frequently_words("title", word_counter_title).most_common()[:-10:-1])
RARE_WORDS_CONTENT = set(word for (word, word_count) in get_most_frequently_words("content", word_counter_content).most_common()[:-10:-1])

def remove_rare_words(string_text, rare_words):
    return [word for word in string_text if word not in rare_words]

columns = ["title", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: remove_rare_words(x, RARE_WORDS_TITLE))

data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,day,year,month,content
0,0,"[house, republicans, fret, winning, health, ca...",new york times,carl hulse,31,2016,12,"[washington, congressional, republicans, fear,..."
1,1,"[rift, officers, residents, killings, persist,...",new york times,benjamin mueller and al baker,19,2017,6,"[bullet, shells, get, counted, blood, dries, v..."
2,2,"[tyrus, wong, bambi, artist, thwarted, racial,...",new york times,margalit fox,6,2017,1,"[walt, disney, bambi, opened, 1942, critics, p..."
3,3,"[among, deaths, 2016, heavy, toll, pop, music]",new york times,william mcdonald,10,2017,4,"[death, may, great, equalizer, necessarily, ev..."
4,4,"[kim, jong, un, north, korea, preparing, test,...",new york times,choe sang-hun,2,2017,1,"[seoul, south, korea, north, korea, leader, ki..."


## 10- Apply Stemming

In [170]:
# porter_stemmer_title = PorterStemmer()
# porter_stemmer_content = PorterStemmer()
# def stemming_words(text, porter_stemmer):
#     return [porter_stemmer.stem(word) for word in text]
#
# columns = ["title", "content"]
# for column in columns:
#     data_frame[column] = data_frame[column].apply(lambda x: stemming_words(x, porter_stemmer_title))
#
# data_frame.head()

## 11- Apply POS and Lemmatization

In [171]:
# lemmatizer_title = WordNetLemmatizer()
# lemmatizer_content = WordNetLemmatizer()
# wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}
#
# def lemmatize_words(text, lemmatizer):
#     # apply POS tagging
#     pos_text = pos_tag(text)
#     return [lemmatizer.lemmatize(word, wordnet_map.get(pos[0],  wordnet.NOUN)) for word, pos in pos_text]
#
# columns = ["title", "content"]
# for column in columns:
#     data_frame[column] = data_frame[column].apply(lambda x: lemmatize_words(x, lemmatizer_title))

# Data Preprocessing
## 1- Dealing with categorical data (Nominal Data)

In [172]:
# get number of unique values in publication column
publication_unique_values = data_frame["publication"].unique()
print("Publication unique values = ", len(publication_unique_values))

Publication unique values =  5


In [173]:
# apply one hot encoding in publication column
data_frame = pd.get_dummies(data=data_frame, columns=["publication"])
data_frame.dtypes

Unnamed: 0                       int64
title                           object
author                          object
day                             string
year                            string
month                           string
content                         object
publication_atlantic             uint8
publication_breitbart            uint8
publication_business insider     uint8
publication_cnn                  uint8
publication_new york times       uint8
dtype: object

In [174]:
data_frame["author"] = data_frame["author"].str.replace('[a-z]{1}\.|\(.*\)', " ", regex=True)
# get number of unique values in author column
author_unique_values = Counter()
for text in data_frame["author"]:
    for author_name in re.split('with|and|,|&', text):
        name = author_name.strip()
        # name = re.sub(r'^.\s\s+|\s\s+.$', "", name)
        name = " ".join(name.split())
        if len(name) > 1:
            author_unique_values[name] += 1

print("Author unique values = ", len(author_unique_values))

Author unique values =  2353


In [175]:
def split_authors(text):
    return ','.join([word.strip() for word in re.split('with|and|,|&', text)])

data_frame["author"] = data_frame["author"].apply(lambda x: split_authors(x))

for x in list(author_unique_values.keys()):
    print("*"+x+"*")

*carl hulse*
*benjamin mueller*
*al baker*
*margalit fox*
*william mcdonald*
*choe sang-hun*
*sewell chan*
*javier hernández*
*gina kolata*
*katherine rosman*
*y newman*
*justin gillis*
*john schwartz*
*maggie haberman*
*charles duhigg*
*stephanie rosenbloom*
*emma fitzsimmons*
*kevin sack*
*alan blinder*
*geeta an*
*the associated press*
*brett cole*
*benjamin hoffman*
*patrick healy*
*marlise simons*
*guy trebay*
*jacob bernstein*
*jennifer steinhauer*
*charles mcdermid*
*sheri fink*
*helene cooper*
*tim arango*
*mark l*
*ler*
*michelle higgins*
*isabel kershner*
*alissa rubin*
*ian austen*
*simon romero*
*kenneth chang*
*corey kilgannon*
*jennifer schuessler*
*john otis*
*eric lipton*
*matt flegenheimer*
*sean alfano*
*robert pear*
*binyamin appelbaum*
*scott shane*
*julie hirschfeld davis*
*adam nagourney*
*gretchen reynolds*
*damon darlin*
*jim rutenberg*
*john koblin*
*michael grynbaum*
*chris buckley*
*farhad manjoo*
*adam wu*
*beverly gage*
*vincent mallozzi*
*john grippe*
*tho

In [176]:
# apply mapping in author
# for index, author in enumerate(unique_authors):
#     mapping_author[author] = index
#
# data_frame = data_frame.replace ({
#     "author": mapping_author
# })
# data_frame.head()

# Feature Extraction
## ⚫ Apply TF-IDF

In [177]:
X = data_frame
Y = data_frame
# split data
X_train, X_test, Y_train, Y_test = train_test_split(X,Y ,
                                   random_state=104,
                                   test_size=0.25,
                                   shuffle=True)




In [178]:
TF_IDF = TfidfVectorizer(stop_words='english')

# fit data
TF_IDF.fit(data_frame["title"])

AttributeError: 'list' object has no attribute 'lower'