# Imports

In [351]:
import pandas as pd
import string
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Data Reading

In [352]:
data_frame = pd.read_csv("../Dataset/articles.csv")
data_frame.head()

data_frame = data_frame.head(100)

# Data Preprocessing
## 1- Handel Missing values
### a- Handel columns with missing values

In [353]:
# Get total number of nulls in each column
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0       0.0
id               0.0
title            0.0
publication      0.0
author           2.0
date             0.0
year             0.0
month            0.0
url            100.0
content          0.0
dtype: float64

In [354]:
# Remove url column because it's empty(100% null values)
data_frame.drop(columns=['url'], inplace=True)
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0     0.0
id             0.0
title          0.0
publication    0.0
author         2.0
date           0.0
year           0.0
month          0.0
content        0.0
dtype: float64

### b- Handel rows with missing values

In [355]:
# Get percentage of number of rows contain missing values
mask = data_frame.isnull().any(axis=1)
rows_with_missing_values = mask.sum() / len(data_frame)*100
print("Number of rows contain null values =", rows_with_missing_values, "%")

Number of rows contain null values = 2.0 %


In [356]:
# Drop rows contain null values
data_frame.dropna(inplace=True)
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0     0.0
id             0.0
title          0.0
publication    0.0
author         0.0
date           0.0
year           0.0
month          0.0
content        0.0
dtype: float64

## 2- Handel columns data types

In [357]:
data_frame.dtypes

Unnamed: 0       int64
id               int64
title           object
publication     object
author          object
date            object
year           float64
month          float64
content         object
dtype: object

In [358]:
data_frame= data_frame.astype({"title": "string",
                               "publication": "string",
                               "author": "string",
                               "date": "datetime64",
                               "year": "int64",
                               "month": "int64",
                               "content": "string"})

data_frame.dtypes

Unnamed: 0              int64
id                      int64
title                  string
publication            string
author                 string
date           datetime64[ns]
year                    int64
month                   int64
content                string
dtype: object

## 3- Dealing with unnecessary columns

In [359]:
# Drop id column because it contain unnecessary unique values
data_frame.drop(columns=['id'], inplace=True)
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,date,year,month,content
0,0,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016,12,WASHINGTON — Congressional Republicans have...
1,1,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017,6,"After the bullet shells get counted, the blood..."
2,2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017,1,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017,4,"Death may be the great equalizer, but it isn’t..."
4,4,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017,1,"SEOUL, South Korea — North Korea’s leader, ..."


In [360]:
# extract day from date column and rename column to day
data_frame['date'] = data_frame['date'].astype('string')
data_frame['date'] = data_frame['date'].str.replace("-", "/")
data_frame[["year", "month", "day"]] = data_frame["date"].str.split("/", expand = True)
data_frame['date'] = data_frame['day']
data_frame.drop(columns=["day"], inplace=True)
data_frame.rename(columns={"date": "day"}, inplace=True)
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,day,year,month,content
0,0,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,31,2016,12,WASHINGTON — Congressional Republicans have...
1,1,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,19,2017,6,"After the bullet shells get counted, the blood..."
2,2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,6,2017,1,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,10,2017,4,"Death may be the great equalizer, but it isn’t..."
4,4,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2,2017,1,"SEOUL, South Korea — North Korea’s leader, ..."


# Text Preprocessing NLP Pipeline
## 1- Convert to lowercase

In [361]:
columns = ["title", "publication", "author", "content"]
for column in columns:
    data_frame[column] = data_frame[column].str.lower()

data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,day,year,month,content
0,0,house republicans fret about winning their hea...,new york times,carl hulse,31,2016,12,washington — congressional republicans have...
1,1,rift between officers and residents as killing...,new york times,benjamin mueller and al baker,19,2017,6,"after the bullet shells get counted, the blood..."
2,2,"tyrus wong, ‘bambi’ artist thwarted by racial ...",new york times,margalit fox,6,2017,1,"when walt disney’s “bambi” opened in 1942, cri..."
3,3,"among deaths in 2016, a heavy toll in pop musi...",new york times,william mcdonald,10,2017,4,"death may be the great equalizer, but it isn’t..."
4,4,kim jong-un says north korea is preparing to t...,new york times,choe sang-hun,2,2017,1,"seoul, south korea — north korea’s leader, ..."


## 2- Remove HTML Tags

In [362]:
def remove_HTML_tags(text):
    return re.sub(r'<.*?>', "", text)

columns = ["title", "author", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: remove_HTML_tags(x))

## 3- Remove URLs

In [363]:
def remove_URLs(text):
    return re.sub(r'https?://\S+www\.\S+', "", text)

columns = ["title", "author", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: remove_URLs(x))

## 4- Remove Punctuations and Special Characters

In [364]:
columns = ["title", "author", "content"]
for column in columns:
    data_frame[column] = data_frame[column].str.replace('[^a-zA-Z0-9]', " ", regex=True)

data_frame[['title', 'content']]

Unnamed: 0,title,content
0,house republicans fret about winning their hea...,washington congressional republicans have...
1,rift between officers and residents as killing...,after the bullet shells get counted the blood...
2,tyrus wong bambi artist thwarted by racial ...,when walt disney s bambi opened in 1942 cri...
3,among deaths in 2016 a heavy toll in pop musi...,death may be the great equalizer but it isn t...
4,kim jong un says north korea is preparing to t...,seoul south korea north korea s leader ...
...,...,...
95,corzine reaches 5 million settlement with reg...,after more than five years of investigations a...
96,sears agrees to sell craftsman to stanley blac...,the question from the analyst on thursday was ...
97,l i r r train that crashed was going over twi...,a long island rail road train that crashed in ...
98,record 2016 for u s auto industry long road b...,detroit unexpectedly strong sales of new ...


## 5- Remove unnecessary spaces and words

In [365]:
# remove unnecessary spaces
columns = ["title", "author", "content"]
for column in columns:
    data_frame[column] = data_frame[column].str.replace('\s\s+', " ", regex=True)

data_frame[['title', 'content']]

Unnamed: 0,title,content
0,house republicans fret about winning their hea...,washington congressional republicans have a ne...
1,rift between officers and residents as killing...,after the bullet shells get counted the blood ...
2,tyrus wong bambi artist thwarted by racial bia...,when walt disney s bambi opened in 1942 critic...
3,among deaths in 2016 a heavy toll in pop music...,death may be the great equalizer but it isn t ...
4,kim jong un says north korea is preparing to t...,seoul south korea north korea s leader kim sai...
...,...,...
95,corzine reaches 5 million settlement with regu...,after more than five years of investigations a...
96,sears agrees to sell craftsman to stanley blac...,the question from the analyst on thursday was ...
97,l i r r train that crashed was going over twic...,a long island rail road train that crashed in ...
98,record 2016 for u s auto industry long road ba...,detroit unexpectedly strong sales of new vehic...


In [366]:
# remove unnecessary words
# remove publication from title
publication_unique_values = data_frame["publication"].unique()
for publication in publication_unique_values:
    data_frame["title"] = data_frame['title'].str.replace("." + publication + "$", "", regex=True)

data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,day,year,month,content
0,0,house republicans fret about winning their hea...,new york times,carl hulse,31,2016,12,washington congressional republicans have a ne...
1,1,rift between officers and residents as killing...,new york times,benjamin mueller and al baker,19,2017,6,after the bullet shells get counted the blood ...
2,2,tyrus wong bambi artist thwarted by racial bia...,new york times,margalit fox,6,2017,1,when walt disney s bambi opened in 1942 critic...
3,3,among deaths in 2016 a heavy toll in pop music...,new york times,william mcdonald,10,2017,4,death may be the great equalizer but it isn t ...
4,4,kim jong un says north korea is preparing to t...,new york times,choe sang hun,2,2017,1,seoul south korea north korea s leader kim sai...


## 6- Apply Tokenization
### a- Apply sentence tokenization

In [367]:
# def sentence_tokenizer(text):
#     return sent_tokenize(text)
#
# columns = ["title", "content"]
# for column in columns:
#     data_frame[column] = data_frame[column].apply(lambda x: sentence_tokenizer(x))
#
# data_frame[['title', 'content']]

### b- Apply word tokenization

In [382]:
# def word_tokenizer(sentences):
#     tokenized_words = []
#     for x in sentences:
#         tokenized_words = tokenized_words + word_tokenize(x)
#     return tokenized_words

def word_tokenizer(text):
    return word_tokenize(text)
columns = ["title", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: word_tokenizer(x))

data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,author,day,year,month,content,publication_new york times
0,0,"[h, u, p, u, b, c, f, w, g, h, h, c, u]",0,31,2016,12,"[w, h, g, c, g, p, u, b, c, w, f, c, m, h, h, ...",1
1,1,"[f, f, f, c, d, k, g, p, u, h, b, x]",1,19,2017,6,"[b, u, h, g, c, u, d, b, d, d, v, v, c, d, b, ...",1
2,2,"[y, u, w, g, b, m, b, h, w, d, c, b, d, 0]",2,6,2017,1,"[w, d, y, b, m, b, p, d, 9, c, c, p, d, p, h, ...",1
3,3,"[m, g, d, h, 0, h, v, y, p, p, m, u, c]",3,10,2017,4,"[d, h, m, y, g, u, c, y, v, h, d, d, f, d, d, ...",1
4,4,"[k, m, j, g, u, y, h, k, p, p, g, g, g, m]",4,2,2017,1,"[u, u, h, k, h, k, d, k, m, d, u, d, y, c, u, ...",1


## 7- Remove Stop Words

In [369]:
STOPWORDS = set(stopwords.words("english"))
def remove_stop_words(text):
    return " ".join([word for word in text if word not in STOPWORDS])

columns = ["title", "author", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: remove_stop_words(x))

## 8- Remove frequently words

In [370]:
word_counter_title = Counter()
word_counter_content = Counter()
def get_most_frequently_words(column_name, word_counter):
    for text in data_frame[column_name]:
        for word in text:
            word_counter[word] += 1
    return word_counter

# get the most common frequently words in [title] more than 1000
word_frequency_title = get_most_frequently_words("title", word_counter_title).most_common(10)

# get the most common frequently words in [content]
word_frequency_content = get_most_frequently_words("content", word_counter_content).most_common(10)

In [371]:
FREQUENT_WORDS_TITLE = set(word for (word, word_count) in get_most_frequently_words("title", word_counter_title).most_common(10))
FREQUENT_WORDS_CONTENT = set(word for (word, word_count) in get_most_frequently_words("content", word_counter_content).most_common(10))

def remove_frequent_words(string_text, frequent_words):
    return " ".join([word for word in string_text if word not in frequent_words])

columns = ["title", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: remove_frequent_words(x, FREQUENT_WORDS_TITLE))

## 9- Remove rare words

In [372]:
RARE_WORDS_TITLE = set(word for (word, word_count) in get_most_frequently_words("title", word_counter_title).most_common()[:-10:-1])
RARE_WORDS_CONTENT = set(word for (word, word_count) in get_most_frequently_words("content", word_counter_content).most_common()[:-10:-1])

def remove_rare_words(string_text, rare_words):
    return " ".join([word for word in string_text if word not in rare_words])

columns = ["title", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: remove_rare_words(x, RARE_WORDS_TITLE))

## 10- Apply Stemming

In [373]:
porter_stemmer_title = PorterStemmer()
porter_stemmer_content = PorterStemmer()
def stemming_words(text, porter_stemmer):
    return " ".join([porter_stemmer.stem(word) for word in text])

columns = ["title", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: stemming_words(x, porter_stemmer_title))

## 11- Apply POS and Lemmatization

In [374]:
lemmatizer_title = WordNetLemmatizer()
lemmatizer_content = WordNetLemmatizer()
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text, lemmatizer):
    # apply POS tagging
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0],  wordnet.NOUN)) for word, pos in pos_text])

columns = ["title", "content"]
for column in columns:
    data_frame[column] = data_frame[column].apply(lambda x: lemmatize_words(x, lemmatizer_title))

# Data Preprocessing
## 1- Dealing with categorical data (Nominal Data)

In [375]:
# get number of unique values in publication column
publication_unique_values = data_frame["publication"].unique()
print("Publication unique values = ", len(publication_unique_values))

Publication unique values =  1


In [376]:
# apply one hot encoding in publication column
data_frame = pd.get_dummies(data=data_frame, columns=["publication"])
data_frame.dtypes

Unnamed: 0                     int64
title                         object
author                        object
day                           string
year                          string
month                         string
content                       object
publication_new york times     uint8
dtype: object

In [377]:
# get number of unique values in author column
author_unique_values = Counter()
for text in data_frame["author"]:
    for author_name in re.split('and|,', text):
        author_unique_values[author_name] += 1

print("Author unique values = ", len(author_unique_values))

Author unique values =  87


In [378]:
def split_authors(text):
    return ','.join([word for word in re.split('and|,', text)])


data_frame["author"] = data_frame["author"].apply(lambda x: split_authors(x))
print(author_unique_values["benjamin mueller"], "other", author_unique_values["al baker"])

0 other 0


In [379]:
# apply mapping in author
mapping_author = {}
unique_authors = data_frame["author"].str.lower().unique()
for index, author in enumerate(unique_authors):
    mapping_author[author] = index

data_frame = data_frame.replace({
    "author": mapping_author
})
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,author,day,year,month,content,publication_new york times
0,0,h u p u b c f w g h h c u,0,31,2016,12,w h g c g p u b c w f c m h h c w u b m d m m ...,1
1,1,f f f c d k g p u h b x,1,19,2017,6,b u h g c u d b d d v v c d b u p p p w d w c ...,1
2,2,y u w g b m b h w d c b d 0,2,6,2017,1,w d y b m b p d 9 c c p d p h u g v u y v y d ...,1
3,3,m g d h 0 h v y p p m u c,3,10,2017,4,d h m y g u c y v h d d f d d v u f f d m 0 c ...,1
4,4,k m j g u y h k p p g g g m,4,2,2017,1,u u h k h k d k m d u d y c u y m k g f p p c ...,1


In [87]:
# Get percentage of number of rows contain missing values
mask = data_frame.isnull().any(axis=1)
rows_with_missing_values = mask.sum() / len(data_frame)*100
print("Number of rows contain null values =", rows_with_missing_values, "%")

Number of rows contain null values = 2.0 %


In [88]:
# Drop rows contain null values
data_frame.dropna(inplace=True)
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0     0.0
id             0.0
title          0.0
publication    0.0
author         0.0
date           0.0
year           0.0
month          0.0
content        0.0
dtype: float64

## 2- Handel columns data types

In [89]:
data_frame.dtypes

Unnamed: 0       int64
id               int64
title           object
publication     object
author          object
date            object
year           float64
month          float64
content         object
dtype: object

In [90]:
data_frame= data_frame.astype({"title": "string",
                               "publication": "string",
                               "author": "string",
                               "date": "datetime64",
                               "year": "int64",
                               "month": "int64",
                               "content": "string"})

data_frame.dtypes

Unnamed: 0              int64
id                      int64
title                  string
publication            string
author                 string
date           datetime64[ns]
year                    int64
month                   int64
content                string
dtype: object

## 3- Dealing with unnecessary columns

In [91]:
# Drop id column because it contain unnecessary unique values
data_frame.drop(columns=['id'], inplace=True)
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,date,year,month,content
0,0,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016,12,WASHINGTON — Congressional Republicans have...
1,1,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017,6,"After the bullet shells get counted, the blood..."
2,2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017,1,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017,4,"Death may be the great equalizer, but it isn’t..."
4,4,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017,1,"SEOUL, South Korea — North Korea’s leader, ..."


In [92]:
# extract day from date column and rename column to day
data_frame['date'] = data_frame['date'].astype('string')
data_frame['date'] = data_frame['date'].str.replace("-", "/")
data_frame[["year", "month", "day"]] = data_frame["date"].str.split("/", expand = True)
data_frame['date'] = data_frame['day']
data_frame.drop(columns=["day"], inplace=True)
data_frame.rename(columns={"date": "day"}, inplace=True)
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,day,year,month,content
0,0,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,31,2016,12,WASHINGTON — Congressional Republicans have...
1,1,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,19,2017,6,"After the bullet shells get counted, the blood..."
2,2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,6,2017,1,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,10,2017,4,"Death may be the great equalizer, but it isn’t..."
4,4,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2,2017,1,"SEOUL, South Korea — North Korea’s leader, ..."


## 4- Dealing with categorical data (Nominal Data)

In [93]:
# get number of unique values in publication column
data_frame["publication"] = data_frame["publication"].str.lower()
publication_unique_values = data_frame["publication"].unique()
print("Publication unique values = ", len(publication_unique_values))

Publication unique values =  1


In [94]:
# get number of unique values in author column
data_frame["author"] = data_frame["author"].str.lower()
print("Author unique values = ", len(data_frame["author"].unique()))

Author unique values =  87


In [95]:
# apply one hot encoding in publication column
data_frame = pd.get_dummies(data=data_frame, columns=["publication"])
data_frame.dtypes

Unnamed: 0                     int64
title                         string
author                        string
day                           string
year                          string
month                         string
content                       string
publication_new york times     uint8
dtype: object

In [96]:
# apply mapping in author
mapping_author = {}
unique_authors = data_frame["author"].str.lower().unique()
for index, author in enumerate(unique_authors):
    mapping_author[author] = index

data_frame = data_frame.replace({
    "author" : mapping_author
})
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,author,day,year,month,content,publication_new york times
0,0,House Republicans Fret About Winning Their Hea...,0,31,2016,12,WASHINGTON — Congressional Republicans have...,1
1,1,Rift Between Officers and Residents as Killing...,1,19,2017,6,"After the bullet shells get counted, the blood...",1
2,2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",2,6,2017,1,"When Walt Disney’s “Bambi” opened in 1942, cri...",1
3,3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",3,10,2017,4,"Death may be the great equalizer, but it isn’t...",1
4,4,Kim Jong-un Says North Korea Is Preparing to T...,4,2,2017,1,"SEOUL, South Korea — North Korea’s leader, ...",1


# Text Preprocessing NLP Pipeline
## 1- Convert to lowercase

In [97]:
data_frame["title"] = data_frame["title"].str.lower()
data_frame["content"] = data_frame["content"].str.lower()
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,author,day,year,month,content,publication_new york times
0,0,house republicans fret about winning their hea...,0,31,2016,12,washington — congressional republicans have...,1
1,1,rift between officers and residents as killing...,1,19,2017,6,"after the bullet shells get counted, the blood...",1
2,2,"tyrus wong, ‘bambi’ artist thwarted by racial ...",2,6,2017,1,"when walt disney’s “bambi” opened in 1942, cri...",1
3,3,"among deaths in 2016, a heavy toll in pop musi...",3,10,2017,4,"death may be the great equalizer, but it isn’t...",1
4,4,kim jong-un says north korea is preparing to t...,4,2,2017,1,"seoul, south korea — north korea’s leader, ...",1


## 2- Apply Tokenization
### a- Apply sentence tokenization

In [98]:
# def sentence_tokenizer(text):
#     return sent_tokenize(text)
#
# data_frame["content"] = data_frame["content"].apply(lambda x: sentence_tokenizer(x))
#
# data_frame["content"]

### b- Apply word tokenization

In [99]:
# def word_tokenizer(sentences):
#     tokenized_words = []
#     for x in sentences:
#         tokenized_words = tokenized_words + word_tokenize(x)
#     return tokenized_words
#
# data_frame["content"] = data_frame["content"].apply(lambda x: word_tokenizer(x))

## 3- Remove Punctuations and Special Characters

In [100]:
data_frame["title"] = data_frame['title'].str.replace('[^a-zA-Z0-9]', " ", regex=True)

data_frame["content"] = data_frame['content'].str.replace('[^a-zA-Z0-9]', " ", regex=True)

data_frame[['title', 'content']]

Unnamed: 0,title,content
0,house republicans fret about winning their hea...,washington congressional republicans have...
1,rift between officers and residents as killing...,after the bullet shells get counted the blood...
2,tyrus wong bambi artist thwarted by racial ...,when walt disney s bambi opened in 1942 cri...
3,among deaths in 2016 a heavy toll in pop musi...,death may be the great equalizer but it isn t...
4,kim jong un says north korea is preparing to t...,seoul south korea north korea s leader ...
...,...,...
95,corzine reaches 5 million settlement with reg...,after more than five years of investigations a...
96,sears agrees to sell craftsman to stanley blac...,the question from the analyst on thursday was ...
97,l i r r train that crashed was going over twi...,a long island rail road train that crashed in ...
98,record 2016 for u s auto industry long road b...,detroit unexpectedly strong sales of new ...


## 4- Remove unnecessary spaces and words

In [101]:
# remove unnecessary spaces
data_frame["title"] = data_frame['title'].str.replace('\s\s+', "", regex=True)

data_frame["content"] = data_frame['content'].str.replace('\s\s+', " ", regex=True)

data_frame[['title', 'content']]

Unnamed: 0,title,content
0,house republicans fret about winning their hea...,washington congressional republicans have a ne...
1,rift between officers and residents as killing...,after the bullet shells get counted the blood ...
2,tyrus wongbambiartist thwarted by racial biasd...,when walt disney s bambi opened in 1942 critic...
3,among deaths in 2016a heavy toll in pop musict...,death may be the great equalizer but it isn t ...
4,kim jong un says north korea is preparing to t...,seoul south korea north korea s leader kim sai...
...,...,...
95,corzine reaches5 million settlement with regul...,after more than five years of investigations a...
96,sears agrees to sell craftsman to stanley blac...,the question from the analyst on thursday was ...
97,l i r rtrain that crashed was going over twice...,a long island rail road train that crashed in ...
98,record 2016 for u sauto industry long road bac...,detroit unexpectedly strong sales of new vehic...


In [102]:
# remove unnecessary words
# remove publication from title
for publication in publication_unique_values:
    data_frame["title"] = data_frame['title'].str.replace("." + publication + "$", "", regex=True)

data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,author,day,year,month,content,publication_new york times
0,0,house republicans fret about winning their hea...,0,31,2016,12,washington congressional republicans have a ne...,1
1,1,rift between officers and residents as killing...,1,19,2017,6,after the bullet shells get counted the blood ...,1
2,2,tyrus wongbambiartist thwarted by racial biasd...,2,6,2017,1,when walt disney s bambi opened in 1942 critic...,1
3,3,among deaths in 2016a heavy toll in pop musicthe,3,10,2017,4,death may be the great equalizer but it isn t ...,1
4,4,kim jong un says north korea is preparing to t...,4,2,2017,1,seoul south korea north korea s leader kim sai...,1


## 5- Remove Stop Words

In [103]:
STOPWORDS = set(stopwords.words("english"))
def remove_stop_words(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

data_frame["title"] = data_frame["title"].apply(lambda x: remove_stop_words(x))

data_frame["content"] = data_frame["content"].apply(lambda x: remove_stop_words(x))

## 6- Remove frequently words

In [104]:
word_counter_title = Counter()
word_counter_content = Counter()
def get_most_frequently_words(column_name, word_counter):
    for text in data_frame[column_name]:
        for word in text.split():
            word_counter[word] += 1
    return word_counter

# get the most common frequently words in [title] more than 1000
word_frequency_title = get_most_frequently_words("title", word_counter_title).most_common(10)

# get the most common frequently words in [content]
word_frequency_content = get_most_frequently_words("content", word_counter_content).most_common(10)

In [105]:
FREQUENT_WORDS_TITLE = set(word for (word, word_count) in get_most_frequently_words("title", word_counter_title).most_common(10))

FREQUENT_WORDS_CONTENT = set(word for (word, word_count) in get_most_frequently_words("content", word_counter_content).most_common(10))

def remove_frequent_words(string_text, frequent_words):
    return " ".join([word for word in string_text.split() if word not in frequent_words])

data_frame["title"] = data_frame["title"].apply(lambda x: remove_frequent_words(x, FREQUENT_WORDS_TITLE))

data_frame["content"] = data_frame["content"].apply(lambda x: remove_frequent_words(x, FREQUENT_WORDS_CONTENT))

## 7- Remove rare words

In [106]:
RARE_WORDS_TITLE = set(word for (word, word_count) in get_most_frequently_words("title", word_counter_title).most_common()[:-10:-1])

RARE_WORDS_CONTENT = set(word for (word, word_count) in get_most_frequently_words("content", word_counter_content).most_common()[:-10:-1])

def remove_rare_words(string_text, rare_words):
    return " ".join([word for word in string_text.split() if word not in rare_words])

data_frame["title"] = data_frame["title"].apply(lambda x: remove_rare_words(x, RARE_WORDS_TITLE))

data_frame["content"] = data_frame["content"].apply(lambda x: remove_rare_words(x, RARE_WORDS_CONTENT))

## 8- Apply Stemming

In [107]:
porter_stemmer_title = PorterStemmer()
porter_stemmer_content = PorterStemmer()
def stemming_words(text, porter_stemmer):
    return " ".join([porter_stemmer.stem(word) for word in text.split()])

data_frame["title"] = data_frame["title"].apply(lambda x: stemming_words(x, porter_stemmer_title))

data_frame["content"] = data_frame["content"].apply(lambda x: stemming_words(x, porter_stemmer_content))

## 9- Apply POS and Lemmatization

In [108]:
lemmatizer_title = WordNetLemmatizer()
lemmatizer_content = WordNetLemmatizer()
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text, lemmatizer):
    # apply POS tagging
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0],  wordnet.NOUN)) for word, pos in pos_text])

data_frame["title"] = data_frame["title"].apply(lambda x: lemmatize_words(x, lemmatizer_title))

data_frame["content"] = data_frame["content"].apply(lambda x: lemmatize_words(x, lemmatizer_content))