# Imports

In [51]:
import pandas as pd
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import re

# Data Reading

In [52]:
data_frame = pd.read_csv("../Dataset/articles.csv")
data_frame.head()

# Decrease Number of rows
# data_frame = data_frame.head(100)

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


# Data Cleaning
## 1- Handel Missing values
### a- Handel columns with missing values

In [53]:
# Get total number of nulls in each column
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0       0.000
id               0.000
title            0.000
publication      0.000
author          12.612
date             0.000
year             0.000
month            0.000
url            100.000
content          0.000
dtype: float64

In [54]:
# Remove url column because it's empty(100% null values)
data_frame.drop(columns=['url'], inplace=True)
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0      0.000
id              0.000
title           0.000
publication     0.000
author         12.612
date            0.000
year            0.000
month           0.000
content         0.000
dtype: float64

### b- Handel rows with missing values

In [55]:
# Get percentage of number of rows contain missing values
mask = data_frame.isnull().any(axis=1)
rows_with_missing_values = mask.sum() / len(data_frame)*100
print("Number of rows contain null values =", rows_with_missing_values, "%")

Number of rows contain null values = 12.612000000000002 %


In [56]:
# Drop rows contain null values
data_frame.dropna(inplace=True)
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0     0.0
id             0.0
title          0.0
publication    0.0
author         0.0
date           0.0
year           0.0
month          0.0
content        0.0
dtype: float64

## 2- Handel columns data types

In [57]:
data_frame.dtypes

Unnamed: 0       int64
id               int64
title           object
publication     object
author          object
date            object
year           float64
month          float64
content         object
dtype: object

In [58]:
data_frame= data_frame.astype({"title": "string",
                               "publication": "string",
                               "author": "string",
                               "date": "datetime64",
                               "year": "int64",
                               "month": "int64",
                               "content": "string"})

data_frame.dtypes

Unnamed: 0              int64
id                      int64
title                  string
publication            string
author                 string
date           datetime64[ns]
year                    int64
month                   int64
content                string
dtype: object

# Text Preprocessing NLP Pipeline
## 1- Convert to lowercase

In [59]:
data_frame["title"] = data_frame["title"].str.lower()
data_frame["content"] = data_frame["content"].str.lower()
data_frame.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,content
0,0,17283,house republicans fret about winning their hea...,New York Times,Carl Hulse,2016-12-31,2016,12,washington — congressional republicans have...
1,1,17284,rift between officers and residents as killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017,6,"after the bullet shells get counted, the blood..."
2,2,17285,"tyrus wong, ‘bambi’ artist thwarted by racial ...",New York Times,Margalit Fox,2017-01-06,2017,1,"when walt disney’s “bambi” opened in 1942, cri..."
3,3,17286,"among deaths in 2016, a heavy toll in pop musi...",New York Times,William McDonald,2017-04-10,2017,4,"death may be the great equalizer, but it isn’t..."
4,4,17287,kim jong-un says north korea is preparing to t...,New York Times,Choe Sang-Hun,2017-01-02,2017,1,"seoul, south korea — north korea’s leader, ..."


## 2- Apply Tokenization
### a- Apply sentence tokenization

In [60]:
# def sentence_tokenizer(text):
#     return sent_tokenize(text)
#
# data_frame["content"] = data_frame["content"].apply(lambda x: sentence_tokenizer(x))
#
# data_frame["content"]

### b- Apply word tokenization

In [61]:
# def word_tokenizer(sentences):
#     tokenized_words = []
#     for x in sentences:
#         tokenized_words = tokenized_words + word_tokenize(x)
#     return tokenized_words
#
# data_frame["content"] = data_frame["content"].apply(lambda x: word_tokenizer(x))

## 3- Remove HTML tags

In [62]:
def remove_html_tags(text):
    return re.sub(r'<.*?>', "", text)

data_frame["title"] = data_frame["title"].apply(lambda x: remove_html_tags(x))

data_frame["content"] = data_frame["content"].apply(lambda x: remove_html_tags(x))



## 4- Remove Punctuations and Special Characters

In [63]:
data_frame["title"] = data_frame['title'].str.replace('[^a-zA-Z0-9]', " ", regex=True)

data_frame["content"] = data_frame['content'].str.replace('[^a-zA-Z0-9]', " ", regex=True)

data_frame[['title', 'content']]

Unnamed: 0,title,content
0,house republicans fret about winning their hea...,washington congressional republicans have...
1,rift between officers and residents as killing...,after the bullet shells get counted the blood...
2,tyrus wong bambi artist thwarted by racial ...,when walt disney s bambi opened in 1942 cri...
3,among deaths in 2016 a heavy toll in pop musi...,death may be the great equalizer but it isn t...
4,kim jong un says north korea is preparing to t...,seoul south korea north korea s leader ...
...,...,...
49995,rex tillerson says climate change is real but,as chairman and ceo of exxonmobil rex tillers...
49996,the biggest intelligence questions raised by t...,i ve spent nearly 20 years looking at intellig...
49997,trump announces plan that does little to resol...,donald trump will not be taking necessary st...
49998,dozens of for profit colleges could soon close,dozens of colleges could be forced to close ...


## 4- Remove unnecessary spaces and words

In [64]:
# remove unnecessary spaces
data_frame["title"] = data_frame['title'].str.replace('\s\s+', "", regex=True)

data_frame["content"] = data_frame['content'].str.replace('\s\s+', " ", regex=True)

data_frame[['title', 'content']]

Unnamed: 0,title,content
0,house republicans fret about winning their hea...,washington congressional republicans have a ne...
1,rift between officers and residents as killing...,after the bullet shells get counted the blood ...
2,tyrus wongbambiartist thwarted by racial biasd...,when walt disney s bambi opened in 1942 critic...
3,among deaths in 2016a heavy toll in pop musict...,death may be the great equalizer but it isn t ...
4,kim jong un says north korea is preparing to t...,seoul south korea north korea s leader kim sai...
...,...,...
49995,rex tillerson says climate change is realbut,as chairman and ceo of exxonmobil rex tillerso...
49996,the biggest intelligence questions raised by t...,i ve spent nearly 20 years looking at intellig...
49997,trump announces plan that does little to resol...,donald trump will not be taking necessary ste...
49998,dozens of for profit colleges could soon close,dozens of colleges could be forced to close in...


In [65]:
# remove unnecessary words
# remove publication from title
publication_unique_values = data_frame["publication"].unique()
for publication in publication_unique_values:
    data_frame["title"] = data_frame['title'].str.replace("." + publication + "$", "", regex=True)

data_frame.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,content
0,0,17283,house republicans fret about winning their hea...,New York Times,Carl Hulse,2016-12-31,2016,12,washington congressional republicans have a ne...
1,1,17284,rift between officers and residents as killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017,6,after the bullet shells get counted the blood ...
2,2,17285,tyrus wongbambiartist thwarted by racial biasd...,New York Times,Margalit Fox,2017-01-06,2017,1,when walt disney s bambi opened in 1942 critic...
3,3,17286,among deaths in 2016a heavy toll in pop musict...,New York Times,William McDonald,2017-04-10,2017,4,death may be the great equalizer but it isn t ...
4,4,17287,kim jong un says north korea is preparing to t...,New York Times,Choe Sang-Hun,2017-01-02,2017,1,seoul south korea north korea s leader kim sai...


## 5- Remove Stop Words

In [66]:
STOPWORDS = set(stopwords.words("english"))
def remove_stop_words(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

data_frame["title"] = data_frame["title"].apply(lambda x: remove_stop_words(x))

data_frame["content"] = data_frame["content"].apply(lambda x: remove_stop_words(x))

## 6- Remove frequently words

In [67]:
word_counter_title = Counter()
word_counter_content = Counter()
def get_most_frequently_words(column_name, word_counter):
    for text in data_frame[column_name]:
        for word in text.split():
            word_counter[word] += 1
    return word_counter

# get the most common frequently words in [title] more than 1000
word_frequency_title = get_most_frequently_words("title", word_counter_title).most_common(10)

# get the most common frequently words in [content]
word_frequency_content = get_most_frequently_words("content", word_counter_content).most_common(10)

In [68]:
FREQUENT_WORDS_TITLE = set(word for (word, word_count) in get_most_frequently_words("title", word_counter_title).most_common(10))
FREQUENT_WORDS_CONTENT = set(word for (word, word_count) in get_most_frequently_words("content", word_counter_content).most_common(10))

def remove_frequent_words(string_text, frequent_words):
    return " ".join([word for word in string_text.split() if word not in frequent_words])

data_frame["title"] = data_frame["title"].apply(lambda x: remove_frequent_words(x, FREQUENT_WORDS_TITLE))

data_frame["content"] = data_frame["content"].apply(lambda x: remove_frequent_words(x, FREQUENT_WORDS_CONTENT))

## 7- Remove rare words

In [None]:
RARE_WORDS_TITLE = set(word for (word, word_count) in get_most_frequently_words("title", word_counter_title).most_common()[:-10:-1])

RARE_WORDS_CONTENT = set(word for (word, word_count) in get_most_frequently_words("content", word_counter_content).most_common()[:-10:-1])

def remove_rare_words(string_text, rare_words):
    return " ".join([word for word in string_text.split() if word not in rare_words])

data_frame["title"] = data_frame["title"].apply(lambda x: remove_rare_words(x, RARE_WORDS_TITLE))

data_frame["content"] = data_frame["content"].apply(lambda x: remove_rare_words(x, RARE_WORDS_CONTENT))

## 8- Apply Stemming

In [None]:
porter_stemmer_title = PorterStemmer()
porter_stemmer_content = PorterStemmer()
def stemming_words(text, porter_stemmer):
    return " ".join([porter_stemmer.stem(word) for word in text.split()])

data_frame["title"] = data_frame["title"].apply(lambda x: stemming_words(x, porter_stemmer_title))

data_frame["content"] = data_frame["content"].apply(lambda x: stemming_words(x, porter_stemmer_content))

## 9- Apply POS and Lemmatization

In [None]:
lemmatizer_title = WordNetLemmatizer()
lemmatizer_content = WordNetLemmatizer()
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text, lemmatizer):
    # apply POS tagging
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0],  wordnet.NOUN)) for word, pos in pos_text])

data_frame["title"] = data_frame["title"].apply(lambda x: lemmatize_words(x, lemmatizer_title))

data_frame["content"] = data_frame["content"].apply(lambda x: lemmatize_words(x, lemmatizer_content))

In [None]:
# Drop id column because it contain unnecessary unique values
data_frame.drop(columns=['id'], inplace=True)
data_frame.head()

In [None]:
# extract day from date column and rename column to day
data_frame['date'] = data_frame['date'].astype('string')
data_frame['date'] = data_frame['date'].str.replace("-", "/")
data_frame[["year", "month", "day"]] = data_frame["date"].str.split("/", expand = True)
data_frame['date'] = data_frame['day']
data_frame.drop(columns=["day"], inplace=True)
data_frame.rename(columns={"date": "day"}, inplace=True)
data_frame.head()

## 2- Dealing with categorical data (Nominal Data)

In [None]:
# get number of unique values in publication column
publication_unique_values = data_frame["publication"].unique()
print("Publication unique values = ", len(publication_unique_values))

In [None]:
# apply one hot encoding in publication column
data_frame = pd.get_dummies(data=data_frame, columns=["publication"])
data_frame.dtypes

In [None]:
# get number of unique values in author column
author_unique_values = Counter()
for text in data_frame["author"]:
        for author_name in re.split('and|,',text):
            author_unique_values[author_name] += 1

print("Author unique values = ", len(author_unique_values))
print(author_unique_values)

In [None]:
def split_authors(text):
    return ','.join([word for word in re.split('and|,',text)])

data_frame["author"] = data_frame["author"].apply(lambda x: split_authors(x))

data_frame["author"]

print(author_unique_values["benjamin mueller"], "other" , author_unique_values["al baker"])

In [None]:
# apply mapping in author
mapping_author = {}
unique_authors = data_frame["author"].str.lower().unique()
for index, author in enumerate(unique_authors):
    mapping_author[author] = index

data_frame = data_frame.replace({
    "author" : mapping_author
})
data_frame.head()