# Imports

In [30]:
import pandas as pd

# Data Preprocessing
## 1- Handel Missing values

In [31]:
data_frame = pd.read_csv("../Dataset/articles.csv")
data_frame.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


### a- Handel columns with missing values

In [32]:
# Get total number of nulls in each column
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0       0.000
id               0.000
title            0.000
publication      0.000
author          12.612
date             0.000
year             0.000
month            0.000
url            100.000
content          0.000
dtype: float64

In [33]:
# Remove url column because it's empty(100% null values)
data_frame.drop(columns=['url'], inplace=True)
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0      0.000
id              0.000
title           0.000
publication     0.000
author         12.612
date            0.000
year            0.000
month           0.000
content         0.000
dtype: float64

### b- Handel rows with missing values

In [34]:
# Get percentage of number of rows contain missing values
mask = data_frame.isnull().any(axis=1)
rows_with_missing_values = mask.sum() / len(data_frame)*100
print("Number of rows contain null values =", rows_with_missing_values, "%")

Number of rows contain null values = 12.612000000000002 %


In [35]:
# Drop rows contain null values
data_frame.dropna(inplace=True)
data_frame.isnull().sum() / len(data_frame) * 100

Unnamed: 0     0.0
id             0.0
title          0.0
publication    0.0
author         0.0
date           0.0
year           0.0
month          0.0
content        0.0
dtype: float64

## 2- Handel columns data types

In [36]:
data_frame.dtypes

Unnamed: 0       int64
id               int64
title           object
publication     object
author          object
date            object
year           float64
month          float64
content         object
dtype: object

In [37]:
data_frame= data_frame.astype({"title": "string",
                               "publication": "string",
                               "author": "string",
                               "date": "datetime64",
                               "year": "int64",
                               "month": "int64",
                               "content": "string"})

data_frame.dtypes

Unnamed: 0              int64
id                      int64
title                  string
publication            string
author                 string
date           datetime64[ns]
year                    int64
month                   int64
content                string
dtype: object

## 3- Dealing with unnecessary columns

In [38]:
# Drop id column because it contain unnecessary unique values
data_frame.drop(columns=['id'], inplace=True)
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,date,year,month,content
0,0,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016,12,WASHINGTON — Congressional Republicans have...
1,1,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017,6,"After the bullet shells get counted, the blood..."
2,2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017,1,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017,4,"Death may be the great equalizer, but it isn’t..."
4,4,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017,1,"SEOUL, South Korea — North Korea’s leader, ..."


In [39]:
# extract day from date column and rename column to day
data_frame['date'] = data_frame['date'].astype('string')
data_frame['date'] = data_frame['date'].str.replace("-", "/")
data_frame[["year", "month", "day"]] = data_frame["date"].str.split("/", expand = True)
data_frame['date'] = data_frame['day']
data_frame.drop(columns=["day"], inplace=True)
data_frame.rename(columns={"date": "day"}, inplace=True)
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,publication,author,day,year,month,content
0,0,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,31,2016,12,WASHINGTON — Congressional Republicans have...
1,1,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,19,2017,6,"After the bullet shells get counted, the blood..."
2,2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,6,2017,1,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,10,2017,4,"Death may be the great equalizer, but it isn’t..."
4,4,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2,2017,1,"SEOUL, South Korea — North Korea’s leader, ..."


## 4- Dealing with categorical data (Nominal Data)

In [40]:
# get number of unique values in publication column
data_frame["publication"] = data_frame["publication"].str.lower()
print("Publication unique values = ", len(data_frame["publication"].unique()))

Publication unique values =  5


In [41]:
# get number of unique values in author column
data_frame["author"] = data_frame["author"].str.lower()
print("Author unique values = ", len(data_frame["author"].unique()))

Author unique values =  3600


In [42]:
# apply one hot encoding in publication column
data_frame = pd.get_dummies(data=data_frame, columns=["publication"])
data_frame.dtypes

Unnamed: 0                       int64
title                           string
author                          string
day                             string
year                            string
month                           string
content                         string
publication_atlantic             uint8
publication_breitbart            uint8
publication_business insider     uint8
publication_cnn                  uint8
publication_new york times       uint8
dtype: object

In [43]:
# apply mapping in author
mapping_author = {}
unique_authors = data_frame["author"].str.lower().unique()
for index, author in enumerate(unique_authors):
    mapping_author[author] = index

data_frame = data_frame.replace({
    "author" : mapping_author
})
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,author,day,year,month,content,publication_atlantic,publication_breitbart,publication_business insider,publication_cnn,publication_new york times
0,0,House Republicans Fret About Winning Their Hea...,0,31,2016,12,WASHINGTON — Congressional Republicans have...,0,0,0,0,1
1,1,Rift Between Officers and Residents as Killing...,1,19,2017,6,"After the bullet shells get counted, the blood...",0,0,0,0,1
2,2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",2,6,2017,1,"When Walt Disney’s “Bambi” opened in 1942, cri...",0,0,0,0,1
3,3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",3,10,2017,4,"Death may be the great equalizer, but it isn’t...",0,0,0,0,1
4,4,Kim Jong-un Says North Korea Is Preparing to T...,4,2,2017,1,"SEOUL, South Korea — North Korea’s leader, ...",0,0,0,0,1


# Text Preprocessing NLP Pipeline

## 1- convert to lowercase

In [44]:
data_frame["title"] = data_frame["title"].str.lower()
data_frame["content"] = data_frame["content"].str.lower()
data_frame.head()

Unnamed: 0.1,Unnamed: 0,title,author,day,year,month,content,publication_atlantic,publication_breitbart,publication_business insider,publication_cnn,publication_new york times
0,0,house republicans fret about winning their hea...,0,31,2016,12,washington — congressional republicans have...,0,0,0,0,1
1,1,rift between officers and residents as killing...,1,19,2017,6,"after the bullet shells get counted, the blood...",0,0,0,0,1
2,2,"tyrus wong, ‘bambi’ artist thwarted by racial ...",2,6,2017,1,"when walt disney’s “bambi” opened in 1942, cri...",0,0,0,0,1
3,3,"among deaths in 2016, a heavy toll in pop musi...",3,10,2017,4,"death may be the great equalizer, but it isn’t...",0,0,0,0,1
4,4,kim jong-un says north korea is preparing to t...,4,2,2017,1,"seoul, south korea — north korea’s leader, ...",0,0,0,0,1


## 2- Remove Punctuations