# Data Exploration and General Preprocessing

## Libraries and Settings

In [None]:
import ast
from bs4 import BeautifulSoup
import collections
from collections import Counter
import html
import matplotlib.pyplot as plt
import numpy as np
import os
from os.path import join, exists
import pandas as pd
import regex as re

# Widen pandas' printed output and remove the row limit so complete frames and
# value_counts() results are shown.
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', None)
# Close the current figure (if any) and switch matplotlib to interactive mode.
plt.close()
plt.interactive(True)

## Directories

In [None]:
# Work relative to the project root; all data paths below are built from data_dir.
# NOTE(review): the absolute path is machine-specific — adjust before running elsewhere.
os.chdir('/Users/M/Google_Drive/Thesis/Topic-Modeling')
data_dir = 'Data/Technology-Data/'

## Load Data

In [None]:
# Read the raw, semicolon-separated article export; its first column becomes the index.
csv_filename = join(data_dir, 'raw', 'raw.csv')
news = pd.read_csv(
    csv_filename,
    sep=';',
    index_col=0,
)

## Explore & Pre-process

### Missing values

In [None]:
# (rows, columns) of the raw data set.
print(news.shape)

In [None]:
# Number of missing values per column, largest first.
missing_per_column = news.isna().sum()
print(missing_per_column.sort_values(ascending=False))

### Duplicates

In [None]:
# True when no value in the 'id' column occurs more than once.
print('Every article has a unique ID?', news['id'].is_unique)

In [None]:
# Count rows that are identical once the identifying columns are ignored.
content_columns = news.drop(['filename', 'id'], axis=1)
print('Duplicates: ', content_columns.duplicated().sum())

### Publication

In [None]:
# Number of articles per value of the 'publication' column.
print(news['publication'].value_counts())

### Date Variables / Exploration & New Variables

There are multiple date variables available.
Except for newspaperEditionDate, they initially even include a timestamp.
The number of articles across the years in the selected time horizon fluctuates a lot and needs to be explored.

In [None]:
# The date columns that are parsed and compared in the following cells.
date_cols = ['webPublicationDate', 'newspaperEditionDate', 'firstPublicationDate']

In [None]:
# Parse every date column, then keep only the calendar date (time of day dropped)
# and store it as a datetime column again.
for col in date_cols:
    parsed = pd.to_datetime(news[col])
    news[col] = pd.to_datetime(parsed.dt.date)

In [None]:
# Missing values per date column after parsing.
print(news[date_cols].isna().sum())

#### webPublicationDate

In [None]:
# Articles per year of web publication, in chronological order.
web_pub_years = news['webPublicationDate'].dt.year
web_pub_years.value_counts().sort_index()

There are a few days where larger numbers of articles have been published:

In [None]:
# The five web publication dates with the most articles.
articles_per_day = news['webPublicationDate'].value_counts()
articles_per_day.iloc[:5]

In [None]:
# Histogram of web publication dates (the trailing ';' suppresses the cell's text output).
news['webPublicationDate'].hist();

Most articles were published on a Thursday (Monday=0,Sunday=6):

In [None]:
# Articles per weekday of web publication, ordered by weekday number.
weekday_of_publication = news['webPublicationDate'].dt.dayofweek
weekday_of_publication.value_counts().sort_index()

In comparison to newspaperEditionDate: most articles occur on the same or the following day. But there are a few much larger differences:

In [None]:
# Days between web publication and the newspaper edition (negative when print came first).
edition_lag = news['newspaperEditionDate'] - news['webPublicationDate']
news['from_WebPub_until_Newspaper'] = edition_lag.dt.days

In [None]:
# The five most frequent day differences.
news['from_WebPub_until_Newspaper'].value_counts().head(5)

In [None]:
# The five least frequent day differences.
news['from_WebPub_until_Newspaper'].value_counts().tail(5)

#### firstPublicationDate:
Missing values in firstPublicationDate cannot be traced back to a single publication format:

In [None]:
# Share of articles without a firstPublicationDate, printed as an actual percentage:
# the label says '% NA', so the fraction is scaled to 0-100 instead of 0-1.
na_share = news['firstPublicationDate'].isna().mean()
print('% NA:', round(100 * na_share, 2))

In [None]:
# Publication counts for the rows that DO have a firstPublicationDate.
# NOTE(review): the surrounding text talks about missing values; if the rows without a
# date were meant, the mask would need to be isna() rather than ~isna() — confirm intent.
print(news.loc[~news['firstPublicationDate'].isna(), 'publication'].value_counts())

firstPublicationDate has apparently only been recorded for articles published later than 2011:

In [None]:
# Articles per year of first publication, in chronological order.
first_pub_year = news['firstPublicationDate'].dt.year
print(first_pub_year.value_counts().sort_index())

In [None]:
# Count rows whose firstPublicationDate lies after the webPublicationDate.
later_first_pub = news['firstPublicationDate'] > news['webPublicationDate']
print('There are %d cases where the firstPublicationDate has date values later than the webPublicationDate: \n' % (
    later_first_pub.sum()))

#### Earliest Date:

In most cases, the earliest date is webPublicationDate (which also has the least missing values).

In [None]:
# For every article take the earliest of the three dates and remember which
# column it came from.
candidate_dates = news[['webPublicationDate', 'newspaperEditionDate', 'firstPublicationDate']]
news['min_date'] = candidate_dates.min(axis=1, skipna=True)
news['min_date_type'] = candidate_dates.idxmin(axis=1, skipna=True)
print(news['min_date_type'].value_counts())

In [None]:
# Histogram of the earliest available date per article.
news['min_date'].hist()

#### Final Timestamp
Use the year of the earliest date.

In [None]:
# The year of the earliest date is the timestamp used from here on.
news['year'] = news['min_date'].dt.year

In [None]:
# Articles per year, in chronological order.
news['year'].value_counts().sort_index()

### charCount & wordCount

In [None]:
# The five most frequent charCount values.
char_count_frequencies = news['charCount'].value_counts()
char_count_frequencies.head(5)

In [None]:
# Rows with charCount 0 are dropped only if none of them actually carries a bodyText.
zero_chars_with_body = (news['charCount'] == 0) & news['bodyText'].notna()
if not zero_chars_with_body.any():
    print('There are no documents where charCount is 0 but bodyText contains a value. Remove those documents')
    news = news[news['charCount'] != 0]

In [None]:
# Distribution of charCount in 100 bins (the trailing ';' suppresses the cell's text output).
news['charCount'].hist(bins=100);

In [None]:
# Inspect the articles with fewer than 10 characters.
news.loc[news['charCount']<10, ['wordcount','headline','bodyText']]

In [None]:
# Inspect the last few bodies of articles with fewer than 5 words.
news.loc[news['wordcount']<5, 'bodyText'].tail()

In [None]:
# Keep only articles with at least 10 words.
long_enough = news['wordcount'].ge(10)
news = news.loc[long_enough]

### Text Exploration & Creation of Text Variable

#### bodyText

Mark bodyText of actually empty articles as np.nan and remove duplicates

In [None]:
# Treat empty / whitespace-only bodies as missing.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form mutates an intermediate Series, which pandas
# warns about and which no longer updates `news` under Copy-on-Write.
news['bodyText'] = news['bodyText'].replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Number of articles without a bodyText.
print(news['bodyText'].isna().sum())

In [None]:
# Keep one row per distinct bodyText, preferring the earliest year.
# A stable sort keeps rows with the same year in their previous order, so which
# duplicate counts as "first" is reproducible (the default sort algorithm is not stable).
# NOTE(review): drop_duplicates treats missing bodyText values as equal to each other,
# so at most one row without bodyText survives — confirm this is intended.
news = news.sort_values(by='year', kind='stable')
news = news.drop_duplicates(subset='bodyText', keep='first')

#### headline

In [None]:
# Treat empty / whitespace-only headlines as missing.
# Assigned back rather than replaced "in place" on the column selection, because the
# in-place form acts on an intermediate Series and does not update `news` under
# Copy-on-Write.
news['headline'] = news['headline'].replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Number of articles without a headline.
print(news['headline'].isna().sum())

There are a few headlines that occur very often. They will be removed.

In [None]:
# The ten most frequent headlines.
headline_frequencies = news['headline'].value_counts()
headline_frequencies.head(10)

In [None]:
# Blank out every headline that occurs more than once.
# The counts are looked up with map() instead of indexing value_counts() by the column:
# value_counts() excludes missing headlines, so label-indexing it with a column that
# contains NaN raises a KeyError. With map(), missing headlines simply get a missing
# count, and `NaN > 1` is False.
headline_counts = news['headline'].map(news['headline'].value_counts())
news.loc[headline_counts > 1, 'headline'] = np.nan

Specifically exclude articles that were contributions from readers (Feedback):

In [None]:
# Most frequent headlines among the reader contributions; missing headlines never match.
reader_letters = news['headline'].str.contains("week's letters|Feedback:|Letters and blogs", na=False)
news.loc[reader_letters, 'headline'].value_counts().head(5)

In [None]:
# Drop the articles whose headline marks them as reader contributions.
is_reader_contribution = news['headline'].str.contains("week's letters|Feedback:|Letters and blogs", na=False)
news = news.loc[~is_reader_contribution]

#### trailText

trailText is the lead paragraph of an article. Some are empty and some contain recurring texts that do not seem relevant for the content of an article.

In [None]:
# Strip HTML tags from the lead paragraph. regex=True is required: since pandas 2.0
# Series.str.replace treats the pattern as a literal string by default, in which case
# no tag would ever be removed.
news['trailText'] = news['trailText'].str.replace(r'<[^<]+?>', '', regex=True)
# Treat empty / whitespace-only lead paragraphs as missing (assigned back instead of a
# chained inplace call, which does not update `news` under Copy-on-Write).
news['trailText'] = news['trailText'].replace(r'^\s*$', np.nan, regex=True)

Recurring trailTexts (they do not seem to be relevant to the content of the article):

In [None]:
# The three most frequent lead paragraphs.
trail_frequencies = news['trailText'].value_counts()
print(trail_frequencies.head(3))

In [None]:
# Blank out every lead paragraph that occurs more than once.
# value_counts() ignores missing values and isin() never matches them, so missing
# trailTexts stay missing without first being filled with a placeholder. This also
# avoids the chained fillna(..., inplace=True), which does not update `news` under
# Copy-on-Write.
trail_counts = news['trailText'].value_counts()
recurring_trails = trail_counts[trail_counts > 1].index
news.loc[news['trailText'].isin(recurring_trails), 'trailText'] = np.nan

In [None]:
# Number of articles without a usable lead paragraph.
news.trailText.isna().sum()

Analyse common beginnings of the lead paragraph (arbitrary choice of the first 3 words).
Usually, it is the names of authors. They will be removed later.

In [None]:
# String representation of the first three whitespace-separated words of every
# non-missing lead paragraph.
present_trails = news['trailText'].dropna()
trailtext_starters = present_trails.map(lambda trail: str(trail.split()[:3])).tolist()

In [None]:
# Frequency of every three-word beginning.
trailtext_counter = Counter(trailtext_starters)

In [None]:
# The five most frequent beginnings.
trailtext_counter.most_common(5)

#### New Variable: text

Analyse where trailText is already included in bodyText. Most of the time this happens at the beginning of bodyText.

In [None]:
# Among rows that have both texts: how often does bodyText contain the trailText?
body_and_trail = news[['bodyText', 'trailText']].dropna()
trail_contained = body_and_trail.apply(lambda row: row.trailText in row.bodyText, axis=1)
print(trail_contained.value_counts())

In [None]:
# Among rows that have both texts: how often does bodyText begin with the trailText?
body_and_trail = news[['bodyText', 'trailText']].dropna()
starts_with_trail = body_and_trail.apply(lambda row: row.bodyText.startswith(row.trailText), axis=1)
print(starts_with_trail.value_counts())

In [None]:
# Replace missing body / lead texts with empty strings, then flag the rows whose
# body already begins with the lead paragraph (an empty lead always counts as included).
text_parts = news[['bodyText', 'trailText']].fillna('')
news.update(text_parts)
news['trail_in_body'] = [
    body.startswith(str(trail))
    for body, trail in zip(news['bodyText'], news['trailText'])
]

In [None]:
# text = bodyText where the lead is already part of the body,
# otherwise the lead paragraph followed by '. ' and the body.
lead_included = news['trail_in_body']
lead_missing = ~lead_included
news.loc[lead_included, 'text'] = news.loc[lead_included, 'bodyText']
news.loc[lead_missing, 'text'] = (
    news.loc[lead_missing, 'trailText'] + '. ' + news.loc[lead_missing, 'bodyText']
)

Analyse where the headline is already placed at the beginning of the current column "text":

In [None]:
# Flag the rows whose text already begins with the headline (both compared as strings).
news['header_in_text'] = [
    str(text).startswith(str(headline))
    for headline, text in zip(news['headline'], news['text'])
]
print(news['header_in_text'].value_counts())

In [None]:
# Two examples where the text already starts with the headline.
news.loc[news['header_in_text'],['headline','text']].tail(2)

Add the headline to the text:

In [None]:
# Prepend the headline to the text where it is not already there.
# Headlines may be missing (blank or recurring headlines were set to NaN earlier), and
# NaN + '. ' + text evaluates to NaN, which would discard the whole article text.
# The headline is therefore only prepended where one actually exists; all other rows
# keep their text unchanged.
headline = news['headline'].fillna('')
needs_headline = ~news['header_in_text'] & headline.ne('')
news.loc[needs_headline, 'text'] = headline[needs_headline] + '. ' + news.loc[needs_headline, 'text']

#### Remove names of authors

In [None]:
# Each non-missing 'author' cell is parsed as a Python literal (a collection of names);
# author_set is the list of distinct names across all articles.
authors = [ast.literal_eval(raw_authors) for raw_authors in news['author'].dropna()]
author_set = list({name for names in authors for name in names})

In [None]:
# A few example author names.
author_set[:5]

In [None]:
# Number of distinct author names.
len(author_set)

In [None]:
# Remove every author name from the texts.
# - regex=True is required: since pandas 2.0 Series.str.replace matches the pattern
#   literally by default, so the '|'-joined alternation would never match anything.
# - Names are escaped so characters such as '.' or '(' are matched literally.
# - Longer names come first so a name is not cut short by a shorter name that is its prefix.
# - Empty names are dropped and an empty name list is skipped, because an empty
#   pattern matches at every position.
author_names = sorted({name for name in author_set if name}, key=len, reverse=True)
if author_names:
    author_pattern = '|'.join(re.escape(name) for name in author_names)
    news['text'] = news['text'].str.replace(author_pattern, '', regex=True)

#### HTML unescape

In [None]:
# Demonstrate html.unescape on a string containing HTML entities.
print('Example: ', html.unescape("Cr&eacute;dit Suisse // relationship with AT&amp;T, the telecommunications group // £ $"))

In [None]:
# Cast every text to str and decode HTML entities.
text_as_str = news['text'].astype(str)
news['text'] = text_as_str.map(html.unescape)

#### Remove hyperlink strings

In [None]:
# Split every text into whitespace-separated tokens (each cell becomes a list).
news['text'] = news['text'].str.split()
# Substrings that mark a token as a hyperlink or e-mail address.
hyperlink_recognizers = ['@', '.net', '.org', 'www', 'http', '.com', 'co.uk', 'gov.uk']

In [None]:
def _strip_hyperlinks(tokens):
    """Join *tokens* back into one string with link-like tokens removed.

    A token counts as a link when it contains any entry of
    ``hyperlink_recognizers``. Link tokens that mention "guardian" are replaced
    by ``'guardian_link_placeholder'``; all other link tokens are dropped.
    """
    kept = []
    for token in tokens:
        if not any(marker in token for marker in hyperlink_recognizers):
            kept.append(token)
        elif "guardian" in token:
            kept.append('guardian_link_placeholder')
    return " ".join(kept)


news.reset_index(inplace=True)
# One column-wise assignment replaces the row-by-row loop: Series.iteritems() was
# removed in pandas 2.0, and writing each row through .loc is slow.
news['text'] = news['text'].apply(_strip_hyperlinks)

#### Remove common sentences from The Guardian
They usually occur at the end of an article. Remove them if they occur more than once.

In [None]:
# Collect every sentence-like fragment that mentions one of the identifiers below
# (compared case-insensitively). The list of identifiers previously contained
# 'technology newsbucket' twice; the duplicate had no effect and is removed.
guardian_sentence_identifiers = ['guardian', 'technology newsbucket', 'tech weekly', 'free delicious service', 'for more information, go to ']
# Fragments are delimited by '.', '?', '·', '•' or ':'; compile the splitter once.
sentence_splitter = re.compile('[.?·•:]')
guardian_sentences = []
for article in news['text']:
    for sentence in sentence_splitter.split(article):
        lowered = sentence.lower()  # lower-case once per fragment, not once per identifier
        if any(identifier in lowered for identifier in guardian_sentence_identifiers):
            guardian_sentences.append(sentence)

In [None]:
# The five most frequent collected fragments.
Counter(guardian_sentences).most_common(5)

In [None]:
# The three least frequent collected fragments.
Counter(guardian_sentences).most_common()[-3:]

In [None]:
def remove_element(lst, threshold):
    """Return the items of *lst* that occur at least *threshold* times.

    Order and multiplicity of the surviving items are preserved; items whose
    total count in *lst* is below *threshold* are dropped entirely.

    Args:
        lst: Iterable of hashable items (consumed twice, so pass a sequence).
        threshold: Minimum number of occurrences an item needs to be kept.

    Returns:
        A new list; *lst* itself is not modified.
    """
    counts = Counter(lst)
    # A direct count lookup replaces the former membership test against a list of
    # rare items, which made the function quadratic in the number of distinct items.
    return [item for item in lst if counts[item] >= threshold]

# Keep only the fragments that were collected at least twice and blank them out
# of every text.
guardian_sentences = remove_element(guardian_sentences, 2)
# Longest first, so a fragment is not partially consumed by a shorter one it contains.
guardian_sentences_set = sorted(set(guardian_sentences), key=len, reverse=True)
if guardian_sentences_set:
    # regex=True is required: since pandas 2.0 Series.str.replace matches literally by
    # default, so the escaped alternation would never match. The emptiness guard avoids
    # an empty pattern, which would match at every position.
    boilerplate_pattern = '|'.join(re.escape(s) for s in guardian_sentences_set)
    news['text'] = news['text'].str.replace(boilerplate_pattern, ' ', regex=True)

#### Lengths of Texts

In [None]:
# Number of whitespace-separated tokens per text.
news['textLength'] = news['text'].str.split().str.len()

In [None]:
# Plot text length against the row index.
news['textLength'].plot(title='textLength')

## Save

In [None]:
# Target directory (created if necessary) and file for the preprocessed data.
save_path = join(data_dir, 'processed', 'preprocessed')
os.makedirs(save_path, exist_ok=True)
filename = join(save_path, 'preprocessed.csv')

In [None]:
# Expose the earliest date as 'date', renumber the rows from 0 and write the
# columns needed downstream as a semicolon-separated file.
news = news.rename(columns={'min_date': 'date'}).reset_index(drop=True)
export_columns = ['id', 'year', 'date', 'author', 'text']
news[export_columns].to_csv(filename, sep=';')