# Exploration and Preprocessing of Texts

### Libraries

In [None]:
from collections import Counter
import html
import matplotlib.pyplot as plt
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
import numpy as np
import os
from os.path import join
import pandas as pd
import pickle
import re
from scipy import sparse
from scipy.io import savemat, loadmat
from sklearn.feature_extraction.text import CountVectorizer
import string
import time
import unidecode
from wordcloud import WordCloud

### Directories

In [None]:
# NOTE(review): machine-specific absolute path; the relative paths used below
# ('Data/...', 'Results/') are resolved against this working directory.
os.chdir('/Users/M/Google_Drive/Thesis/Topic-Modeling')

In [None]:
# Input/output locations, relative to the working directory set above.
# data_dir keeps its trailing slash because it is concatenated with a file
# name when the input CSV is read.
data_dir = 'Data/Technology-Data/processed/preprocessed/'
results_dir = 'Results/'

### Options

In [None]:
# Widen the pandas console output so long text columns are readable.
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 100)
# Close any open figure and switch matplotlib to interactive mode.
plt.close()
plt.interactive(True)

In [None]:
# Pipeline switches.
# NOTE(review): a function with the same name `remove_stopwords` is defined in
# the 'Stop words' section further down, which rebinds this name.
remove_stopwords = True
# When True, every text is cut to a fixed number of tokens before saving.
truncate_texts = False

### Read Data

In [None]:
# Load the preprocessed articles (semicolon-separated, first column is the index).
news = pd.read_csv(data_dir + 'preprocessed.csv', sep=";", index_col=0)
news['year'] = news['year'].astype('int64')
# Keep an untouched copy of the text; 'text' is rewritten step by step below.
news['text_orig'] = news['text'].copy()

In [None]:
# Peek at the last two rows to check the load.
news.tail(2)

### Read stop words
stops.txt is publicly available, https://github.com/adjidieng/DETM/blob/master/scripts/stops.txt

Because the texts are not lowercased and some punctuation is kept, the publicly available stop-word list is extended with a few abbreviations and with capitalized and all-uppercase variants of every stop word.

In [None]:
# Read the stop-word list, one word per line.
# Blank lines (including the empty string a trailing newline would produce)
# are skipped and surrounding whitespace is stripped, because an empty entry
# would break the first-letter capitalisation applied to every stop word below.
# The encoding is fixed so the result does not depend on the platform locale.
with open('Data/Infos/stops.txt', 'r', encoding='utf-8') as f:
    stops = [line.strip() for line in f if line.strip()]

In [None]:
# Corpus-specific additions to the public stop-word list.
stops.extend(['mr', 'mrs', 'guardian', 'e.g.', 'i.e.', 'ad_hoc'])

In [None]:
# Number of stop words before the cased variants are added.
len(stops)

In [None]:
# Build cased variants of every stop word: first letter capitalised
# (e.g. 'the' -> 'The') and fully upper-cased ('THE'). The texts are not
# lower-cased, so these variants have to be listed explicitly.
stops_upper = []
for s in stops:
    if not s:
        # An empty entry has no first character to capitalise; skip it
        # instead of raising IndexError on s[0].
        continue
    stops_upper.append(s[0].upper() + s[1:])
    stops_upper.append(s.upper())

In [None]:
# Final list: cased variants followed by the original entries.
stops = stops_upper + stops
# '*' is the placeholder that the cleaning steps below insert for removed
# elements; listing it here lets the stop-word filters exclude it as well.
stops.append('*')

In [None]:
# Number of stop words after adding the cased variants and the placeholder.
len(stops)

### Unicode

In [None]:
# Preview how unidecode transliterates accented letters and currency symbols.
print('Example: ', unidecode.unidecode("Crédit Suisse. François. $ £ ₤ ￡ € SFr")) #articles[10]

In [None]:
# Replace currency markers by the placeholder ' * ' before transliteration.
# Each symbol is escaped because '$' is a regex metacharacter (end-of-string
# anchor) and would otherwise never match a literal dollar sign. regex=True is
# passed explicitly: since pandas 2.0, Series.str.replace treats the pattern as
# a literal string by default, which would turn the alternation into a no-op.
monetary_pattern = '|'.join(re.escape(symbol) for symbol in ["£", "￡", "€", "$", "₤", "SFr"])
news['text'] = news['text'].str.replace(monetary_pattern, ' * ', regex=True)

In [None]:
def unidecode_text(text):
    """Transliterate ``text`` to ASCII token by token.

    The text is split on whitespace, each token is passed through
    ``unidecode.unidecode`` and the results are re-joined with single spaces
    (so runs of whitespace collapse to one space).
    """
    return " ".join(unidecode.unidecode(token) for token in text.split())

In [None]:
# Quick manual check of unidecode_text on a sample string.
unidecode_text("Crédit Suisse. François. $8 £ ₤ ￡ € SFr")

In [None]:
# Transliterate every article to ASCII.
news['text'] = news['text'].apply(unidecode_text)

### Contractions

In [None]:
# Normalise backticks and typographic quotes to a plain apostrophe so the
# contraction patterns below only need to handle "'".
# A character class with regex=True makes the intent explicit; without it the
# '|'-joined alternation is only interpreted as a regex on pandas versions
# whose Series.str.replace default is regex=True (changed to False in 2.0).
apostrophes = "`" + "’" + "‘"
news['text'] = news['text'].str.replace('[' + re.escape(apostrophes) + ']', "'", regex=True)

In [None]:
# Expand English contractions, case-insensitively.
# Order matters: the specific forms ("won't", "can't") must be rewritten
# before the generic "n't" rule, and possessive "'s" before plural "s'".
# regex=True is required: the patterns are regular expressions, and pandas
# raises a ValueError when `case` is given together with regex=False (the
# default since pandas 2.0).
contraction_rules = [
    (r"won\'t", "will not"),
    (r"can\'t", "can not"),
    (r"n\'t", " not"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'m", " am"),
    (r"\'s", ""),
    (r"s\'", "s"),
]
for contraction_pattern, expansion in contraction_rules:
    news['text'] = news['text'].str.replace(contraction_pattern, expansion, case=False, regex=True)

### Punctuation

In [None]:
# Character class of punctuation that is replaced by the placeholder.
# '-', '+', '&' and '.' are kept because they occur inside meaningful tokens
# (e.g. e-commerce, C++, R&D, A.I.); curly double quotes are added.
# The characters are escaped before being placed between '[' and ']':
# unescaped, the '\' in string.punctuation acts as an escape for the following
# ']' (so the backslash itself is not part of the class) and the bare '['
# triggers a "possible nested set" warning.
punctuation_to_replace = ''.join(c for c in string.punctuation if c not in '-+&.') + '“' + '”'
to_whitespace = '[{}]'.format(re.escape(punctuation_to_replace))
print(to_whitespace)

In [None]:
# Replace the selected punctuation by the placeholder ' * '.
# to_whitespace is a regex character class, so regex mode is requested
# explicitly (Series.str.replace defaults to literal matching since pandas 2.0).
news['text'] = news['text'].str.replace(to_whitespace, ' * ', regex=True)

For now, only remove full stops at the beginning and end of words and collapse multiple consecutive full stops into one (e.g. A.I. in fastText).

In [None]:
# Collapse runs of consecutive full stops (e.g. '...') into a single '.'.
news['text'] = news['text'].str.replace(r'\.+', '.', regex=True)

In [None]:
def remove_stops(text):
    """Replace full stops at the start or end of each token by the placeholder.

    Despite the name, this handles full stops ('.'), not stop words. The text
    is split on whitespace; a leading '.' of a token becomes '* ' and a
    trailing '.' becomes ' *'. Full stops inside a token (e.g. 'A.I') are kept.
    Tokens are re-joined with single spaces.
    """
    tokens = text.split()
    # Raw strings: '\.' in a normal string literal is an invalid escape sequence.
    tokens = [re.sub(r'^\.', '* ', w) for w in tokens]
    tokens = [re.sub(r'\.$', ' *', w) for w in tokens]
    return " ".join(tokens)

In [None]:
# Quick manual check of remove_stops on a sample string.
remove_stops('.hey the.re. how are you?')

In [None]:
# Replace leading/trailing full stops of every token by the placeholder.
news['text'] = news['text'].apply(remove_stops)

### Numbers
- e.g., 10,000bn, 1.5m, £2

In [None]:
# Scratch check: does the string start with a non-letter character?
re.match("[^a-zA-Z]","1.m")

In [None]:
# Scratch check: a purely numeric string contains no alphabetic character.
any(i.isalpha() for i in '13')

In [None]:
# Pattern for a token that is nothing but a quantity: optional currency
# symbol(s), digits with optional '.'/',' separators, an optional trailing
# 'p' and an optional magnitude or unit suffix.
# The dollar sign is escaped: a bare '$' inside the currency group is the
# end-of-string anchor and would never match a literal '$'.
_NUMERIC_TOKEN = re.compile(
    r'^(£|￡|€|\$)*(\d*)(\.|\,)*(\d+)(p)*'
    r'(m|million|bn|billion|trillion|mph|km|mps|mbps|GB|GHz|TB|kg|g|ft|in)*$'
)


def is_numeric(input_string):
    """Return a match object if ``input_string`` is a numeric token, else ``None``.

    Examples that match: '10,000bn', '1.5m', '£2', '$5m', '3GB'.
    """
    return _NUMERIC_TOKEN.match(input_string)


def remove_nonalpha(text):
    """Replace numeric tokens and tokens without any letter by the placeholder '*'.

    The text is split on whitespace. A token is replaced when it matches
    ``is_numeric`` (e.g. '33m') or contains no alphabetic character at all
    (e.g. '2020', '--'). Mixed tokens such as 'g8' or '3d' are kept.
    Tokens are re-joined with single spaces.
    """
    tokens = ['*' if is_numeric(w) else w for w in text.split()]
    tokens = [w if any(ch.isalpha() for ch in w) else '*' for w in tokens]
    return " ".join(tokens)

In [None]:
# Quick manual check of remove_nonalpha on a sample string.
remove_nonalpha('Hey 2.3bn people and 33m dollars 1trillion 3GB and 11.44kg')

In [None]:
# Replace numeric and letter-free tokens by the placeholder in every article.
news['text'] = news['text'].apply(remove_nonalpha)

### Explore & and . within tokens, e.g. A.I.

In [None]:
# Tokens longer than one character that contain '&', over the whole corpus,
# and the five most frequent of them.
and_words = [
    token
    for token in news['text'].str.cat(sep=' ').split()
    if len(token) > 1 and '&' in token
]
Counter(and_words).most_common(5)

In [None]:
# Tokens longer than one character that contain '.', over the whole corpus,
# and the five most frequent of them.
period_words = [
    token
    for token in news['text'].str.cat(sep=' ').split()
    if len(token) > 1 and '.' in token
]
Counter(period_words).most_common(5)

### Explore Mixed Strings (digit + alpha)
Check out words that mix digits and alphabetic characters, e.g. ftse500, mi5, g8, 3d, 4g, ps2, xbox360, ...

In [None]:
def is_mixed(input_string):
    """Return True if ``input_string`` contains at least one digit and one letter."""
    if not any(ch.isdigit() for ch in input_string):
        return False
    return any(ch.isalpha() for ch in input_string)
# Tokens mixing digits and letters, over the whole corpus, and the five most
# frequent of them.
mixed_words = [
    token
    for token in news['text'].str.cat(sep=' ').split()
    if is_mixed(token)
]
Counter(mixed_words).most_common(5)

### Only keep selected punctuation, a-Z and numbers within words

In [None]:
def contains_punct(input_string):
    """Return True if any character of ``input_string`` is in ``string.punctuation``."""
    for ch in input_string:
        if ch in string.punctuation:
            return True
    return False
# Tokens longer than one character that still contain ASCII punctuation,
# over the whole corpus, and the five most frequent of them.
punct_words = [
    token
    for token in news['text'].str.cat(sep=' ').split()
    if len(token) > 1 and contains_punct(token)
]
Counter(punct_words).most_common(5)

In [None]:
# Replace every character other than space, '.', '-', '+', '&', ASCII letters
# and digits by the placeholder ' * '.
# The pattern is a raw string: '\.', '\-' and '\+' are invalid escape
# sequences in a normal string literal.
news['text'] = news['text'].str.replace(r'[^ \.\-\+&a-zA-Z0-9]', ' * ', regex=True)
# The replacement can expose new leading/trailing full stops; clean them again.
news['text'] = news['text'].apply(remove_stops)

### Explore Words containing -

The following words actually contain a lot of information, e.g. e-commerce, Wi-Fi, self-driving. Some might be found again by bigram analysis if hyphens were replaced by ' '. Not all of them are that common; however, the rare ones will later be discarded by using min_df.

In [None]:
def remove_hyphens(text):
    """Replace hyphens at the start or end of each token by the placeholder.

    The text is split on whitespace; a trailing '-' of a token becomes ' *'
    and a leading '-' becomes '* '. Hyphens inside a token (e.g. 'e-commerce')
    are kept. Tokens are re-joined with single spaces.
    """
    # Raw strings: '\-' in a normal string literal is an invalid escape sequence.
    tokens = [re.sub(r'\-$', ' *', w) for w in text.split()]
    tokens = [re.sub(r'^\-', '* ', w) for w in tokens]
    return " ".join(tokens)

In [None]:
# Replace leading/trailing hyphens of every token by the placeholder.
news['text'] = news['text'].apply(remove_hyphens)

In [None]:
# Tokens containing a hyphen, over the whole corpus, and the five most
# frequent of them.
hyphen_words = [
    token
    for token in news['text'].str.cat(sep=' ').split()
    if "-" in token
]
Counter(hyphen_words).most_common(5)

### Explore words containing +

In [None]:
# Tokens containing a plus sign, over the whole corpus, and the five most
# frequent of them.
plus_words = [
    token
    for token in news['text'].str.cat(sep=' ').split()
    if "+" in token
]
Counter(plus_words).most_common(5)

### Bigrams and Trigrams:

To have more control over the constructed bigrams and trigrams, bigrams are first constructed and marked as such. In a next iteration, trigrams are constructed based on previously constructed bigrams. To avoid the construction of collocations of words that did not originally occur together, preceding pre-processing steps replaced removed elements by an arbitrarily chosen symbol, which could then be excluded from collocation construction.

#### Bigrams

Apply a word filter based on the list of stopwords that has already been created:

In [None]:
# Collect bigram candidates from the whitespace-tokenised documents and keep
# only those occurring at least 50 times.
bigram_finder = BigramCollocationFinder.from_documents(list(news['text'].str.split()))
bigram_finder.apply_freq_filter(50)
# Drop every bigram containing a stop word (this includes the '*' placeholder).
# The filter is evaluated for each word of each candidate, so membership is
# tested against a frozenset instead of scanning the `stops` list every time.
_bigram_stop_lookup = frozenset(stops)
bigram_finder.apply_word_filter(lambda w: w in _bigram_stop_lookup)

In [None]:
# Score the remaining bigrams under five association measures; each result
# is the list returned by score_ngrams for that measure.
_bigram_measures = (
    BigramAssocMeasures.raw_freq,
    BigramAssocMeasures.likelihood_ratio,
    BigramAssocMeasures.chi_sq,
    BigramAssocMeasures.pmi,
    BigramAssocMeasures.student_t,
)
(
    bi_freq_scores,
    bi_likelihood_scores,
    bi_chisq_scores,
    bi_pmi_scores,
    bi_studentt_scores,
) = [bigram_finder.score_ngrams(measure) for measure in _bigram_measures]

In [None]:
# Turn each scored (word pair, score) entry into a space-joined bigram string,
# keeping the order of the corresponding score list.
bigrams_freq = [' '.join(pair) for pair, _ in bi_freq_scores]
bigrams_like = [' '.join(pair) for pair, _ in bi_likelihood_scores]
bigrams_chisq = [' '.join(pair) for pair, _ in bi_chisq_scores]
bigrams_pmi = [' '.join(pair) for pair, _ in bi_pmi_scores]
bigrams_studentt = [' '.join(pair) for pair, _ in bi_studentt_scores]

In [None]:
# Side-by-side table: row i holds the i-th bigram of each measure's list.
_bigram_rows = list(zip(bigrams_freq, bigrams_like, bigrams_chisq, bigrams_pmi, bigrams_studentt))
_bigram_columns = ['Frequency', 'Likelihood', 'Chi-Square', 'PMI', 'Student-t']
bigrams_overview = pd.DataFrame(_bigram_rows, columns=_bigram_columns)

In [None]:
# Compare the top bigrams of the different measures.
bigrams_overview.head()

In [None]:
# Select the bigrams to merge: take the first 2001 entries of the PMI list,
# drop the manually excluded 'km Eco', and cap the result at 2000.
bigrams = [b for b in bigrams_pmi[:2001] if b != 'km Eco'][:2000]

In [None]:
# Spot check: selected bigrams containing 'United'.
[b for b in bigrams if 'United' in b]

In [None]:
# Spot check: selected bigrams containing 'Zuckerberg'.
[b for b in bigrams if 'Zuckerberg' in b]

In [None]:
# Spot check: selected bigrams containing 'iPod'.
[b for b in bigrams if 'iPod' in b]

In [None]:
# Spot check: selected bigrams containing 'Eco'.
[b for b in bigrams if 'Eco' in b]

In [None]:
# Merged form of every bigram: spaces become underscores and the token is
# wrapped in underscores, e.g. 'social media' -> '_social_media_'.
bigrams_replacements = ['_{}_'.format(b.replace(' ', '_')) for b in bigrams]
# Lookup from space-padded bigram to space-padded merged token; the padding
# makes the plain substring replacement match whole tokens only.
bigrams_replacer = {
    ' {} '.format(b): ' {} '.format(merged)
    for b, merged in zip(bigrams, bigrams_replacements)
}

In [None]:
# Show one mapping; this raises KeyError if 'social media' was not selected.
print('Example: "social media" ->', bigrams_replacer[' social media '])

In [None]:
def replace_bigrams(text):
    """Replace every selected bigram in ``text`` by its merged token.

    The replacements in ``bigrams_replacer`` are applied one after another in
    the dict's insertion order, so an earlier replacement can prevent a later
    one from matching. Each key carries its own leading and trailing space and
    ``str.replace`` matches do not overlap, so of two back-to-back occurrences
    of the same bigram sharing a single space only the first is rewritten.
    """
    for padded_bigram in bigrams_replacer:
        text = text.replace(padded_bigram, bigrams_replacer[padded_bigram])
    return text

In [None]:
# Pad every text with a space on both sides so bigrams at the very start or
# end match the space-padded keys, then merge the selected bigrams.
news['text'] = ' ' + news['text'].astype('str') + ' '
news['text'] = news['text'].apply(replace_bigrams)


#### Trigrams

In [None]:
# Second pass over the texts in which the selected bigrams are already merged
# into single tokens: a pair made of a merged bigram and a plain word is a
# trigram of the original text. Keep pairs occurring at least 50 times.
trigram_finder = BigramCollocationFinder.from_documents(list(news['text'].str.split()))
trigram_finder.apply_freq_filter(50)
# Drop every pair containing a stop word (this includes the '*' placeholder).
# The filter is evaluated for each word of each candidate, so membership is
# tested against a frozenset instead of scanning the `stops` list every time.
_trigram_stop_lookup = frozenset(stops)
trigram_finder.apply_word_filter(lambda w: w in _trigram_stop_lookup)

In [None]:
# Score the remaining pairs under five association measures; each result is
# the list returned by score_ngrams for that measure.
_trigram_measures = (
    BigramAssocMeasures.raw_freq,
    BigramAssocMeasures.likelihood_ratio,
    BigramAssocMeasures.chi_sq,
    BigramAssocMeasures.pmi,
    BigramAssocMeasures.student_t,
)
(
    tri_freq_scores,
    tri_likelihood_scores,
    tri_chisq_scores,
    tri_pmi_scores,
    tri_studentt_scores,
) = [trigram_finder.score_ngrams(measure) for measure in _trigram_measures]

In [None]:
# Keep only pairs whose joined text contains exactly three underscores, i.e.
# one merged bigram ('_a_b_') next to one plain word, in the order of the
# corresponding score list.
trigrams_freq = [
    joined for joined in (' '.join(pair) for pair, _ in tri_freq_scores)
    if joined.count('_') == 3
]
trigrams_like = [
    joined for joined in (' '.join(pair) for pair, _ in tri_likelihood_scores)
    if joined.count('_') == 3
]
trigrams_chisq = [
    joined for joined in (' '.join(pair) for pair, _ in tri_chisq_scores)
    if joined.count('_') == 3
]
trigrams_pmi = [
    joined for joined in (' '.join(pair) for pair, _ in tri_pmi_scores)
    if joined.count('_') == 3
]
trigrams_studentt = [
    joined for joined in (' '.join(pair) for pair, _ in tri_studentt_scores)
    if joined.count('_') == 3
]

In [None]:
# Side-by-side table: row i holds the i-th trigram of each measure's list.
_trigram_rows = list(zip(trigrams_freq, trigrams_like, trigrams_chisq, trigrams_pmi, trigrams_studentt))
_trigram_columns = ['Frequency', 'Likelihood', 'Chi-Square', 'PMI', 'Student-t']
trigrams_overview = pd.DataFrame(_trigram_rows, columns=_trigram_columns)

In [None]:
# Compare the top trigrams of the different measures.
trigrams_overview.head(5)

In [None]:
# Select the trigrams to merge: take the first 101 entries of the PMI list,
# drop entries containing the manually excluded 'km _Eco_rating_', cap the
# result at 100 and pad each with spaces so replacement matches whole tokens.
trigrams = [" " + t + " " for t in trigrams_pmi[:101] if 'km _Eco_rating_' not in t][:100]

In [None]:
# Merged form of every trigram: all inner spaces are removed so the merged
# bigram and the plain word become one token, which is then space-padded.
trigrams_replacements = [' {} '.format(''.join(t.split(' '))) for t in trigrams]
# Lookup from space-padded trigram to space-padded merged token.
trigrams_replacer = {t: merged for t, merged in zip(trigrams, trigrams_replacements)}

In [None]:
def replace_trigrams(text):
    """Replace every selected trigram in ``text`` by its merged token.

    The replacements in ``trigrams_replacer`` are applied one after another in
    the dict's insertion order (the PMI ranking of the selected trigrams), so
    an earlier replacement can prevent a later one from matching.
    """
    for padded_trigram in trigrams_replacer:
        text = text.replace(padded_trigram, trigrams_replacer[padded_trigram])
    return text

In [None]:
# Merge the selected trigrams in every article.
news['text'] = news['text'].apply(replace_trigrams)

In [None]:
# Strip the marker underscores at token boundaries ('_a_b_' -> 'a_b') while
# keeping the inner underscores that join the collocation.
# Both patterns are plain substrings; regex=False states that explicitly so
# the result does not depend on the pandas version's default.
news['text'] = news['text'].str.replace('_ ', ' ', regex=False)
news['text'] = news['text'].str.replace(' _', ' ', regex=False)

### Stop words

In [None]:
# The Options section binds `remove_stopwords` to a boolean switch, but the
# function defined below reuses that name. Testing the name after the `def`
# would always be truthy, so the switch is captured first. When this cell is
# re-run the name already refers to the function; the previously captured
# value is then kept (defaulting to True if there is none).
# NOTE(review): renaming either the switch or the function would remove the
# need for this capture.
_stopword_switch = globals().get('remove_stopwords', True)
if not callable(_stopword_switch):
    remove_stopwords_enabled = bool(_stopword_switch)
else:
    remove_stopwords_enabled = globals().get('remove_stopwords_enabled', True)

# Snapshot of the stop-word list for O(1) membership tests; the function is
# applied to every token of every article.
_stopword_lookup = frozenset(stops)


def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens that are stop words.

    Matching is exact and case-sensitive against the stop-word list as it was
    when this cell ran (which includes the '*' placeholder). The remaining
    tokens are re-joined with single spaces.
    """
    kept = [w for w in text.split() if w not in _stopword_lookup]
    return " ".join(kept)


if remove_stopwords_enabled:
    news['text'] = news['text'].apply(remove_stopwords)

### Explore Lengths of Texts

In [None]:
# Number of whitespace-separated tokens per document.
news['textLength'] = news['text'].str.split().str.len()

In [None]:
# Distribution of document lengths (trailing ';' suppresses the cell output).
news['textLength'].hist(bins=100);

In [None]:
# Summary statistics of the document lengths.
news['textLength'].describe()

In [None]:
# Most frequent document length(s).
news['textLength'].mode()

In [None]:
# Show the shortest document(s) together with their length.
news.loc[news['textLength']==news['textLength'].min(),['textLength','text']]

### Remove very short documents

In [None]:
# Preview documents with at most 5 tokens.
# NOTE(review): this preview uses <= 5 whereas the filter in the following
# cell keeps documents with >= 10 tokens.
news.loc[news['textLength']<=5,'text']

In [None]:
# Keep only documents with at least 10 tokens.
# .copy() makes the result an independent frame: columns of `news` are
# assigned to further down (e.g. when truncating texts), which on a filtered
# slice can trigger pandas' SettingWithCopyWarning.
news = news.loc[news['textLength']>=10].copy()

In [None]:
# Rows and columns remaining after removing the short documents.
news.shape

### Truncate Texts (if True)

In [None]:
# Inspect the first rows before optional truncation.
news.head()

In [None]:
def truncate_text(text, length):
    """Return the first ``length`` whitespace-separated tokens of ``text``.

    The tokens are re-joined with single spaces; texts with fewer tokens are
    returned in full (with whitespace normalised).
    """
    return ' '.join(text.split()[:length])

In [None]:
# Optionally cut every text to its first 500 tokens (see the Options section).
if truncate_texts:
    news['text'] = news['text'].apply(truncate_text, length=500)

### Explore Timestamps

In [None]:
# Number of documents per year.
news['year'].value_counts()

In [None]:
# Order the articles by the 'date' column.
# NOTE(review): 'date' is read from the CSV without date parsing in this file,
# so this presumably sorts strings — confirm the format sorts chronologically.
news = news.sort_values(by = 'date')

### Save as raw texts.txt and as news.csv

In [None]:
# Output files are written next to the input data.
csv_file, txt_file = (
    join(data_dir, file_name) for file_name in ('news.csv', 'texts.txt')
)

In [None]:
# Save the cleaned articles with their identifiers and timestamps.
news[['id','date','year','text']].to_csv(csv_file, sep=";")

# Save the raw texts, one document per line, in the same row order as the CSV.
# The encoding is given explicitly so the file does not depend on the
# platform's locale default.
with open(txt_file, 'w', encoding='utf-8') as f:
    f.writelines(a + '\n' for a in news['text'])