# Final pre-processing 
- Performs the final pre-processing steps required by D-ETM
- Ensures that the data for the word clouds in 07_Visualisation.ipynb has gone through exactly the same pre-processing steps (min_df, max_df, training-vocabulary filter)

## Import Libraries

In [None]:
from collections import Counter
import html
import matplotlib.pyplot as plt
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
import numpy as np
import os
from os.path import join
import pandas as pd
import pickle
from random import shuffle
import re
from scipy import sparse
from scipy.io import savemat, loadmat
from sklearn.feature_extraction.text import CountVectorizer
import string
import time
import unidecode
from wordcloud import WordCloud

## Set Options

In [None]:
# Widen the pandas console output so that wide frames are not wrapped or truncated.
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 100)
# Close the current figure (if any) and switch matplotlib to interactive mode.
plt.close()
plt.interactive(True)

In [None]:
# group_years: bucket the articles into two-year periods ('year_gr') instead of single years.
# use_subset: read the '*_sub' input files and write the output below a 'subset/' folder.
group_years = True
use_subset = False

## Directories

In [None]:
# NOTE(review): hard-coded, machine-specific working directory; every relative
# path used below is resolved against it — adjust when running elsewhere.
os.chdir('/Users/M/Google_Drive/Thesis/Topic-Modeling')

In [None]:
# data_dir: folder the pre-processed input files are read from.
# data_dir_final: base folder for the D-ETM output written further down.
# results_dir: not used in the visible part of this notebook.
data_dir = 'Data/Technology-Data/processed/preprocessed/'
data_dir_final = 'Data/Technology-Data/processed/final/'
results_dir = 'Results/'

In [None]:
# Pick the input files; the subset variants carry a '_sub' suffix.
file_suffix = '_sub' if use_subset else ''
csv_file = data_dir + 'news' + file_suffix + '.csv'
txt_file = data_dir + 'texts' + file_suffix + '.txt'

In [None]:
# Display the input folder for a quick visual check.
data_dir

## Build the Vocabulary & Save Data

### Read Data

In [None]:
# Load the articles: semicolon-separated file, first column used as the index.
news = pd.read_csv(csv_file, sep=";", index_col=0)

In [None]:
# Assign every article to a two-year period stored in the 'year_gr' column.
if group_years:
    # Everything before 2002 falls into the first period.
    news.loc[news['year'] < 2002, 'year_gr'] = '2000-2001'
    # Remaining periods: [2002, 2004), [2004, 2006), ..., [2018, 2020).
    for first_year in range(2002, 2020, 2):
        in_period = (news['year'] >= first_year) & (news['year'] < first_year + 2)
        news.loc[in_period, 'year_gr'] = '{}-{}'.format(first_year, first_year + 1)
else:
    print("Don't group years.")

In [None]:
# Number of articles per time slice (grouped periods or single years).
time_column = 'year_gr' if group_years else 'year'
print(Counter(news[time_column]))

In [None]:
# One timestamp and one text per article, in the row order of `news`.
timestamps = (news['year_gr'] if group_years else news['year']).tolist()
articles = news['text'].tolist()

## Create Input Data for D-ETM

This section of the notebook follows the pre-processing steps by Adji Dieng
(https://github.com/adjidieng/DETM/blob/master/scripts/data_undebates.py) as required for D-ETM. However, a few adjustments were made.

### Create mapping dictionaries for timestamps

In [None]:
# Sorted distinct time slices and the two lookup tables between slice and id.
all_times = sorted(set(timestamps))
id2time = dict(enumerate(all_times))
time2id = {period: idx for idx, period in id2time.items()}
# Time slices ordered by id (same order as all_times).
time_list = list(all_times)

In [None]:
# Display the time-slice -> id mapping.
time2id

### Split into train, test, valid and create vocabulary

In [None]:
num_articles = len(articles)
print('Number of articles: ', num_articles)

# 85 % training, 10 % test; whatever remains is used for validation.
trSize = int(np.floor(0.85 * num_articles))
tsSize = int(np.floor(0.10 * num_articles))
vaSize = int(num_articles - (trSize + tsSize))

print('Defined training set size: {}, test set size: {}, validation set size: {}'.format(trSize, tsSize, vaSize))

In [None]:
# Random article order used for the train/test/validation split.
# NOTE(review): this value is overwritten by a fresh permutation in the
# training-vocabulary cell further down, and no random seed is set, so the
# split differs between runs.
idx_permute = np.random.permutation(num_articles).astype(int)

### Construct Vocabulary for Modeling Using Context-Insensitive Embeddings

#### Maximum / minimum article frequency:
- max_df, maximum proportion of articles: 0.7 (i.e., ignore words occurring in more than 70 % of the articles)
- min_df, minimum absolute number of articles: 50 (i.e., only include words occurring in at least 50 articles)

In [None]:
# 0.1 % of the number of non-empty articles (presumably a reference point for
# choosing min_df — TODO confirm).
0.001*len([a for a in articles if len(a)>0])

In [None]:
# Document-frequency thresholds handed to CountVectorizer below:
# max_df is a proportion of articles, min_df an absolute number of articles.
max_df, min_df = 0.7, 50
print(' max df: {}%, \n min_df: {}'.format(int(max_df * 100), min_df))

#### Apply CountVectorizer
- Count article frequency of words
- Convert collection of articles to a matrix of token counts
- Stopwords will not be removed as that has already been done

In [None]:
# Build the article-term matrix. Tokens are obtained by splitting on single
# spaces; no lower-casing and no stop-word removal is applied here.
# NOTE(review): later cells tokenise with str.split() (any whitespace), which
# differs from split(' ') for repeated spaces/tabs/newlines — confirm the
# texts are strictly single-space separated.
cvectorizer = CountVectorizer(min_df=min_df, max_df=max_df, stop_words=None, lowercase=False, tokenizer=lambda x: x.split(' '))
# .sign() binarises the counts: an entry is 1 if the token occurs in the article.
cvz = cvectorizer.fit_transform(articles).sign()
print('Shape of CVZ matrix (articles, words): ', cvz.shape)
num_articles = cvz.shape[0]
vocab_full_size = cvz.shape[1]
# The dense copy (cvz.toarray()) that used to be created here was never used
# and needs articles x vocabulary memory, so it is no longer materialised.

##### Get, for each token, the number of articles it occurs in (document frequency; `cvz` was binarised with `.sign()`), i.e. one value per token (vocab_full_size)

In [None]:
# Column sums of the binarised matrix: one value per token, shape (1, vocab_full_size).
sum_counts_matrix = cvz.sum(axis=0)

##### The size of the vocabulary is the number of tokens left after correcting for max_df, min_df

In [None]:
# Number of tokens that passed the min_df / max_df filter of the vectorizer.
print('Initial vocabulary size: {}'.format(vocab_full_size))

##### Save the counts for each word in a numpy array instead of a matrix of shape (1, vocab_full_size):

In [None]:
# Flatten the (1, vocab_full_size) matrix into a 1-D integer array.
sum_counts = np.asarray(sum_counts_matrix).ravel().astype(int)

#### Set up word2id and id2word dictionaries:

In [None]:
# Token <-> column-index lookups taken directly from the fitted vectorizer.
word2id = dict(cvectorizer.vocabulary_)
id2word = {column: token for token, column in cvectorizer.vocabulary_.items()}

#### Create Vocabulary & Sort elements in vocabulary

- Sort the tokens by the number of articles they occur in (ascending order) and return the array of indices
- Set up the vocabulary of tokens in ascending order of that article count

In [None]:
# Token indices ordered from the smallest to the largest value in sum_counts.
idx_sort = np.argsort(sum_counts)

In [None]:
# Tokens listed in the order given by idx_sort (ascending sum_counts).
vocab = [id2word[token_id] for token_id in idx_sort]

In [None]:
# cvz was binarised with .sign(), so sum_counts holds the number of articles a
# token appears in, not its total number of occurrences; the message says so.
print('The 5 most common words in our vocabulary are: ', vocab[-5:])
print('They occur in the following number of articles:')
# np.sort + tolist() avoids a Python-level sort and prints plain integers.
print(np.sort(sum_counts)[-5:].tolist())

In [None]:
# n-grams are recognised by their underscores: one for bigrams, two for trigrams.
used_trigrams = [token for token in vocab if token.count('_') == 2]
used_bigrams = [token for token in vocab if token.count('_') == 1]

In [None]:
# Number of vocabulary tokens containing exactly one underscore.
len(used_bigrams)

In [None]:
# Number of vocabulary tokens containing exactly two underscores.
len(used_trigrams)

#### Create dictionary and inverse dictionary

In [None]:
# Re-index the tokens by their position in the sorted vocabulary.
id2word = dict(enumerate(vocab))
word2id = {token: position for position, token in id2word.items()}

In [None]:
# Round-trip check: mapping the last vocabulary token to its id and back
# should display that same token.
id2word[word2id[vocab[-1]]]

##### For every article that is randomly assigned to the training set, split the article into tokens and keep each token that is in word2id. Words that occur only in the test or validation set are therefore not included in the vocabulary.

In [None]:
# Draw the random article order; the first trSize entries form the training set.
# NOTE(review): no random seed is set, so the split differs between runs.
idx_permute = np.random.permutation(num_articles).astype(int)

# Collect every in-vocabulary token that occurs in at least one training article.
train_tokens = set()
for idx_article in idx_permute[:trSize]:
    train_tokens.update(w for w in articles[idx_article].split() if w in word2id)

# Sort instead of relying on set iteration order, which changes between
# interpreter runs (string hash randomisation) and would make the word ids
# non-reproducible for an otherwise identical split.
vocab = sorted(train_tokens)
word2id = {w: j for j, w in enumerate(vocab)}
id2word = {j: w for j, w in enumerate(vocab)}
vocab_train_size = len(vocab)
if (vocab_full_size - vocab_train_size) > 0:
    print(vocab_full_size - vocab_train_size, ' words have not been included in the vocabulary as they do not occur in the training data.')
    print('Vocabulary after removing words not in train: {}'.format(len(vocab)))

In [None]:
def _encode_articles(article_ids):
    """Return, per selected article, the vocabulary ids of its in-vocabulary tokens."""
    return [[word2id[w] for w in articles[i].split() if w in word2id] for i in article_ids]


def _encode_timestamps(article_ids):
    """Return the time-slice id of every selected article."""
    return [time2id[timestamps[i]] for i in article_ids]


# Positions in idx_permute: [0, trSize) train, then tsSize test, then vaSize validation.
_tr_ids = idx_permute[:trSize]
_ts_ids = idx_permute[trSize:trSize + tsSize]
_va_ids = idx_permute[trSize + tsSize:trSize + tsSize + vaSize]

articles_full = _encode_articles(range(len(articles)))
timestamps_full = _encode_timestamps(range(len(articles)))
articles_tr = _encode_articles(_tr_ids)
timestamps_tr = _encode_timestamps(_tr_ids)
articles_ts = _encode_articles(_ts_ids)
timestamps_ts = _encode_timestamps(_ts_ids)
articles_va = _encode_articles(_va_ids)
timestamps_va = _encode_timestamps(_va_ids)

# Sanity checks: each split must have the intended size and one timestamp per
# article (the previous checks compared a length with itself).
print('number of articles (full dataset): {} \n... equal to len(articles) and len(timestamps_full)? - {}'.format(
    len(articles_full), len(articles_full) == len(articles) == len(timestamps_full)))
print('number of articles (train): {} \n... equal to len(articles_tr) and len(timestamps_tr)? - {}'.format(
    len(articles_tr), len(articles_tr) == trSize == len(timestamps_tr)))
print('number of articles (test): {} \n... equal to len(articles_ts) and len(timestamps_ts)? - {}'.format(
    len(articles_ts), len(articles_ts) == tsSize == len(timestamps_ts)))
print('number of articles (valid): {} \n... equal to len(articles_va) and len(timestamps_va)? - {}'.format(
    len(articles_va), len(articles_va) == vaSize == len(timestamps_va)))

### Split test set in 2 halves
This is done to perform the document completion task

In [None]:
# First half gets floor(len / 2) tokens, second half gets the rest.
articles_ts_h1 = [article[:len(article) // 2] for article in articles_ts]
articles_ts_h2 = [article[len(article) // 2:] for article in articles_ts]

### Getting lists of words and article_indices

In [None]:
print('Creating lists of words...')
def create_list_words(in_articles):
    """Concatenate the token lists of all articles into one flat list."""
    flat_words = []
    for article in in_articles:
        flat_words.extend(article)
    return flat_words

# Flat token-id lists, one per data split.
words_full = create_list_words(articles_full)
words_tr = create_list_words(articles_tr)
words_ts = create_list_words(articles_ts)
words_ts_h1 = create_list_words(articles_ts_h1)
words_ts_h2 = create_list_words(articles_ts_h2)
words_va = create_list_words(articles_va)

# Report the number of tokens in every split.
for split_name, split_words in (
        ('full', words_full),
        ('tr', words_tr),
        ('ts', words_ts),
        ('ts_h1', words_ts_h1),
        ('ts_h2', words_ts_h2),
        ('va', words_va)):
    print('len(words_{}): '.format(split_name), len(split_words))

### Get article indices

In [None]:
def create_article_indices(in_articles):
    """Return, for every token of every article, the position of its article."""
    indices = []
    for position, article in enumerate(in_articles):
        # repeat the article position once per token of that article
        indices.extend([position] * len(article))
    return indices

# Article position of every token, one list per data split.
article_indices_full = create_article_indices(articles_full)
article_indices_tr = create_article_indices(articles_tr)
article_indices_ts = create_article_indices(articles_ts)
article_indices_ts_h1 = create_article_indices(articles_ts_h1)
article_indices_ts_h2 = create_article_indices(articles_ts_h2)
article_indices_va = create_article_indices(articles_va)

# There must be exactly one article index per token.
for split_name, split_indices, split_words in (
        ('full', article_indices_full, words_full),
        ('tr', article_indices_tr, words_tr)):
    print('len(article_indices_{0}): {1} \n...should be len(words_{0}): {2}'.format(
        split_name, len(split_indices), len(split_words)))
print('\n')
# Number of distinct article indices compared with the number of articles.
for split_name, split_indices, split_articles in (
        ('full', article_indices_full, articles_full),
        ('tr', article_indices_tr, articles_tr),
        ('ts', article_indices_ts, articles_ts),
        ('ts_h1', article_indices_ts_h1, articles_ts_h1),
        ('ts_h2', article_indices_ts_h2, articles_ts_h2),
        ('va', article_indices_va, articles_va)):
    print('len(np.unique(article_indices_{})): {} \n...should be {}'.format(
        split_name, len(np.unique(split_indices)), len(split_articles)))

### Number of articles in each set

In [None]:
# Number of articles in every data split.
(n_articles_full, n_articles_tr, n_articles_ts,
 n_articles_ts_h1, n_articles_ts_h2, n_articles_va) = (
    len(split) for split in (
        articles_full, articles_tr, articles_ts,
        articles_ts_h1, articles_ts_h2, articles_va))

### Create BoW representations

In [None]:
def create_bow(article_indices, words, n_articles, vocab_size):
    """Build a CSR bag-of-words matrix of shape (n_articles, vocab_size).

    Every (article index, word id) pair contributes a 1; the conversion to
    CSR adds up repeated pairs.
    """
    ones = [1] * len(article_indices)
    coordinates = (article_indices, words)
    bow = sparse.coo_matrix((ones, coordinates), shape=(n_articles, vocab_size))
    return bow.tocsr()

# One bag-of-words matrix per data split, all with the same number of columns.
vocab_size = len(vocab)
bow_full = create_bow(article_indices_full, words_full, n_articles_full, vocab_size)
bow_tr = create_bow(article_indices_tr, words_tr, n_articles_tr, vocab_size)
bow_ts = create_bow(article_indices_ts, words_ts, n_articles_ts, vocab_size)
bow_ts_h1 = create_bow(article_indices_ts_h1, words_ts_h1, n_articles_ts_h1, vocab_size)
bow_ts_h2 = create_bow(article_indices_ts_h2, words_ts_h2, n_articles_ts_h2, vocab_size)
bow_va = create_bow(article_indices_va, words_va, n_articles_va, vocab_size)

In [None]:
# Shapes (articles, vocabulary) of the full, train, test and validation matrices.
print(bow_full.shape, bow_tr.shape,bow_ts.shape,bow_va.shape)

In [None]:
# Extend the output folder with one sub-folder per active option and min_df.
# NOTE: this cell appends to data_dir_final, so running it twice nests the
# folders twice.
if use_subset:
    data_dir_final += 'subset/'
if group_years:
    data_dir_final += 'grouped_years/'

min_df_folder = 'min_df_{}'.format(min_df)
data_dir_final = os.path.join(data_dir_final, min_df_folder)
print('Final data directory:', data_dir_final)

In [None]:
# Create the output folder (including parents) if it does not exist yet.
# os.makedirs replaces the former shell call to `mkdir -p`: it works on every
# platform, copes with spaces in the path and raises on failure.
os.makedirs(data_dir_final, exist_ok=True)

### Save the vocabulary and timestamps

In [None]:
# Vocabulary: one token per line as text, plus the list itself as a pickle.
vocab_txt_path = os.path.join(data_dir_final,'vocab.txt')
with open(vocab_txt_path, "w") as f:
    f.writelines(token + '\n' for token in vocab)

vocab_pkl_path = os.path.join(data_dir_final, 'vocab.pkl')
with open(vocab_pkl_path, 'wb') as f:
    pickle.dump(vocab, f)

# Time slices: same two formats.
times_txt_path = os.path.join(data_dir_final,'timestamps.txt')
with open(times_txt_path, "w") as f:
    f.writelines(str(period) + '\n' for period in time_list)

times_pkl_path = os.path.join(data_dir_final,'timestamps.pkl')
with open(times_pkl_path, 'wb') as f:
    pickle.dump(time_list, f)

### Save timestamps corresponding to BoW document representations

In [None]:
# Write the per-article time-slice ids of every split to its own .mat file.
for split_name, split_timestamps in (
        ('full', timestamps_full),
        ('tr', timestamps_tr),
        ('ts', timestamps_ts),
        ('va', timestamps_va)):
    mat_path = os.path.join(data_dir_final, 'bow_{}_timestamps.mat'.format(split_name))
    savemat(mat_path, {'timestamps': split_timestamps}, do_compression=True)

### Split bow into token/value pairs & save

In [None]:
def split_bow(bow_in, n_articles):
    """Split a bag-of-words matrix into per-article word ids and counts.

    Returns two lists of length n_articles: the stored column indices of
    every row and the corresponding stored values.
    """
    indices, counts = [], []
    for doc in range(n_articles):
        row = bow_in[doc, :]
        indices.append(list(row.indices))
        counts.append(list(row.data))
    return indices, counts

In [None]:
# Token-id / count lists per article for every split.
bow_full_tokens, bow_full_counts = split_bow(bow_full, n_articles_full)
bow_tr_tokens, bow_tr_counts = split_bow(bow_tr, n_articles_tr)
bow_ts_tokens, bow_ts_counts = split_bow(bow_ts, n_articles_ts)
bow_ts_h1_tokens, bow_ts_h1_counts = split_bow(bow_ts_h1, n_articles_ts_h1)
bow_ts_h2_tokens, bow_ts_h2_counts = split_bow(bow_ts_h2, n_articles_ts_h2)
bow_va_tokens, bow_va_counts = split_bow(bow_va, n_articles_va)

# Each split is stored as a pair of files: bow_<split>_tokens.mat and bow_<split>_counts.mat.
for split_name, split_tokens, split_counts in (
        ('full', bow_full_tokens, bow_full_counts),
        ('tr', bow_tr_tokens, bow_tr_counts),
        ('ts', bow_ts_tokens, bow_ts_counts),
        ('ts_h1', bow_ts_h1_tokens, bow_ts_h1_counts),
        ('ts_h2', bow_ts_h2_tokens, bow_ts_h2_counts),
        ('va', bow_va_tokens, bow_va_counts)):
    tokens_path = os.path.join(data_dir_final, 'bow_{}_tokens.mat'.format(split_name))
    counts_path = os.path.join(data_dir_final, 'bow_{}_counts.mat'.format(split_name))
    savemat(tokens_path, {'tokens': split_tokens}, do_compression=True)
    savemat(counts_path, {'counts': split_counts}, do_compression=True)

## Create Data for Word Clouds

In [None]:
# Concatenate all article texts of a time slice into one pseudo-document.
# Use the same time column as the rest of the notebook, so this also works
# when group_years is False (no 'year_gr' column exists in that case).
time_col = 'year_gr' if group_years else 'year'
# Join with a space: the texts are tokenised on whitespace afterwards, and a
# ',' separator would glue the last token of one article to the first token
# of the next ("a,b"), so both would be dropped as out-of-vocabulary.
concat_texts = news.groupby(time_col)['text'].apply(' '.join).reset_index()
concat_texts.rename(columns={time_col: 'time', 'text': 'text_orig'}, inplace=True)

In [None]:
def remove_non_vocab(text):
    """Return `text` reduced to the whitespace-separated tokens found in word2id.

    The kept tokens are re-joined with single spaces in their original order.
    """
    # A direct membership test replaces the former id2word[word2id[w]]
    # round-trip, which only mapped every token back to itself.
    return " ".join(w for w in text.split() if w in word2id)

# Keep only in-vocabulary tokens per time slice and count what is left.
filtered_texts = concat_texts['text_orig'].apply(remove_non_vocab)
concat_texts['words'] = filtered_texts
concat_texts['number_words'] = concat_texts['words'].str.split().str.len()

In [None]:
# Display the per-time-slice pseudo-documents and their token counts.
concat_texts

In [None]:
# Make sure the word-cloud folder exists.
wordcloud_base = data_dir + 'pseudotext_wordClouds'
if not os.path.exists(wordcloud_base):
    os.makedirs(wordcloud_base)

# NOTE(review): the CSV is written next to the folder created above
# ('<data_dir>pseudotext_wordClouds.csv'), not into it — confirm which
# location 07_Visualisation.ipynb reads from.
wordcloud_columns = concat_texts[['time','words']]
wordcloud_columns.to_csv(wordcloud_base + '.csv')