# This notebook contains the text processing algorithms 

In [None]:
#import stemming algorithm
%run czech_stemmer.py light

**Firstly, we import all necessary libraries**

In [None]:
import time
import pandas as pd
import nltk
import string
import re
import numpy as np
import gensim
import warnings
import tensorflow as tf
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.stem.snowball import SnowballStemmer
from stop_words import get_stop_words
import simplemma
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


from tensorflow.keras.layers import TextVectorization

from tensorflow.keras.layers import Embedding
from tensorflow import data 
from tensorflow import keras
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
import gensim
import warnings
#ignore all Futurewarnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

## In this section we implement functions related to basic text preprocessing, e.g. tokenization, stop-word removal, etc.

In [None]:
# Words that must never be lowercased because their meaning is ambiguous once
# the case is lost (presumably 'US' the country versus the pronoun 'us').
# Read by create_ngrams() and remove_stop_words().
untouchable_words = ['US']

In [None]:
def remove_punctation(news_articles: pd.Series) -> pd.Series:
    '''
    Replaces every punctuation character in each article with a single space.

    The character set is ``string.punctuation`` extended with several
    typographic quotes, dashes and symbols.

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of news articles as strings.

    Returns
    -------
    pandas.Series
        Series of articles in which each punctuation character became a space.
    '''
    punctuation_chars = string.punctuation + 'ʺ-„–‘’“”—✔️©'
    # One-to-one table: every punctuation character maps to a blank.
    blank_out = str.maketrans(punctuation_chars, ' ' * len(punctuation_chars))
    return news_articles.str.translate(blank_out)
  

In [None]:

def remove_urls(news_articles: pd.Series) -> pd.Series:
    '''
    Strips URLs from every news article in the given series.

    Anything starting with ``http`` up to the next whitespace is deleted;
    anything starting with ``www`` up to the next whitespace is replaced
    by a single space.

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of news articles.

    Returns
    -------
    pandas.Series
        Series of articles without urls.
    '''
    without_http = news_articles.replace(to_replace=r'http\S+', value='', regex=True)
    return without_http.replace(to_replace=r'www\S+', value=' ', regex=True)

In [None]:
def remove_numbers(news_articles: pd.Series) -> pd.Series:
    '''
    Removes numerical characters from all news articles in the given series (one-dimensional ndarray).

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of news articles.

    Returns
    -------
    pandas.Series
        Series of articles stripped of numbers.
    '''
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would look for the text "\d+" and leave all digits in place.
    # The raw string avoids the invalid "\d" escape sequence.
    return news_articles.str.replace(r'\d+', '', regex=True)
    

In [None]:
def remove_special_characters(news_articles: pd.Series) -> pd.Series:
    '''
    Cleans the articles of urls, punctuation and numerals, in that order.

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of news articles.

    Returns
    -------
    pandas.Series
        Series of articles stripped of all unnecessary characters.
    '''
    # URLs go first so that their punctuation and digits are dropped with them.
    without_urls = remove_urls(news_articles)
    without_punctuation = remove_punctation(without_urls)
    return remove_numbers(without_punctuation)

In [None]:
def truncate_articles(news_artricles:pd.Series, length:int)-> pd.Series:
    '''
    Cuts every article down to at most ``length`` leading elements.

    Each row is sliced with ``row[:length]``, so the unit of ``length`` is
    whatever the rows are made of (tokens for tokenized rows).

    Parameters
    ----------
    news_artricles: pandas.Series
            Series (one-dimensional ndarray) of news articles.
    length: int
            Maximal allowed number of elements (words) per article.

    Returns
    -------
    pandas.Series
        Series of truncated articles.
    '''
    def _head(row):
        return row[:length]

    return news_artricles.apply(_head)

In [None]:

def create_ngrams(row:str,gram_size:int) ->list:
    '''
    Tokenizes a single news article and groups the tokens into n-grams.

    Tokens are lowercased unless they are listed in ``untouchable_words``.

    Parameters
    ----------
    row: str
            News article in form of string.
    gram_size: int
            Size of the n-grams produced from the token sequence.

    Returns
    -------
    list
        List of n-grams, each n-gram being a list of ``gram_size`` tokens.
    '''
    tokens = []
    for token in word_tokenize(row):
        tokens.append(token if token in untouchable_words else token.lower())
    # nltk yields tuples; callers expect plain lists.
    return [list(gram) for gram in ngrams(tokens, gram_size)]

def tokenize(news_articles: pd.Series,gram_size: int) -> pd.Series:
    '''
    Turns every article into a list of n-grams of the given size.

    Each row is first joined with spaces into one string and then passed
    to ``create_ngrams``.

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of news articles; each row is an iterable of strings.
    gram_size: int
            Size of the n-grams used in tokenization.

    Returns
    -------
    pandas.Series
        Series of articles tokenized into n-grams with given size.
    '''
    joined_articles = news_articles.apply(lambda words: " ".join(words))
    return joined_articles.apply(lambda text: create_ngrams(text, gram_size))

In [None]:
def remove_stop_words(news_articles: pd.Series,language:str ='en') ->pd.Series:
    '''
    Removes stop words of the given language from news articles and splits them into words.

    Every article is split on whitespace; words whose lowercase form is a stop
    word are dropped and the remaining words are lowercased, except for those
    listed in ``untouchable_words``, which keep their original case.

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of news articles as strings.
    language: str
            Language used in articles. Available options are 'en' or 'cs'.

    Raises
    ------
    ValueError
            If the entered language is not supported.

    Returns
    -------
    pandas.Series
        Series of word lists stripped of stop words.
    '''
    if language == 'en':
        language_stop_words = nltk.corpus.stopwords.words('english')
    elif language == 'cs':
        language_stop_words = get_stop_words('czech')
    else:
        raise ValueError("You have entered the wrong language!")

    # Build a fresh set (with the empty string added) instead of appending to
    # the list handed out by the library, so that list is never modified.
    stop_words = set(language_stop_words) | {''}

    def _keep_content_words(row):
        return [
            word if word in untouchable_words else word.lower()
            for word in row.split()
            if word.lower() not in stop_words
        ]

    return news_articles.apply(_keep_content_words)

    

In [None]:
def stem(news_articles: pd.Series,language:str='en', embedding:bool = False) -> pd.Series:
    '''
    Stems the tokens in the articles by specified language.

    Uses SnowballStemmer for English and ``cz_stem`` for Czech. ``cz_stem``
    is expected to be already defined in the notebook namespace by running
    ``czech_stemmer.py`` (done in the first code cell of this notebook).

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of already tokenized news articles.
    language: string
            Language used in articles. Available options are 'en' or 'cs'.
    embedding: bool
            If True, each row is a flat list of words; if False (default),
            each row is a list of n-grams (lists of words).

    Raises
    ------
    ValueError
            If the provided language is not supported.

    Returns
    -------
    pandas.Series
        Series of stemmed tokenized articles with the same nesting as the input.
    '''
    if language not in ('en', 'cs'):
        raise ValueError("You have entered the wrong language!")

    # Pick the per-word stemmer once; the script defining cz_stem is no longer
    # re-executed here on every call.
    if language == 'en':
        stem_word = SnowballStemmer("english").stem
    else:
        stem_word = cz_stem

    if embedding:
        return news_articles.apply(lambda row: [stem_word(word) for word in row])

    return news_articles.apply(lambda row: [[stem_word(word) for word in ngram] for ngram in row])
    

In [None]:
def lemmatize(news_articles: pd.Series,language:str='en', embedding:bool=False) -> pd.Series:
    '''
    Lemmatizes the tokens in the articles with simplemma for the given language.

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of already tokenized news articles.
    language: string
            Language used in articles. Available options are 'en' or 'cs'.
    embedding: bool
            If True, each row is a flat list of words; if False (default),
            each row is a list of n-grams (lists of words).

    Raises
    ------
    ValueError
            If the provided language is not supported.

    Returns
    -------
    pandas.Series
        Series of lemmatized tokenized articles with the same nesting as the input.
    '''
    supported_languages = ('en', 'cs')
    if language not in supported_languages:
        raise ValueError("You have entered the wrong language!")

    def _lemma(word):
        return simplemma.lemmatize(word, lang=language)

    def _lemmatize_flat(row):
        return [_lemma(word) for word in row]

    def _lemmatize_ngrams(row):
        return [[_lemma(word) for word in ngram] for ngram in row]

    return news_articles.apply(_lemmatize_flat if embedding else _lemmatize_ngrams)



## In this section we implement functions that perform feature extraction (bag-of-words and tf-idf) from a given preprocessed dataset, together with related helpers

In [None]:

def covert_tokens_to_strings(news_articles:pd.Series) -> list:
    '''
    Joins the tokens of every article into one space-separated string.

    Each token is converted with ``str`` before joining.

    Parameters
    ----------
    news_articles: pandas.Series
            Tokenized articles.

    Returns
    -------
    list
        One string per article, in the original order.
    '''
    joined_articles = []
    for tokens in news_articles:
        joined_articles.append(' '.join(str(token) for token in tokens))
    return joined_articles

In [None]:
def create_BOW(news_articles: pd.Series,ngram_size_range:tuple=(1,1)) ->tuple[np.ndarray, sk.feature_extraction.text.CountVectorizer]:
    '''
    Fits a bag-of-words vectorizer on the given articles and transforms them.

    The vocabulary is capped at 10000 features and the text is not
    lowercased by the vectorizer. The input data must be training data.

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of preprocessed news articles.
    ngram_size_range: tuple
            Range of possible sizes of n-grams.

    Returns
    -------
    numpy.ndarray
        Dense document-term matrix of the training data.
    sklearn.feature_extraction.text.CountVectorizer
        Fitted vectorizer with vocabulary dictionary.
    '''
    documents = covert_tokens_to_strings(news_articles)
    count_vectorizer = CountVectorizer(
        max_features=10000,
        ngram_range=ngram_size_range,
        lowercase=False,
    )
    document_term_matrix = count_vectorizer.fit_transform(documents).toarray()
    return document_term_matrix, count_vectorizer
    

In [None]:
def create_tf_idf(news_articles: pd.Series,ngram_range:tuple=(1,1)) -> tuple[np.ndarray, sk.feature_extraction.text.TfidfVectorizer]:
    '''
    Fits a tf-idf vectorizer on the given articles and transforms them.

    The vocabulary is capped at 10000 features and the text is not
    lowercased by the vectorizer. The input data must be training data.

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of preprocessed news articles.
    ngram_range: tuple
            Range of possible sizes of n-grams.

    Returns
    -------
    numpy.ndarray
        Dense document-term matrix of the training data.
    sklearn.feature_extraction.text.TfidfVectorizer
        Fitted vectorizer with vocabulary dictionary.
    '''
    documents = covert_tokens_to_strings(news_articles)
    tf_idf_vectorizer = TfidfVectorizer(
        max_features=10000,
        ngram_range=ngram_range,
        lowercase=False,
    )
    document_term_matrix = tf_idf_vectorizer.fit_transform(documents).toarray()
    return document_term_matrix, tf_idf_vectorizer

In [None]:
def apply_vectorizer(news_articles: pd.Series,vectorizer:sk.feature_extraction.text.CountVectorizer|sk.feature_extraction.text.TfidfVectorizer)->np.array:
    '''
    Transforms articles with an already fitted vectorizer (no re-fitting).

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of preprocessed news articles - test data.
    vectorizer: sk.feature_extraction.text.CountVectorizer or sk.feature_extraction.text.TfidfVectorizer
            Fitted vectorizer used in this project i.e. tf-idf or bag-of-words.

    Returns
    -------
    numpy.ndarray
        Dense document-term matrix for the given articles.
    '''
    documents = covert_tokens_to_strings(news_articles)
    sparse_matrix = vectorizer.transform(documents)
    return sparse_matrix.toarray()

## In this section we implement functions related to working with word embeddings in both languages.

### In this section we prepare functions related to word2vec embedding

In [None]:
def create_vectorizer(news_articles:pd.Series,max_tokens:int=10000) -> tf.keras.layers.TextVectorization:
    '''
    Builds a keras TextVectorization layer and adapts it to the given articles.

    Parameters
    ----------
    news_articles: pd.Series
            Series (one-dimensional ndarray) of news articles; each row is an iterable of tokens.
    max_tokens: int
            Maximal size of the vocabulary. Default value is set to 10 000.
            The layer is created with ``max_tokens + 2`` (presumably to leave
            room for the padding and out-of-vocabulary entries - confirm).

    Returns
    -------
    tf.keras.layers.TextVectorization
        Preprocessing layer that contains the vocabulary of the dataset.
    '''
    # One space-separated string per article.
    documents = [' '.join(str(token) for token in tokens) for tokens in news_articles]
    batched_documents = tf.data.Dataset.from_tensor_slices(documents).batch(128)

    vectorizer = TextVectorization(max_tokens=max_tokens + 2)
    vectorizer.adapt(batched_documents)
    return vectorizer
    

In [None]:
def create_word2vec(news_articles : pd.Series, embedding_dim:int) -> gensim.models.word2vec.Word2Vec:
    '''
    Creates and trains a word2vec model on the provided data for 20 epochs.

    One worker thread per available CPU core is used during training.

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of tokenized news articles.
    embedding_dim: int
            Dimension of embedding vectors specified for given dataset.

    Returns
    -------
    gensim.models.word2vec.Word2Vec
        The trained word2vec model.
    '''
    import os  # local import: only needed to size the worker pool

    # gensim starts exactly `workers` threads, so a value of -1 starts none and
    # the model is left untrained. Use the real core count instead.
    worker_count = os.cpu_count() or 1
    return Word2Vec(news_articles, vector_size=embedding_dim, epochs=20, workers=worker_count)

In [None]:
def prepare_W2V_matrix(w2v_model: gensim.models.word2vec.Word2Vec,vectorizer:tf.keras.layers.TextVectorization,embedding_dimension:int=100) -> np.ndarray:
    '''
    Creates an embedding matrix from a trained word2vec model and a vectorizer vocabulary.

    Row ``i`` holds the word2vec vector of the ``i``-th vocabulary entry; rows
    of words unknown to the model stay zero.

    Parameters
    ----------
    w2v_model: gensim.models.word2vec.Word2Vec
            Word2Vec embedding model trained on the training dataset.
    vectorizer: tf.keras.layers.TextVectorization
            A preprocessing layer which contains vocabulary for concrete dataset.
    embedding_dimension: int
            Dimension of embedding vectors. Default value is set to 100, 300 is for czech dataset.
            It must match the vector size of ``w2v_model``.

    Returns
    -------
    numpy.ndarray
        2D matrix of shape (vocabulary size + 2, embedding_dimension) used in the embedding layer.
    '''
    vocabulary = vectorizer.get_vocabulary()
    # Two rows beyond the vocabulary size are kept so the matrix shape that
    # callers rely on stays the same.
    embedding_matrix = np.zeros((len(vocabulary) + 2, embedding_dimension))

    # Built once up front: rebuilding this set for every word made the loop quadratic.
    known_words = set(w2v_model.wv.index_to_key)

    for index, word in enumerate(vocabulary):
        if word in known_words:
            embedding_matrix[index] = w2v_model.wv.get_vector(word)

    return embedding_matrix




def create_W2V_layer(train_articles:pd.Series,vectorizer:tf.keras.layers.TextVectorization,input_len:int,embedding_dimension:int=100) ->keras.layers.Embedding:
    '''
    Creates a frozen keras Embedding layer initialised with word2vec vectors trained on the given articles.

    Parameters
    ----------
    train_articles: pandas.Series
            Training part of the input dataset on which the word2vec model is trained.
    vectorizer: tensorflow.keras.layers.TextVectorization
            Vocabulary of words extracted from training part of the preprocessed dataset.
    input_len: int
            Maximal length of the article that the whole dataset was truncated to.
            Currently not used by this function.
    embedding_dimension: int
            Dimension of embedding vectors. Default value is set to 100, 300 is for czech dataset.

    Returns
    -------
    keras.layers.Embedding
        Non-trainable embedding layer to be used as part of a neural network model.
    '''
    w2v_model = create_word2vec(train_articles, embedding_dimension)
    # The dimension has to be forwarded; otherwise the matrix is always built
    # with 100 columns and any other vector size fails with a shape mismatch.
    embedding_matrix = prepare_W2V_matrix(w2v_model, vectorizer, embedding_dimension)

    embedding_layer = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_dimension,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False)

    return embedding_layer

### In this section we implement functions that prepare pre-trained GloVe word embeddings for experiments

**First, we prepare pretrained GloVe 100D vectors in English from https://github.com/stanfordnlp/GloVe.**

In [None]:
# Dimension of the pretrained English GloVe vectors.
EN_EMBEDDING_DIM = 100
# Word -> embedding vector mapping for English; starts empty.
EN_GL_EMBEDDING_IDX = {}

def create_en_glove_index(path: str = 'glove.6B.100d.txt') -> dict:
    '''
    Reads pretrained GloVe word embedding from a text file and creates a dictionary of words and their embedding vectors.

    Every line is expected to hold a word followed by its space-separated
    coefficients. Lines with fewer than two fields (e.g. blank lines) are skipped.

    Parameters
    ----------
    path: str
            Path of the GloVe text file. Defaults to 'glove.6B.100d.txt'.

    Returns
    -------
    dictionary
        Dictionary mapping each word to its vector (numpy array of float32).
    '''
    glove_embeddings_index = {}
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, encoding='UTF-8') as glove_file:
        for line in glove_file:
            fields = line.split(maxsplit=1)
            if len(fields) < 2:
                continue
            word, coefs = fields
            glove_embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
    return glove_embeddings_index
# Load the English GloVe index when this cell is executed.
EN_GL_EMBEDDING_IDX = create_en_glove_index()
# IPython magic: persists the variable so that another notebook/kernel can
# restore it with `%store -r EN_GL_EMBEDDING_IDX`.
%store EN_GL_EMBEDDING_IDX

**Now, we prepare pretrained Glove 300D vectors in Czech from https://github.com/Svobikl/cz_corpus.**

In [None]:
# Dimension of the pretrained Czech GloVe vectors.
CZ_EMBEDDING_DIM = 300
# Word -> embedding vector mapping for Czech; starts empty.
CS_GL_EMBEDDING_IDX = {}
def create_cz_glove_index(path: str = 'vectors_cz_glove_dim300_25.txt') -> dict:
    '''
    Reads pretrained Czech GloVe word embedding from a text file and creates a dictionary of words and their embedding vectors.

    Skipped lines:
      * lines in which whitespace is followed by a lowercase ASCII letter
        (presumably entries whose remainder is not purely numeric - confirm),
      * lines with fewer than two fields (e.g. blank lines),
      * lines whose word contains a digit.
    Underscores in the word are replaced by spaces.

    Parameters
    ----------
    path: str
            Path of the GloVe text file. Defaults to 'vectors_cz_glove_dim300_25.txt'.

    Returns
    -------
    dictionary
        Dictionary mapping each word to its vector (numpy array of float32).
    '''
    # Raw string avoids the invalid "\s" escape; compiled once, not per line.
    letter_after_whitespace = re.compile(r".*\s[a-z]")

    glove_embeddings_index = {}
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, encoding='UTF-8') as glove_file:
        for line in glove_file:
            if letter_after_whitespace.match(line):
                continue

            fields = line.split(maxsplit=1)
            if len(fields) < 2:
                continue
            word, coefs = fields

            if any(char.isdigit() for char in word):
                continue

            word = word.replace("_", " ")
            glove_embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
    return glove_embeddings_index

# Load the Czech GloVe index when this cell is executed.
CS_GL_EMBEDDING_IDX = create_cz_glove_index()
# IPython magic: persists the variable so that another notebook/kernel can
# restore it with `%store -r CS_GL_EMBEDDING_IDX`.
%store CS_GL_EMBEDDING_IDX

In [None]:

def prepare_glove_matrix(embedding_dict:dict,vectorizer:tf.keras.layers.TextVectorization,embedding_dim:int=100) -> np.ndarray:
    '''
    Builds an embedding matrix from a pretrained embedding dictionary and a vectorizer vocabulary.

    Row ``i`` holds the vector of the ``i``-th vocabulary entry; rows of words
    missing from ``embedding_dict`` stay zero.

    Parameters
    ----------
    embedding_dict: dictionary
            Dictionary of words with corresponding vectors.
    vectorizer: tensorflow.keras.layers.TextVectorization
            A preprocessing layer which contains vocabulary for concrete dataset.
    embedding_dim: int
            Dimension of embedding vectors. Default value is set to 100, 300 is for czech dataset.

    Returns
    -------
    numpy.ndarray
        2D matrix of shape (vocabulary size, embedding_dim) used in the embedding layer.
    '''
    vocabulary = vectorizer.get_vocabulary()
    embedding_matrix = np.zeros((len(vocabulary), embedding_dim))

    for row_index, word in enumerate(vocabulary):
        pretrained_vector = embedding_dict.get(word)
        if pretrained_vector is None:
            continue
        embedding_matrix[row_index] = pretrained_vector

    return embedding_matrix


def create_GloVeWE_layer(vectorizer:tf.keras.layers.TextVectorization,embedding_dict:dict,input_length:int, embedding_dim:int=100) ->keras.layers.Embedding:
    '''
    Creates a frozen keras Embedding layer initialised with pretrained GloVe vectors.

    Parameters
    ----------
    vectorizer: tensorflow.keras.layers.TextVectorization
            Vocabulary of words extracted from training part of the preprocessed dataset.
    embedding_dict: dictionary
            Dictionary where every word from pretrained embedding has corresponding vector.
    input_length: int
            Maximal length of the article that the whole dataset was truncated to.
            Currently not used by this function.
    embedding_dim: int
            Dimension of embedding vectors. Default value is set to 100, 300 is for czech dataset.

    Returns
    -------
    keras.layers.Embedding
        Non-trainable embedding layer to be used as part of a neural network model.
    '''
    glove_matrix = prepare_glove_matrix(embedding_dict, vectorizer, embedding_dim)
    vocabulary_size = len(vectorizer.get_vocabulary())

    return Embedding(
        vocabulary_size,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(glove_matrix),
        trainable=False,
    )

In [None]:
def vectorize_articles(news_articles:pd.Series,vectorizer:tf.keras.layers.TextVectorization) -> np.array:
    '''
    Joins every article into one string and maps its words to vocabulary indices with the given vectorizer.

    Parameters
    ----------
    news_articles: pandas.Series
            Subset of data that is going to be vectorized; each row is an iterable of strings.
    vectorizer: tensorflow.keras.layers.TextVectorization
            Vocabulary of words extracted from training part of the preprocessed dataset.

    Returns
    -------
    numpy.ndarray
        Articles with words converted to corresponding numbers.
    '''
    joined_articles = news_articles.apply(lambda tokens: ' '.join(tokens))
    # The vectorizer is fed a column of single-string rows.
    text_column = np.array([[text] for text in joined_articles])
    return np.array(vectorizer(text_column))

In [None]:
def mean_vector_text(news_article:list, word_vector_list:gensim.models.keyedvectors.KeyedVectors|dict,embedding_dim:int) -> np.ndarray:
    '''
    Computes the mean of the embedding vectors of all words in one article.

    Words without a vector (missing from ``word_vector_list`` or stored as an
    empty array) contribute a zero vector to the mean. An empty article yields
    a zero vector instead of NaNs.

    Parameters
    ----------
    news_article: list
            Single news article as a list of words.
    word_vector_list: gensim.models.keyedvectors.KeyedVectors or dict
            Mapping that stores embedding vectors for corresponding words.
    embedding_dim: int
            Dimension of embedding vectors.

    Returns
    -------
    numpy.ndarray
        Vector of length ``embedding_dim`` holding the mean over the article's words.
    '''
    # Averaging over zero rows would produce NaNs (and a RuntimeWarning).
    if len(news_article) == 0:
        return np.zeros(embedding_dim)

    article_vc_representation = np.zeros([len(news_article), embedding_dim])

    for position, word in enumerate(news_article):
        if word not in word_vector_list:
            continue
        vector = word_vector_list[word]
        # An empty array means the vector did not load correctly.
        if vector.shape == (0,):
            continue
        article_vc_representation[position] = vector

    return np.mean(article_vc_representation, axis=0)



def transform_to_vec(news_articles:pd.Series,word_vector_list:gensim.models.keyedvectors.KeyedVectors|dict, embedding_dim:int) -> list:
    '''
    Represents every article by the mean of the embedding vectors of its words.

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of news articles.
    word_vector_list: gensim.models.keyedvectors.KeyedVectors or dict
            Mapping that stores embedding vectors for corresponding words.
    embedding_dim: int
            Dimension of embedding vectors of given dataset.

    Returns
    -------
    list
        List of vectors with the same length as the input Series, one mean vector per article.
    '''
    article_vectors = []
    for article in news_articles:
        article_vectors.append(mean_vector_text(article, word_vector_list, embedding_dim))
    return article_vectors

In [None]:
def idenity_function(news_articles: pd.Series,embedding_dim:int)->tuple[pd.Series,int]:
    '''
    Pass-through helper returning the articles untouched together with a zero.

    It lets callers treat the "no transformation" case like the other
    functions that return two values.

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of news articles.
    embedding_dim: int
            Dimension of embedding vectors of given dataset. Ignored.

    Returns
    -------
    pd.Series
        The very same series that was passed in.
    int
        Always zero.
    '''
    placeholder = 0
    return (news_articles, placeholder)

In [None]:
def idenity_function_text_tn(news_articles: pd.Series,lang:str,embedding:bool)->pd.Series:
    '''
    Pass-through helper with the same parameters as the text normalisation functions.

    Parameters
    ----------
    news_articles: pandas.Series
            Series (one-dimensional ndarray) of news articles.
    lang: str
            Language of the articles. Ignored.
    embedding: bool
            Embedding flag. Ignored.

    Returns
    -------
    pd.Series
        The very same series that was passed in.
    '''
    unchanged_articles = news_articles
    return unchanged_articles