# 2. Data Manipulation
## 2.1. Tokenization

In [9]:
%run "2.0. DataManipulation_Functions.ipynb"

In [10]:
import os
import spacy

In [11]:
# Load the spaCy trained pipeline for Russian (https://spacy.io/models/ru).
# `nlp` is used as a module-level global by tokenization_pipe and
# tokenization_pipe_title below.
# NOTE(review): presumably the "ru_core_news_sm" model package has to be
# installed in the environment before this cell runs — confirm.
nlp = spacy.load("ru_core_news_sm")

In [12]:
# Data directory: passed to load_clean_scraped_data in the "Data loading" cell;
# it is also the kind of value df_scraped_tokenized_to_csv expects as `folder`.
folder = 'data'

## Useful functions

In [13]:
def tokenization_pipe(df, column="body"):
    """
    Lemmatize one text column of a dataframe with the spaCy pipeline `nlp`.

    Stop words and punctuation are ignored. Every remaining token is
    represented by its lemma (`token.lemma_`), not by its surface form.

    Args:
        df: a cleaned dataframe, e.g. df_scraped, that contains `column`.
        column: name of the text column to process. Defaults to "body",
            the column this function always used before the parameter existed.
    Returns:
        A list of lists of lemmas. Each nested list corresponds to one row
        of the dataframe df.
    """
    # nlp.pipe yields one Doc per input text, in input order, so the outer
    # list stays aligned with the rows of df.
    return [
        [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]
        for doc in nlp.pipe(df[column])
    ]


In [14]:
def tokenization_pipe_title(df):
    """
    Lemmatize the "title" column of a dataframe with the spaCy pipeline `nlp`.

    Stop words and punctuation are ignored. Every remaining token is
    represented by its lemma (`token.lemma_`), not by its surface form.

    Args:
        df: a cleaned dataframe, e.g. df_scraped, with a "title" column.
    Returns:
        A list of lists of lemmas. Each nested list corresponds to one row
        of the dataframe df.
    """
    # nlp.pipe yields one Doc per input text, in input order, so the outer
    # list stays aligned with the rows of df.
    return [
        [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]
        for doc in nlp.pipe(df["title"])
    ]

In [15]:
def get_df_scraped_tokenized(df):
    """Add the tokenized body and title of each news item to df_scraped.

    Two columns are created: 'body_token_final' (from tokenization_pipe) and
    'title_token_final' (from tokenization_pipe_title). The columns are
    assigned on the dataframe that was passed in, so the caller's dataframe
    is modified and the same object is returned.

    args
        df:: df_scraped, the scraped dataframe
    return
        df:: df_scraped with the two additional token columns

    ***WARNING***
    Running this function takes a lot of time, e.g. 8 hours.
    """
    # Body is processed first, then title. Recorded running time for the
    # body step: 485 minutes and 38.5 sec; the title step was never timed.
    column_builders = (
        ('body_token_final', tokenization_pipe),
        ('title_token_final', tokenization_pipe_title),
    )
    for new_column, tokenize in column_builders:
        df[new_column] = tokenize(df)

    return df


In [16]:
def df_scraped_tokenized_to_csv(folder, df):
    """Tokenize the scraped dataframe and export it to a csv file.

    The file is written to 'df_scraped_token.csv' inside `folder`.

    args
        folder:: the directory where the csv is written; created if missing
        df:: df scraped; it is passed through get_df_scraped_tokenized first

    ***WARNING***
    The tokenization step is slow (its own docstring mentions about 8 hours).
    """
    # Create the output directory *before* tokenizing, so a missing folder
    # cannot make the export fail after hours of computation. An empty
    # `folder` means "current directory" for os.path.join and needs no mkdir.
    if folder:
        os.makedirs(folder, exist_ok=True)

    df = get_df_scraped_tokenized(df)
    # NOTE(review): the token columns hold Python lists; presumably they end
    # up in the csv as their string form and must be parsed back on reload —
    # confirm against the code that reads this file.
    df.to_csv(os.path.join(folder, 'df_scraped_token.csv'))

## Data loading

In [18]:
# Get scraped data and clean it.
# NOTE(review): load_clean_scraped_data is not defined in this notebook;
# presumably it comes from "2.0. DataManipulation_Functions.ipynb", which is
# executed with %run at the top — verify.
df = load_clean_scraped_data(folder)

## Tokenization

In [None]:
# WARNING: long-running cell — the body tokenization alone was timed at about
# 485 minutes (see get_df_scraped_tokenized).
# The token columns are assigned on `df` itself and that same dataframe is
# returned, so df_tokenized and df refer to the same object afterwards.
df_tokenized = get_df_scraped_tokenized(df)