## Text_Preprocessing

- word tokenisation
- lowercasing
- removal of special characters & punctuation
- removal of numbers
- removal of stop words (optional; NLTK or spaCy stop-word list)

In [1]:
import pandas as pd
import numpy as np
from os import mkdir
from class_corpus_prepare import corpus_prepare, preprocessing_text
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /Users/jlee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jlee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def apply_preprocessing(data, remove_stopwords = False):
    ''' Apply the preprocessing steps to every text in `data` by
        calling preprocessing_text class methods.

        data: iterable of raw text strings.
        remove_stopwords: a falsy value (default False) skips stop-word
            removal; 'nltk' or 'spacy' is passed on to
            preprocessing_text.remove_stopwords as its second argument.

        Returns a list with one string per input text: the remaining
        tokens joined by single spaces.

        Raises ValueError for any other truthy `remove_stopwords`
        (e.g. True or 'NLTK'), which used to be ignored silently.
    '''
    # validate the option before doing any work
    if remove_stopwords and remove_stopwords not in ('nltk', 'spacy'):
        raise ValueError(
            "remove_stopwords must be False, 'nltk' or 'spacy', got {!r}"
            .format(remove_stopwords))

    # tokenisation
    token_data = [word_tokenize(text) for text in data]
    prepro = preprocessing_text()
    # remove punctuations
    token_processed = [prepro.remove_punctuations(tokens) for tokens in token_data]
    # remove numbers
    token_processed = [prepro.remove_numbers(tokens) for tokens in token_processed]
    # remove stopwords (only when a stop-word list was requested)
    if remove_stopwords:
        token_processed = [prepro.remove_stopwords(tokens, remove_stopwords)
                           for tokens in token_processed]
    # detokenise it to strings of words for saving it for df and csv
    text_processed = [' '.join(tokens) for tokens in token_processed]

    return text_processed

In [3]:
def save_as_df(X, y, random_state = None):
    ''' Apply different types of preprocessing
        and store the processed texts in a dataframe.

        X: iterable of raw text strings.
        y: labels, one per text, in the same order as X.
        random_state: optional seed forwarded to DataFrame.sample so the
            row shuffle can be reproduced; None (default) keeps the
            shuffle random.

        Returns a shuffled dataframe with the columns 'standard',
        'stopwords_nltk', 'stopwords_spacy' and 'label', and a fresh
        0..n-1 index.
    '''
    # materialise once so a one-shot iterable survives the three passes
    texts = list(X)

    df_processed = pd.DataFrame({
        'standard': apply_preprocessing(texts),
        'stopwords_nltk': apply_preprocessing(texts, remove_stopwords = 'nltk'),
        'stopwords_spacy': apply_preprocessing(texts, remove_stopwords = 'spacy'),
    })

    # The text columns are positional lists. Assigning a pandas Series
    # would instead align on its index, which mismatches labels (or
    # yields NaN) when y carries a non-default index, so use its
    # values positionally.
    labels = y.to_numpy() if isinstance(y, pd.Series) else y
    df_processed['label'] = labels

    # shuffle rows of each df
    df_processed = df_processed.sample(frac = 1, random_state = random_state)
    df_processed = df_processed.reset_index(drop = True)

    return df_processed

In [4]:
# Create the output folder; tolerate an existing one so the cell can be re-run.
try:
    mkdir('./processed_text_df')
except FileExistsError:
    pass
article_list = ['2', '3', '5', '6', '8', '10', '11', '13', '14']

# preprocess texts from each article
for i in article_list:
    art = pd.read_csv('./articles/article_{}'.format(i))
    prepare = corpus_prepare(art)
    prepare.remove_missing_case() ## remove 'unavailable' cases
    train_x, test_x, train_y, test_y = prepare.train_test_split() ## split dataset into trainset(balanced) testset(keep real ratio)
    print("number of training cases :{} for article_{}".format(len(train_x), i))
    print("number of test cases:{} for article_{}".format(len(test_x), i))

    ## apply preprocessing & save it as df
    train_df = save_as_df(train_x, train_y)
    test_df = save_as_df(test_x, test_y)

    ## save df to csv (one train file and one test file per article)
    train_df.to_csv('./processed_text_df/train_article_{}.csv'.format(i), index = False)
    test_df.to_csv('./processed_text_df/test_article_{}.csv'.format(i), index = False)
    

number of training cases :154 for article_2
number of test cases:67 for article_2
number of training cases :466 for article_3
number of test cases:204 for article_3
number of training cases :386 for article_5
number of test cases:191 for article_5
number of training cases :1152 for article_6
number of test cases:667 for article_6
number of training cases :616 for article_8
number of test cases:129 for article_8
number of training cases :262 for article_10
number of test cases:67 for article_10
number of training cases :66 for article_11
number of test cases:27 for article_11
number of training cases :200 for article_13
number of test cases:177 for article_13
number of training cases :336 for article_14
number of test cases:40 for article_14
