In [1]:

# This program cleans the WeeBit dataset.
# It produces CSV output containing the cleaned data.

# 1) Conversion to a pandas DataFrame
# 2) Cleaning
# 3) Selection of a subset of the level-4 texts
# 4) Train and test set
# 5) Saving to CSV
import pandas as pd
import os

# Detección de lenguaje


from langdetect import detect_langs

#  train-test split
from sklearn.model_selection import train_test_split


#  PANDAS DATAFRAME
# Unpack the WeeBit archive. This statement sits at module level, so it runs
# as soon as the file is executed or imported, before main() is called.
# "WeeBit.rar" is a relative path, i.e. it is resolved against the current
# working directory.
import patoolib
patoolib.extract_archive("WeeBit.rar")


def get_weebit_as_dataframe(dataset_path="./WeeBit/", levels=(0, 1, 2, 3, 4)):
    """Load the WeeBit texts from disk into a pandas DataFrame.

    The dataset directory is expected to contain one sub-directory per
    readability level, named after the level (``0``, ``1``, ...), each
    holding one text per file.

    Args:
        dataset_path: Root directory of the extracted dataset. Defaults to
            ``"./WeeBit/"``.
        levels: Readability levels (sub-directory names) to load. Defaults
            to the five WeeBit levels 0-4.

    Returns:
        A DataFrame with columns ``Text`` (the full file content) and
        ``Level`` (the level whose directory the file was read from).

    Raises:
        OSError: If a level directory is missing or a file cannot be read.
    """
    rows = []
    for level in levels:
        level_dir = os.path.join(dataset_path, str(level))
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order (and therefore later duplicate removal) reproducible.
        for file_name in sorted(os.listdir(level_dir)):
            file_path = os.path.join(level_dir, file_name)
            # Skip sub-directories and other non-regular entries.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='latin-1') as txt_file:
                # Texts are small, so each file is read in one go.
                rows.append([txt_file.read(), level])

    return pd.DataFrame(rows, columns=['Text', 'Level'])


#Limpieza


def _get_english_prob(langs):
    """Return the probability attached to English in a detection result.

    Args:
        langs: Iterable of objects exposing ``lang`` and ``prob``
            attributes (this is what the function reads from each item).

    Returns:
        The ``prob`` of the entry whose ``lang`` is ``'en'``, or ``0.0``
        when no such entry is present.
    """
    prob_by_lang = dict((candidate.lang, candidate.prob) for candidate in langs)
    if 'en' in prob_by_lang:
        return prob_by_lang['en']
    return 0.0


def _remove_non_english(df, english_prob_threshold = 0.99):
    """Keep only the rows whose text is detected as English.

    Args:
        df: DataFrame with a ``Text`` column of strings.
        english_prob_threshold: A row is kept only when the English
            probability reported for its text is strictly greater than
            this value.

    Returns:
        A new DataFrame (a copy, safe to assign into) containing the rows
        that passed the threshold. The original index is preserved.

    NOTE(review): the langdetect README describes detection as
    non-deterministic unless ``DetectorFactory.seed`` is set; consider
    seeding it once at program start if reproducible filtering matters.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import section.
    from langdetect.lang_detect_exception import LangDetectException

    def _safe_english_prob(text):
        # detect_langs raises when a text has no usable features (e.g. only
        # punctuation or digits); such a text is treated as non-English
        # instead of aborting the whole run.
        try:
            return _get_english_prob(detect_langs(text))
        except LangDetectException:
            return 0.0

    english_probs = df['Text'].apply(_safe_english_prob).astype(float)
    return df[english_probs > english_prob_threshold].copy()



# Boilerplate fragments (browser / JavaScript / Flash notices, site
# disclaimers, copyright lines) that _remove_non_content_lines deletes from
# every text. Each entry is removed via str.replace, i.e. as a substring
# wherever it occurs, not only when it forms a whole line.
NON_CONTEXT_LINES = ['This page is best viewed in an up-to-date web browser with style sheets (CSS) enabled.',
                     'While you will be able to view the content of this page in your current browser, you will not be able to get the full visual experience.',
                     'Please consider upgrading your browser software or enabling style sheets (CSS) if you are able to do so.',
                     'The BBC is not responsible for the content of external internet sites.',
                     'For information on how to enable JavaScript please go to the',
                     'You will not be able to see this content until you have JavaScript switched on.',
                     'Your web browser does not have JavaScript switched on at the moment.',
                     'You have disabled Javascript, or are not running Javascript on this browser.',
                     # NOTE(review): these two short fragments are removed
                     # anywhere they appear, including inside ordinary
                     # sentences of real content -- confirm this is intended.
                     'Go to the',
                     'go to the',
                     # NOTE(review): the "enhanced version" sentence below is
                     # listed three times; the repeats look redundant.
                     'The enhanced version of the site requires the Flash 8 plugin (or higher) to be installed and JavaScript to be enabled on your browser.',
                     'To find out how to turn on JavaScript',
                     'The enhanced version of the site requires the Flash 8 plugin (or higher) to be installed and JavaScript to be enabled on your browser.',
                     'To find out how to install a Flash plugin,',
                     'The enhanced version of the site requires the Flash 8 plugin (or higher) to be installed and JavaScript to be enabled on your browser.',
                     # NOTE(review): 'conten.' looks like a typo for
                     # 'content.'; if so this entry never matches -- verify
                     # against the raw texts before changing it.
                     'Download the Adobe Flash player to view this conten.',
                     'All trademarks and logos are property of Weekly Reader Corporation.',
                     'measures published under license with MetaMetrics, Inc.']


def _remove_non_content_lines(text):
    """Delete every known boilerplate fragment from a text.

    Each entry of ``NON_CONTEXT_LINES`` is removed wherever it occurs as a
    substring, in list order.

    Args:
        text: The string to clean.

    Returns:
        The text without the boilerplate fragments, with leading and
        trailing whitespace stripped.
    """
    cleaned = text
    for boilerplate in NON_CONTEXT_LINES:
        if boilerplate not in cleaned:
            continue
        cleaned = cleaned.replace(boilerplate, '')
    return cleaned.strip()


def clean_weebit(df):
    """Clean the raw WeeBit DataFrame.

    Steps, in order:
      1. drop rows with missing values and make sure ``Text`` is ``str``;
      2. turn line breaks into sentence separators (``". "``);
      3. drop empty texts and exact duplicate texts (first one is kept);
      4. drop texts not detected as English;
      5. strip known boilerplate fragments from each text;
      6. drop texts that consisted of boilerplate only and are now empty.

    Args:
        df: DataFrame with at least a ``Text`` column. It is not modified.

    Returns:
        A new, cleaned DataFrame with a fresh 0..n-1 index.
    """
    # Drop missing values BEFORE casting to str (afterwards NaN would have
    # become the literal string 'nan'), and copy so the caller's frame is
    # left untouched.
    df = df.dropna().copy()
    df['Text'] = df['Text'].astype(str)

    # regex=False: the patterns are literal text. With regex matching (the
    # default before pandas 2.0) the '.' in ".\n" matches ANY character and
    # would swallow the last character of every line.
    df['Text'] = df['Text'].str.replace(".\n", ". ", regex=False)
    df['Text'] = df['Text'].str.replace("\n", ". ", regex=False)

    df = df[df['Text'].str.len() != 0]

    df = df.drop_duplicates("Text")

    # Copy the filtered result so the assignment below writes to an
    # independent frame rather than to a slice.
    df = _remove_non_english(df).copy()

    df['Text'] = df['Text'].apply(_remove_non_content_lines).astype(str)

    # Texts made up solely of boilerplate are empty at this point.
    df = df[df['Text'].str.len() != 0]

    return df.reset_index(drop=True)





def level_4_undersampling(df, n_level4 = 800, random_state = None):
    """Randomly reduce the level-4 class to at most ``n_level4`` texts.

    Rows of every other level are kept unchanged and come first in the
    result; the sampled level-4 rows are appended after them.

    Args:
        df: DataFrame with a ``Level`` column.
        n_level4: Maximum number of level-4 rows to keep. If fewer level-4
            rows exist, all of them are kept (in shuffled order) instead of
            raising an error.
        random_state: Optional seed (or random state object) forwarded to
            ``DataFrame.sample`` for a reproducible selection. ``None``
            keeps the sampling unseeded.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``n_level4`` is negative (raised by pandas).
    """
    level4 = df[df.Level == 4]
    others = df[df.Level != 4]

    # Never ask for more rows than are available: sampling without
    # replacement would raise.
    n_keep = min(n_level4, len(level4))
    if n_keep:
        level4 = level4.sample(n=n_keep, random_state=random_state)
    else:
        # Nothing to keep; avoid sampling from a possibly empty frame.
        level4 = level4.iloc[:0]

    return pd.concat([others, level4]).reset_index(drop=True)


 


# Fraction of the dataset passed to train_test_split as test_size in main().
TEST_SIZE = 0.2
# Output paths (relative to the working directory) written by main():
# the full cleaned dataset, the training split and the test split.
DATASET_CSV = "weebit.csv"
TRAIN_SET_CSV = "weebit_train.csv"
TEST_SET_CSV = "weebit_test.csv"


def main():
    """Run the whole pipeline: load, clean, undersample, split and save.

    Writes three CSV files (full dataset, train split, test split) to the
    paths given by DATASET_CSV, TRAIN_SET_CSV and TEST_SET_CSV, printing a
    progress message after each stage and the final per-level counts.
    """
    dataset = get_weebit_as_dataframe()
    print("Read WeeBit as dataframe.")

    dataset = clean_weebit(dataset)
    print("Cleaned the dataset.")

    dataset = level_4_undersampling(dataset)
    print("Undersampled level 4 class.")

    # Stratify on the level so both splits keep the class proportions.
    split_parts = train_test_split(
        dataset,
        test_size=TEST_SIZE,
        shuffle=True,
        stratify=dataset.Level,
    )
    train_part, test_part = (part.reset_index(drop=True) for part in split_parts)
    print("Split into train and test set.")

    outputs = (
        (dataset, DATASET_CSV),
        (train_part, TRAIN_SET_CSV),
        (test_part, TEST_SET_CSV),
    )
    for frame, csv_path in outputs:
        frame.to_csv(csv_path, encoding='utf-8')
    print("Saved to csv.")

    print("Final dataset is:")
    print(dataset.Level.value_counts())

# Run the pipeline only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

patool: Extracting WeeBit.rar ...
patool: running "C:\Program Files\WinRAR\rar.EXE" x -- C:\Users\susi0\TFM\WeeBit.rar
patool:     with cwd=.\Unpack_07nkpknv
patool: ... WeeBit.rar extracted to `WeeBit'.
Read WeeBit as dataframe.
Cleaned the dataset.
Undersampled level 4 class.
Split into train and test set.
Saved to csv.
Final dataset is:
4    800
2    798
1    788
3    643
0    610
Name: Level, dtype: int64
