In [1]:
import pandas as pd
import unidecode
import os

import nltk
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [2]:
# Data locations, expressed relative to the notebook's working directory.
path_to_data = "../data/"

# Sub-folder of the data directory that holds the external reference files.
path_to_external_data = os.path.join(
    path_to_data,
    "external-data/",
)

In [3]:
# Maps a two-letter language code to the English name of the language.
# clean_medra uses the name as the Excel sheet name and, lower-cased,
# as the SnowballStemmer language.
language_dict = {
    'fr': 'French',
    'en': 'English',
    'es': 'Spanish',
    'pt': 'Portuguese',
}

In [4]:
def clean_medra(language_code):
    """Clean one language sheet of the MedDRA extract and save it as a CSV file.

    The sheet matching ``language_code`` is read from the multilingual MedDRA
    workbook located in ``<path_to_external_data>/MedRa``. Normalised and
    stemmed variants of the term columns are derived, and the resulting table
    is written to ``<path_to_external_data>/MedRa/medra_lang/medra_<code>.csv``.

    Columns modified or added:

    - ``pt_name`` (overwritten): accents removed, apostrophes and spaces
      replaced by underscores (it is used as a column name later on).
    - ``clean_llt_name``: ``llt_name`` with accents removed and apostrophes
      replaced by spaces.
    - ``stemmed_llt_name`` / ``stemmed_clean_llt_name``: Snowball-stemmed
      tokens of the corresponding column, joined by single spaces.
    - ``llt_name_as_col``: ``clean_llt_name`` with spaces replaced by
      underscores.

    Parameters
    ----------
    language_code : str
        A key of ``language_dict`` (e.g. ``'fr'``). The associated language
        name selects both the Excel sheet and the stemmer language.

    Returns
    -------
    None
        The result is only written to disk.

    Raises
    ------
    KeyError
        If ``language_code`` is not a key of ``language_dict``.
    """
    # Resolve the language up front. With ``.get`` an unknown code yields None,
    # and ``sheet_name=None`` makes read_excel load *every* sheet as a dict,
    # which then fails much later with an unrelated-looking error.
    try:
        language_name = language_dict[language_code]
    except KeyError:
        raise KeyError(
            "Unknown language code {!r}; expected one of {}".format(
                language_code, sorted(language_dict)
            )
        ) from None

    medra_dir = os.path.join(path_to_external_data, 'MedRa')
    medra = pd.read_excel(
        os.path.join(medra_dir, 'WB_Extract_covid_MedDRA_multilangue_2020_07_02.xlsx'),
        sheet_name=language_name,
    )

    # Clean column 'pt_name' (used as column name later - no spaces, no accents).
    # regex=False: these are literal replacements, and the default value of
    # ``regex`` differs between pandas versions.
    medra['pt_name'] = (
        medra['pt_name']
        .apply(unidecode.unidecode)
        .str.replace("'", " ", regex=False)
        .str.replace(' ', '_', regex=False)
    )

    # Cleaning of column llt_name: remove accents and apostrophes.
    medra['clean_llt_name'] = (
        medra['llt_name']
        .apply(unidecode.unidecode)
        .str.replace("'", " ", regex=False)
    )

    # Stem both the raw and the cleaned term names, token by token.
    stemmer = SnowballStemmer(language_name.lower())

    def stem_text(text):
        # NOTE(review): word_tokenize is called without a language argument,
        # so the same tokenizer default is used for every sheet - confirm this
        # is intended for the non-English languages.
        return ' '.join(stemmer.stem(word) for word in word_tokenize(text))

    for col in ['llt_name', 'clean_llt_name']:
        medra['stemmed_' + col] = medra[col].apply(stem_text)

    # Variant of the cleaned llt_name without spaces or accents, to be used as
    # a Spark column name.
    medra['llt_name_as_col'] = medra['clean_llt_name'].str.replace(' ', '_', regex=False)

    # Make sure the destination folder exists so to_csv does not fail on a
    # fresh checkout.
    output_dir = os.path.join(medra_dir, 'medra_lang')
    os.makedirs(output_dir, exist_ok=True)
    medra.to_csv(os.path.join(output_dir, 'medra_' + str(language_code) + '.csv'), index=False)

In [5]:
# Produce one cleaned CSV per supported language.
for language_code in language_dict:
    clean_medra(language_code)