In [1]:
import os
import re
import string
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import nltk
import torchtext
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## Data preparation

In [2]:
# Load the two-column TSV as (name, text). Values stay raw strings
# (dtype=object) and NA detection is off (na_filter=False).
df = pd.read_csv(
    '../inputs/selected_attributes.tsv.gz',
    sep='\t',
    names=['name', 'text'],
    na_filter=False,
    dtype=object,
    encoding='utf-8',
)
print(df.shape)

(2095358, 2)


In [3]:
# Count rows per distinct attribute `name`; the resulting frame has one row per
# name and a single 'text' column holding the count.
classes=df.groupby(by=["name"]).count()
classes.shape

(42, 1)

In [4]:
# Order the attribute names from most to least frequent (by their 'text' count).
classes_sorted = classes.sort_values(by='text', ascending=False)

In [5]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists read
# into `stopset`, and the 'punkt' data (presumably needed by `word_tokenize`).
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/juno/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/juno/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# English stop words as a set; `tokenize` tests membership against it when
# called with stops=True.
stopset = set(stopwords.words('english'))

### Replacing numbers with placeholder tokens to reduce dimensions

In [7]:
# Placeholder label -> compiled patterns for the numeric literal shapes it
# replaces. Raw strings are used so the regex escapes (\d, \.) reach the regex
# engine as written instead of relying on Python tolerating unknown string
# escapes. Dict order matters: the first label whose pattern matches wins.
matches = {
    # Signed decimals such as "3.14", "-0.5", "+.25".
    'numeric': [re.compile(r'^[-+]*\d+\.\d+$'), re.compile(r'^[-+]*\.\d+$')],
    # Signed integers such as "42", "-7".
    'number': [re.compile(r'^[-+]*\d+$')],
    # Scientific notation with a decimal mantissa such as "1.5e10", "-2.0E-3".
    # The dot is escaped so it only matches a literal decimal point.
    'float': [re.compile(r'^[-+]?\d+\.\d+[eE][-+]?\d+$')],
}


def translate_if_number(t):
    """Replace a token that is entirely a numeric literal with a placeholder label.

    Args:
        t: A single token (string).

    Returns:
        The first key of ``matches`` ('numeric', 'number' or 'float') whose
        pattern matches the whole token, or ``t`` unchanged when no pattern
        matches. Tokens that merely contain digits (e.g. "150m",
        "october2017") and exponent forms without a decimal point
        (e.g. "123e5") are returned unchanged.
    """
    # Cheap pre-filter: a token without any digit cannot match any pattern.
    if not re.search(r'\d', t):
        return t
    for label, patterns in matches.items():
        if any(pattern.match(t) for pattern in patterns):
            return label
    return t

In [8]:
def translate(lex, hooks):
    """Run ``lex`` through each callable in ``hooks``, in order.

    Each hook receives the previous hook's return value; the value returned by
    the last hook is the result. With no hooks, ``lex`` is returned unchanged.
    """
    current = lex
    for hook in hooks:
        current = hook(current)
    return current

### Tokenizers for various cases

In [9]:
def tokenize(text, tokenizer, translators, lc: bool, stops: bool):
    """Tokenize ``text`` and return the kept tokens joined by single spaces.

    Args:
        text: Input string handed to ``tokenizer``.
        tokenizer: Callable returning an iterable of string tokens.
        translators: Callables applied in order to every kept token
            (via ``translate``).
        lc: When true, lowercase each token before any filtering.
        stops: When true, drop tokens found in the module-level ``stopset``.

    Returns:
        A single space-joined string of the surviving, translated tokens.
    """
    kept = []
    for raw in tokenizer(text):
        token = raw.lower() if lc else raw
        # `stopset` is only consulted when stop-word removal is requested.
        is_stopword = stops and token in stopset
        # Substring test against the punctuation string: single punctuation
        # characters are dropped, and so are runs that occur contiguously in
        # string.punctuation (e.g. "()").
        if is_stopword or token in string.punctuation:
            continue
        kept.append(translate(token, translators))
    return ' '.join(kept)

def get_tokenizers(tokenizer, translators, lc: bool, stops: bool):
    """Build a one-argument tokenizer with the given settings bound in.

    The returned callable takes ``text`` and forwards it to ``tokenize``
    together with the ``tokenizer``, ``translators``, ``lc`` and ``stops``
    values captured here.
    """
    def _bound_tokenize(text):
        return tokenize(text, tokenizer, translators, lc, stops)

    return _bound_tokenize

# One entry per tokenizer variant: (registry key, lowercase?, drop stop words?).
tokenizer_variants = (
    ('nltk-word-cased', False, False),
    ('nltk-word-lowcase', True, False),
    ('nltk-word-lowcase-stops', True, True),
)

# Registry of ready-to-use tokenizers, all built on NLTK's word_tokenize with
# numeric literals replaced through translate_if_number.
custom_tokenizers = {}
for n, lc, stops in tokenizer_variants:
    custom_tokenizers[n] = get_tokenizers(word_tokenize, [translate_if_number], lc, stops)

In [10]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (such as `splitter`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# `parallelize` is defined in the `splitter` module, which is not shown here.
from splitter import parallelize 

### For the current exercise we will use the NLTK word tokenizer with lowercasing, keeping stop words, dropping punctuation, and converting standalone numeric literals to placeholder words such as "number"

In [11]:
def convert_row(row):
    """Tokenize ``row.text`` with the 'nltk-word-lowcase' tokenizer.

    Looks the tokenizer up in ``custom_tokenizers`` and returns the
    space-joined token string it produces for the row's ``text`` attribute.
    """
    lowcase_tokenizer = custom_tokenizers['nltk-word-lowcase']
    return lowcase_tokenizer(row.text)

In [12]:
%%time
# Store the tokenized text in a new 'converted' column of df.
# NOTE(review): presumably `parallelize` applies convert_row to every row using
# 8 workers and returns results aligned with df's index — confirm in splitter.
df['converted'] = parallelize(df, convert_row, 8)

  return bound(*args, **kwds)


CPU times: user 558 ms, sys: 389 ms, total: 947 ms
Wall time: 38.1 s


In [13]:
# Spot-check a 200-row window (positions 150000-150199) of raw vs. converted text.
df.iloc[150000:150200]

Unnamed: 0,name,text,converted
150000,description_sam,nondiarrheal_control,nondiarrheal_control
150001,description_sam,These data have been produced as part of The 3...,these data have been produced as part of the n...
150002,description_sam,Internal Transcribed Spacer region,internal transcribed spacer region
150003,description_sam,low residual feed intake,low residual feed intake
150004,description_sam,wet soil,wet soil
...,...,...,...
150195,description_sam,Cruise: October2017; CTD Station: 55; Depth: 150m,cruise october2017 ctd station number depth 150m
150196,description_sam,These data have been produced as part of The 3...,these data have been produced as part of the n...
150197,description_sam,rectal swab,rectal swab
150198,description_sam,Texel Ewe mum Abomasum,texel ewe mum abomasum


### Deduplicating and saving for the next steps

In [15]:
# Keep only the attribute name and its converted text, then drop exact
# duplicate (name, converted) pairs and renumber the index from 0.
df_converted_deduped = (
    df.loc[:, ['name', 'converted']]
    .drop_duplicates(ignore_index=True)
)

In [16]:
# Write the deduplicated pairs as a tab-separated file with no header row and
# no index column.
df_converted_deduped.to_csv(
    '../preprocessed/attribute.seqs.tsv',
    sep='\t',
    index=False,
    header=False,
)