In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import datetime

In [2]:
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# Tokenization and text-preprocessing helpers, adapted from https://www.kdnuggets.com/2018/03/text-data-preprocessing-walkthrough-python.html

In [3]:
def strip_html(text):
    """Return the plain text of `text` after parsing it as HTML.

    The string is parsed with BeautifulSoup's "html.parser" backend and the
    text content of the resulting tree is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Delete every square-bracketed span from `text`, brackets included.

    Each match starts at a '[' and runs to the first following ']'; the
    brackets and everything between them are replaced with the empty string.
    An opening bracket with no closing bracket is left untouched.

    Args:
        text: The string to clean.

    Returns:
        `text` with all '[...]' spans removed.
    """
    # Raw string: '\[' and '\]' are regex escapes, not Python string escapes.
    # In a plain string literal they are invalid escape sequences, which
    # newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)

def replace_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in `words`.

    Each token is NFKD-normalised first, then encoded to ASCII with
    unencodable characters dropped. A token made up entirely of such
    characters becomes the empty string and is kept in the result, so the
    output always has the same length as the input.
    """
    return [
        unicodedata.normalize('NFKD', token)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        for token in words
    ]

def to_lowercase(words):
    """Lowercase every token in `words` except those already fully uppercase.

    A token that equals its own `.upper()` form (e.g. 'AIDS', 'CD4', 'I',
    '2008', '!') is passed through unchanged; every other token is replaced
    by its `.lower()` form.
    """
    return [
        token if token == token.upper() else token.lower()
        for token in words
    ]

def remove_punctuation(words):
    """Strip punctuation from each token, keeping standalone '!' and '?'.

    Tokens that are exactly '!' or '?' pass through untouched. For every
    other token, all characters that are neither word characters nor
    whitespace are removed; tokens that end up empty are dropped.
    """
    preserved = ('!', '?')
    cleaned = []
    for token in words:
        stripped = token if token in preserved else re.sub(r'[^\w\s]', '', token)
        if stripped:
            cleaned.append(stripped)
    return cleaned

def replace_numbers(words):
    """Spell out every all-digit token in `words`.

    Tokens for which `str.isdigit()` is true are replaced by the result of
    inflect's `number_to_words`; all other tokens are kept as they are.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words, keep=()):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: Iterable of string tokens.
        keep: Optional iterable of words that must be retained even if they
            appear in the stop-word list. By default nothing is exempted,
            which matches the historical behaviour of this function; pass
            e.g. ``keep=('no', 'not')`` to preserve negations.

    Returns:
        A new list with the tokens of `words` that are not stop words,
        in their original order.
    """
    # Fetch the stop-word list once per call instead of once per token, and
    # use a set so each membership test is a hash lookup rather than a scan.
    stop_set = set(stopwords.words('english')) - set(keep)
    return [word for word in words if word not in stop_set]

def stem_words(words):
    """Return the Lancaster stem of every token in `words`, in order."""
    to_stem = LancasterStemmer().stem
    return [to_stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize every token in `words`, treating each one as a verb.

    Uses a single WordNetLemmatizer with ``pos='v'`` for all tokens and
    returns the lemmas in input order.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def preprocess_review(text):
    """Turn a raw review string into a list of cleaned tokens.

    String-level cleaning runs first (HTML stripping, removal of bracketed
    spans, contraction expansion), then the text is tokenized and the
    token-level steps run in order: lowercasing, non-ASCII removal, verb
    lemmatization, punctuation removal. Number replacement, stop-word
    removal and stemming are defined in this file but not applied here.
    """
    string_steps = (strip_html, remove_between_square_brackets, replace_contractions)
    token_steps = (to_lowercase, remove_non_ascii, lemmatize_verbs, remove_punctuation)

    for step in string_steps:
        text = step(text)

    tokens = word_tokenize(text)
    for step in token_steps:
        tokens = step(tokens)
    return tokens

def get_month(date):
    """Return everything in `date` before its first space.

    Used to pull the month out of the dataset's date strings; this assumes
    the month is the leading space-separated token.
    """
    first_token, _, _ = date.partition(' ')
    return first_token

def get_year(date):
    """Return everything in `date` after its last space.

    Used to pull the year out of the dataset's date strings; this assumes
    the year is the trailing space-separated token.
    """
    _, _, last_token = date.rpartition(' ')
    return last_token

In [7]:
# Load the raw tab-separated train and test splits.
train_data = pd.read_csv('../raw_data/drugsComTrain_raw.tsv', sep='\t')
test_data = pd.read_csv('../raw_data/drugsComTest_raw.tsv', sep='\t')

# Give the first column of each split the name 'identifier'.
train_data.columns = ['identifier', *train_data.columns[1:]]
test_data.columns = ['identifier', *test_data.columns[1:]]

# Derive 'year' and 'month' columns from the 'date' strings.
train_data['year'] = train_data['date'].apply(get_year)
test_data['year'] = test_data['date'].apply(get_year)
train_data['month'] = train_data['date'].apply(get_month)
test_data['month'] = test_data['date'].apply(get_month)

# Tokenize and clean every review; each entry becomes a list of tokens.
cleaned_reviews_train = train_data['review'].apply(preprocess_review)
train_data['cleaned_review'] = cleaned_reviews_train
cleaned_reviews_test = test_data['review'].apply(preprocess_review)
test_data['cleaned_review'] = cleaned_reviews_test

# Persist the augmented frames.
train_data.to_csv('local_data/alec_train.csv')
test_data.to_csv('local_data/alec_test.csv')

In [10]:
# Spot-check: raw text of the training review at index label 20.
train_data.loc[20, 'review']

'"Spring of 2008 I was hospitalized with pnuemonia and diagnosed with Lyme diease and full blown AIDS with CD4 count of &quot;11&quot; viral load some number so high in the millions I could never remember. I was taking Combivir and Kaletra with Dapsone for the 1st year then it stopped working. I started Kaletra with the Dapsone my CD4 count is now 209 and rising. For a few weeks I was very aggressive and broke all my dishes in the house LOL. I take vitamin supplements and drink a boost pluz every day. LIfe is good now!"'

In [11]:
# Spot-check: token list produced by preprocess_review for the same review.
train_data.loc[20, 'cleaned_review']

['spring',
 'of',
 '2008',
 'I',
 'be',
 'hospitalize',
 'with',
 'pnuemonia',
 'and',
 'diagnose',
 'with',
 'lyme',
 'diease',
 'and',
 'full',
 'blow',
 'AIDS',
 'with',
 'CD4',
 'count',
 'of',
 '11',
 'viral',
 'load',
 'some',
 'number',
 'so',
 'high',
 'in',
 'the',
 'millions',
 'I',
 'could',
 'never',
 'remember',
 'I',
 'be',
 'take',
 'combivir',
 'and',
 'kaletra',
 'with',
 'dapsone',
 'for',
 'the',
 '1st',
 'year',
 'then',
 'it',
 'stop',
 'work',
 'I',
 'start',
 'kaletra',
 'with',
 'the',
 'dapsone',
 'my',
 'CD4',
 'count',
 'be',
 'now',
 '209',
 'and',
 'rise',
 'for',
 'a',
 'few',
 'weeks',
 'I',
 'be',
 'very',
 'aggressive',
 'and',
 'break',
 'all',
 'my',
 'dish',
 'in',
 'the',
 'house',
 'LOL',
 'I',
 'take',
 'vitamin',
 'supplement',
 'and',
 'drink',
 'a',
 'boost',
 'pluz',
 'every',
 'day',
 'life',
 'be',
 'good',
 'now',
 '!']