In [15]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 150)
import spacy
print('The spaCy version is {}.'.format(spacy.__version__))
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
import contractions
import re

import bert

The spaCy version is 2.1.6.


## Load and filter data

In [28]:
n = 1000
uif = pd.read_csv('./uif_data_100k.tsv', sep='\t')
print(uif.shape)
uif['PostTextInEnglish'].iloc[n]

(100000, 2)


'Code: 0x803fb005 |'

In [17]:
print (uif.columns)

Index(['Unnamed: 0', 'PostTextInEnglish'], dtype='object')


In [29]:
uif.rename(columns={'Unnamed: 0': 'Id','PostTextInEnglish':'verbatim'}, inplace=True)
print (uif.columns)

Index(['Id', 'verbatim'], dtype='object')


In [30]:
# Replace separator
uif['verbatim'] = uif['verbatim'].apply(lambda x: x.replace(" |", "."))

# Remove NULL lines
uif = uif[uif['verbatim']!='#NULL#']
uif.head()

Unnamed: 0,Id,verbatim
0,5059664,to.
2,4745123,bbc.com I keep on getting ads on this web site from Australia while I am logged into this site its absolutely so frustrating so how can I get rid ...
4,1816600,You don't hear.
5,10328111,Can t reach this page https://login.comcast.net/login?r=comcast.net.
6,3491640,My laptop is not communicating with the printer I get the message: Cannot communicate with the printer. Turn Enable bi-directional support on ...


In [46]:
uif.tail(100)

Unnamed: 0,Id,verbatim,word_count,verbatim_processed
99863,9891094,Code: 0x80072F78.,2,Code: 0x80072F78.
99865,4967024,Audio cannot be played even though audio driver is installed.,10,Audio cannot be played even though audio driver is installed.
99866,5956972,Birthday. Birthdays that were registered in people are too slow to propagate,12,Birthday. Birthdays that were registered in people are too slow to propagate
99867,9683155,"Code: 0x80070057. This download has been trying to complete itself for over a week! The rest are very, very, slow!",20,"Code: 0x80070057. This download has been trying to complete itself for over a week! The rest are very, very, slow!"
99868,6637733,"My Surface Book 2 is connected to Surface Dock via Ethernet on the company network. After about 4-6 hours, the Ethernet disappears and is no longe...",68,"My Surface Book 2 is connected to Surface Dock via Ethernet on the company network. After about 4-6 hours, the Ethernet disappears and is no longe..."
99869,2164214,"Bluetooth devices can be disconnected, although recognized as being related.",10,"Bluetooth devices can be disconnected, although recognized as being related."
99870,3825001,Bluetooth paired but unable to connect..,6,Bluetooth paired but unable to connect..
99871,9355201,Code: 0x80070422.,2,Code: 0x80070422.
99872,9715512,ACG [test] from NEBA-XDWEHG88 - Ignore.,6,ACG [test] from NEBA-XDWEHG88 - Ignore.
99873,6334109,ACG [test] from RR1WDGTEST0343 - Ignore.,6,ACG [test] from RR1WDGTEST0343 - Ignore.


In [49]:
print (uif.shape)
print (uif_clean.shape)
uif_clean.tail(100)


(70672, 4)
(58761, 4)


Unnamed: 0,Id,verbatim,word_count,verbatim_processed
99840,9630786,I haven't ben able to use my 11.62 Euros since you upgraded. I have 11.62 Euros which I needed and used to phone my cousins in Australia. I ca...,73,I have not ben able to use my 11.62 Euros since you upgraded. I have 11.62 Euros which I needed and used to phone my cousins in Australia. I c...
99842,2378303,Code: 0x80070422. does not install the programs error,8,Code: 0x80070422. does not install the programs error
99843,8323781,Useless. Problem was not identified.,5,Useless. Problem was not identified.
99844,5960357,"Online contacts - Skype screen size No online only contacts option, huge skype app - why take a good product and make it difficult yet again Mi...",27,"Online contacts - Skype screen size No online only contacts option, huge skype app - why take a good product and make it difficult yet again Mi..."
99846,2614588,ACG [test] from G-2323D0153 - Ignore.,6,ACG [test] from G-2323D0153 - Ignore.
99847,8827409,"app does not work properly. Security Center tells me that there is an app that doesn't work correctly, but does not tell me which. what should I d...",42,"app does not work properly. Security Center tells me that there is an app that does not work correctly, but does not tell me which. what should I ..."
99848,10369396,nothing can be heard. that not there volume or you can climb the volume as,15,nothing can be heard. that not there volume or you can climb the volume as
99852,8616137,This page is having a problem loading https://apps.facebook.com/papapear/?fb_source=bookmark. It wont load,11,This page is having a problem loading https://apps.facebook.com/papapear/?fb_source=bookmark. It will not load
99855,3498857,"Problems with birthdays calendars-birthday is missing, cannot add new birthday, etc",11,"Problems with birthdays calendars-birthday is missing, cannot add new birthday, etc"
99857,11223947,Code: 0X80131500. The win10 App Store won't open.,8,Code: 0X80131500. The win10 App Store will not open.


In [31]:
uif['word_count'] = uif['verbatim'].apply(lambda x: len(x.split()))
uif_clean = uif[uif['word_count'] > 3]
uif_clean.shape

(58761, 3)

## Clean and preprocess text

In [32]:
def preprocess(text):
    """
    This function performs preprocessing on a single feedback text string. It
    does the following operations:
    - expand contractions
    - convert to lowercase
    - drop all non-alpha characters
 
    INPUTS:
        text = str, feedback verbatim string
    OUTPUTS:
        text = str, preprocessed verbatim string
    """
    # get rid of #DELIM#
    text = text.replace("#DELIM#", "")
    # fix contractions
    text = contractions.fix(text)
    return text

In [36]:
uif_clean['verbatim_processed'] = uif_clean['verbatim'].apply(lambda verbatim: preprocess(verbatim))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [40]:
uif_clean.shape

(58761, 4)

## Sentence Tokenization

In [23]:
#import spacy
#!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz --no-deps
nlp = spacy.load("en_core_web_sm")

In [42]:
# custom tokenizer class
prefix_re = re.compile(r'''^[[("']''')
suffix_re = re.compile(r'''[])"']$''')
infix_re = re.compile(r'''[-~]''')
simple_url_re = re.compile(r'''^https?://''')

def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                                suffix_search=suffix_re.search,
                                infix_finditer=infix_re.finditer,
                                token_match=simple_url_re.match)

#nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)
doc = nlp(uif_clean['verbatim'].iloc[3])
print([t.text for t in doc])

['OneNote', 'does', 'not', 'open.']


In [43]:
# nlp.add_pipe(nlp.create_pipe('sentencizer'), before="parser")
uif_clean['text'] = uif_clean['verbatim_processed'].apply(lambda x: [sent.string.strip() for sent in nlp(x).sents])

KeyboardInterrupt: 

## Save to text file

In [53]:
with open('./uif_pretrain_100k_No_Space.txt', 'w') as f:
    for fb in uif_clean['verbatim_processed']:
        f.write("%s\n" % fb)
        #for sent in fb:
            #f.write("%s " % sent)
        #f.write("\n")

In [51]:
uif_clean.shape

(58761, 4)

In [35]:
print (uif.shape)

(70672, 4)


In [14]:
print (uif.shape)
print (type(uif['verbatim_processed']))
len(uif['text'].iloc[1])

(8243156, 4)


KeyError: 'text'