In [1]:
import sys

sys.path.append('../')

import re
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

from tqdm import tqdm
import time
import random


In [2]:
from src.symspellpy import SymSpell

In [3]:
MIN_LEN = 20
MAX_LEN = 400

In [4]:
train_input = pd.read_csv('../data/ALTA_2017/train_input.csv')
train_output = pd.read_csv('../data/ALTA_2017/train_output.csv')

In [5]:
train_input.head()

Unnamed: 0,id,original
0,0,'Gondoliers' By Teachers Colleae The Adelaide ...
1,1,"Man Cufc Spoilt Bmbti Of Cimw Hertm BrielloE,..."
2,2,OFFENSIVE NOISE WITH HOOTER Woman Motorist Fin...
3,3,PARIS TALKS BEFORE ROME MEETING Mr. Chamberlai...
4,4,REPORTS FROM RURAL CENTRES AVON An evening was...


In [6]:
train_input.original.iloc[0]

"'Gondoliers' By Teachers Colleae The Adelaide . Teachers' College will present its annual Gilbert and Sullivan onera season in the Unley Town ORix irom April £i to 30.- ??-..?;.. Under the direction of Mr. Alva Penrose, who will again conduct, the students will perform 'The Gondoliers' which was given in the Tivoli Theatre two years ago. There will be nine principals and a chorus of 48. Bookings will open at Cawthorne's on April 13."

In [7]:
train_output.head()

Unnamed: 0,id,solution
0,0,"""Gondoliers"" By Teachers College The Adelaide ..."
1,1,"Lion Cub Spoilt Baby of Circus Herts Briellos,..."
2,2,OFFENSIVE NOISE WITH HOOTER Woman Motorist Fin...
3,3,PARIS TALKS BEFORE ROME MEETING Mr. Chamberlai...
4,4,REPORTS FROM RURAL CENTRES AVON An evening was...


In [8]:
train_input.shape, train_output.shape

((6000, 2), (6000, 2))

In [9]:
train = pd.merge(train_input, train_output, how='inner')
train['solution'] = train.solution.apply(lambda x: re.sub('[^a-z0-9 ]', '', x.strip().lower()))
train['original'] = train.original.apply(lambda x: re.sub(r'[^a-z0-9 ]', '', x.strip().lower()))

In [10]:
kf = KFold(n_splits=5)

for train_index, val_index in kf.split(train):
    break

In [11]:
val = train.iloc[val_index]
train = train.iloc[train_index]

In [12]:
train['text_len'] = train.original.apply(lambda x: len(x.split()))
val['text_len'] = val.original.apply(lambda x: len(x.split()))

In [13]:
print (train.text_len.describe(), val.text_len.describe())

count    4800.000000
mean      468.942083
std       636.235791
min        17.000000
25%       113.000000
50%       241.000000
75%       574.000000
max      9261.000000
Name: text_len, dtype: float64 count    1200.000000
mean      439.170833
std       608.238687
min        20.000000
25%       110.000000
50%       215.000000
75%       536.500000
max      7808.000000
Name: text_len, dtype: float64


In [14]:
val = val[val.text_len < MAX_LEN].reset_index(drop=True)
print (val.shape)

(825, 4)


In [15]:
sym_spell = SymSpell()

#sym_spell.load_dictionary("../src/symspellpy/frequency_dictionary_en_82_765.txt",0,1," ")
sym_spell.create_dictionary("../src/symspellpy/frequency_dictionary_en_82_765.txt")       

../src/symspellpy/frequency_dictionary_en_82_765.txt


True

In [16]:
text = val.original.iloc[1]

result = sym_spell.word_segmentation(text)

In [17]:
result.corrected_string

'man cuff spoilt bambi of crime herm brillo us member of silvered cd rent with a lion cob born three weeks ago at karina photo was taken in a circus caravan at levels yesterday a lion cub which is being reared on a bottle by a woman member of silvers circus will be on show in adelaide soon the cub whose mother has neglected it since it was bomb at karina three weeks ago travels in its foster mothers lap when the circus moves from town to town and sleeps in her caravan the circus was formed in sydney only three years ago and is done south australia for the first time it has just completed a successful two month tour of say country districts the unit will be in marie lathi from april of to april of at the morpheus street bridge site and will be open nightly at a pin with matinees on saturdays wednesdays and holidays a feature of the programme will be gwen keillor down act in which she performs balancing acts on ropes left above the ground and with no nets there will be both overseas and 

In [18]:
text

'man cufc spoilt bmbti of cimw  hertm brielloe u member of silvere cjrcnt witb a lion cob born three weeks ago at kadina photo was taken in a circus caravan at clenels yesterday a lion cub which is being reared on a bottle by a woman member of silvers circus will be on show in adelaide soon the cub whose mother has neglected it since it was bom at kadina three weeks ago travels in its foster mothers lap when the circus moves from town to town and sleeps in her caravan the circus was formed in sydney only three years ago and is vidone south australia for the first time it has just completed a successful twomonth tour of sa country districts the unit will be in arielatrhi from april 12 to april 23 at the morphett street bridge site and will be open nightly at 8 pin with matinees on saturdays wednesdays and holidays a feature of the programme will be gwen kiellors down act in which she performs balancing acts on ropes 40 ft above the ground and with no nets there will be both overseas and

In [19]:
output = []

for i in tqdm(range(len(val))):
    output.append(sym_spell.word_segmentation(val.original.iloc[i]).corrected_string)

100%|██████████████████████████████████████████████████████████████████████████████| 825/825 [4:44:31<00:00, 20.69s/it]


In [20]:
val['predicted_text'] = output

In [21]:
import numpy as np
import  nltk.translate.bleu_score as bleu

In [22]:
def WRR(text1,text2):
    a = set(text1.lower().split())
    b = set(text2.lower().split())
    
    if (len(a) == 0) and (len(b) == 0):
        return .5
    
    c = a.intersection(b)
    return float(len(c))/(len(a) + len(b) - len(c))

def levenshtein(seq1, seq2):
    seq1 = seq1.lower()
    seq2 = seq2.lower()
    
    size_x = len(seq1) + 1
    size_y = len(seq2) + 1
    matrix = np.zeros ((size_x, size_y))
    for x in range(size_x):
        matrix [x, 0] = x
    for y in range(size_y):
        matrix [0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            if seq1[x-1] == seq2[y-1]:
                matrix [x,y] = min(
                    matrix[x-1, y] + 1,
                    matrix[x-1, y-1],
                    matrix[x, y-1] + 1
                )
            else:
                matrix [x,y] = min(
                    matrix[x-1,y] + 1,
                    matrix[x-1,y-1] + 1,
                    matrix[x,y-1] + 1
                )
    #print (matrix)
    return (matrix[size_x - 1, size_y - 1])

def CRR(text1, text2):
    try:
        return 1 - float(levenshtein(text1,text2))/max(len(text1),len(text2))
    except:
        return 0

def bleu_score(text1,text2):
    return bleu.sentence_bleu([text1.lower().split()],text2.lower().split())

In [23]:
val['WRR_1'] = val.apply(lambda x: WRR(x.original, x.solution), axis=1)
val['WRR_2'] = val.apply(lambda x: WRR(x.predicted_text, x.solution), axis=1)

#val['CRR_1'] = val.apply(lambda x: CRR(x.original, x.solution), axis=1)
#val['CRR_2'] = val.apply(lambda x: CRR(x.predicted_text, x.solution), axis=1)

val['BLEU_1'] = val.apply(lambda x: bleu_score(x.original, x.solution), axis=1)
val['BLEU_2'] = val.apply(lambda x: bleu_score(x.predicted_text, x.solution), axis=1)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [24]:
val[['WRR_1', 'WRR_2', 'BLEU_1', 'BLEU_2']].describe()

Unnamed: 0,WRR_1,WRR_2,BLEU_1,BLEU_2
count,825.0,825.0,825.0,825.0
mean,0.831961,0.636713,0.816075,0.5931994
std,0.123283,0.153985,0.1443029,0.1828865
min,0.040323,0.008333,9.016668999999999e-232,5.761139e-232
25%,0.791667,0.545455,0.7695525,0.5001174
50%,0.861111,0.653846,0.8504415,0.6263214
75%,0.911392,0.75,0.9090551,0.7281174
max,1.0,1.0,1.0,1.0


In [25]:
print (val[val.WRR_1 < val.WRR_2].shape, val[val.BLEU_1 < val.BLEU_2].shape)

(24, 9) (27, 9)


In [26]:
val.to_csv('../results/ALTA_2017/symspell_validation_output.csv',index=False)