In [1]:
pwd

'P:\\Projects\\puer\\notebooks'

In [2]:
# Work from the project root so that `src` is importable and the relative
# `data/...` paths used later resolve. The guard keeps the cell idempotent:
# re-running it does not climb one more directory level each time.
import os

if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')
os.getcwd()

P:\Projects\puer


In [3]:
from src.utils import load_corpus
import pandas as pd
from collections import Counter
from sklearn.model_selection import KFold

  from ._conv import register_converters as _register_converters


In [4]:
def unique_words(corpus):
    """Tally every item found in the sequences of ``corpus``.

    Parameters
    ----------
    corpus : iterable of iterables
        Each element is iterated and every item it yields is counted
        (in this notebook: the token lists produced by ``str.split()``).

    Returns
    -------
    collections.Counter
        Maps each distinct item to its total number of occurrences.
    """
    counts = Counter()
    for sentence in corpus:
        for token in sentence:
            counts[token] += 1
    return counts

def coverage(words, lx_words):
    """Return the fraction of ``words`` that also occur in ``lx_words``.

    Parameters
    ----------
    words : sized iterable of hashable items
        Vocabulary to check; must support ``len()`` and iteration.
    lx_words : iterable of hashable items
        Lexicon entries. Only membership matters, so duplicates are harmless.

    Returns
    -------
    float
        ``matches / len(words)``, or ``0.0`` when ``words`` is empty.

    Notes
    -----
    Prints the number of matches and ``len(words)`` as a side effect.
    """
    # A set gives constant-time membership tests and collapses duplicate
    # lexicon entries. Always iterating over `words` keeps the numerator
    # bounded by len(words), whichever argument happens to be shorter.
    lexicon = set(lx_words)
    inner_counts = sum(1 for w in words if w in lexicon)
    total = len(words)
    print(f'inner counts: {inner_counts}')
    print(f'words: {total}')
    if total == 0:
        # Nothing to cover; avoid ZeroDivisionError.
        return 0.0
    return inner_counts / total

### Data

In [5]:
# Load the SemEval14 train and test splits from their processed CSV files.
# The paths are relative, so the working directory must be the project root.
train = load_corpus('data/processed/SemEval14/SemEval14_train.csv')
test = load_corpus('data/processed/SemEval14/SemEval14_test.csv')

In [6]:
# Preview the training frame (columns shown: ASP, CLS, SENT).
train.head()

Unnamed: 0,ASP,CLS,SENT
0,price,1,and cheap !
1,service,-1,the staff is n't the friendliest or most compe...
2,service,-1,"the service is always bad though , do n't expe..."
3,miscellaneous,1,i absolutely love this place ! ! !
4,ambience,1,a restaurant that does n't try to do anything ...


In [7]:
# Preview the test frame (columns shown: ASP, CLS, SENT).
test.head()

Unnamed: 0,ASP,CLS,SENT
0,food,1,the bread is top notch as well .
1,service,1,i have to say they have one of the fastest del...
2,food,1,food is always fresh and hot- ready to eat !
3,food,1,did i mention that the coffee is outstanding ?
4,ambience,1,"certainly not the best sushi in new york , how..."


### Lexicon

In [9]:
# Lexicon table: one row per WORD with numeric columns MPQA, OPENER, OL, SWN, VADER.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index; confirm whether it should be read with index_col=0.
lx = pd.read_csv('data/processed/lexicon/lexicon_table.csv')

In [10]:
# Preview the first lexicon rows.
lx.head()

Unnamed: 0,WORD,MPQA,OPENER,OL,SWN,VADER
0,$:,-0.375,-0.375,-0.375,-0.375,-0.375
1,%),-0.1,-0.1,-0.1,-0.1,-0.1
2,%-),-0.375,-0.375,-0.375,-0.375,-0.375
3,&-:,-0.1,-0.1,-0.1,-0.1,-0.1
4,&:,-0.175,-0.175,-0.175,-0.175,-0.175


### Lexicon coverage in train

In [11]:
# Token frequencies over the training sentences (whitespace tokenisation via str.split()).
c_train = unique_words(train.SENT.str.split())

In [12]:
# Distinct training tokens as a keys view, inserted from most to least frequent
# (relies on dicts preserving insertion order).
train_words = dict(c_train.most_common()).keys()

In [13]:
# Number of distinct tokens in the training sentences.
len(train_words)

4435

In [14]:
# Number of rows in the lexicon table.
len(lx)

153625

In [15]:
# Share of distinct training tokens that appear in the lexicon's WORD column.
coverage(train_words, lx.WORD.tolist())

inner counts: 2985
words: 4435


0.673055242390079

### Lexicon coverage in test

In [16]:
# Token frequencies over the test sentences (whitespace tokenisation via str.split()).
c_test = unique_words(test.SENT.str.split())

In [17]:
# Distinct test tokens as a keys view, inserted from most to least frequent
# (relies on dicts preserving insertion order).
test_words = dict(c_test.most_common()).keys()

In [18]:
# Number of distinct tokens in the test sentences.
len(test_words)

2199

In [19]:
# Share of distinct test tokens that appear in the lexicon's WORD column.
coverage(test_words, lx.WORD.tolist())

inner counts: 1575
words: 2199


0.7162346521145976

### Lexicon coverage in CV

In [21]:
# 6-fold splitter with shuffling; the fixed random_state makes the fold assignment reproducible.
kf = KFold(n_splits=6, shuffle=True, random_state=42)

In [25]:
# The lexicon vocabulary is identical for every fold, so materialise it once
# instead of rebuilding the ~150k-entry list twice per fold.
_lx_words = lx.WORD.tolist()

for k, (train_idx, val_idx) in enumerate(kf.split(train)):
    print(f'-- FOLD{k+1} --')
    # Rows assigned to the training part vs. the held-out part of this fold.
    _train, _dev = train.iloc[train_idx], train.iloc[val_idx]

    # Lexicon coverage of the fold's training vocabulary.
    _c_train = unique_words(_train.SENT.str.split())
    _train_words = dict(_c_train.most_common()).keys()
    _train_cov = coverage(_train_words, _lx_words)
    print(f'Train coverage: {_train_cov:.2%}')

    # Lexicon coverage of the fold's held-out (dev) vocabulary.
    _c_dev = unique_words(_dev.SENT.str.split())
    _dev_words = dict(_c_dev.most_common()).keys()
    _dev_cov = coverage(_dev_words, _lx_words)
    print(f'Dev coverage: {_dev_cov:.2%}')

-- FOLD1 --
inner counts: 2779
words: 4086
Train coverage: 68.01%
inner counts: 1303
words: 1788
Dev coverage: 72.87%
-- FOLD2 --
inner counts: 2771
words: 4087
Train coverage: 67.80%
inner counts: 1304
words: 1778
Dev coverage: 73.34%
-- FOLD3 --
inner counts: 2763
words: 4079
Train coverage: 67.74%
inner counts: 1312
words: 1781
Dev coverage: 73.67%
-- FOLD4 --
inner counts: 2792
words: 4113
Train coverage: 67.88%
inner counts: 1320
words: 1784
Dev coverage: 73.99%
-- FOLD5 --
inner counts: 2782
words: 4084
Train coverage: 68.12%
inner counts: 1302
words: 1796
Dev coverage: 72.49%
-- FOLD6 --
inner counts: 2825
words: 4141
Train coverage: 68.22%
inner counts: 1238
words: 1705
Dev coverage: 72.61%
