In [0]:
import libvoikko

import numpy as np
import pandas as pd
import pickle

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM, SimpleRNN
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [0]:
# Register taxonomy used to label the documents.
# Each top-level key is a main-register code; 'description' is its
# human-readable name and 'sub' maps sub-register codes to their names.
# Only the keys are used to build the label columns; the descriptions are
# for human readers.
register_map = {
    'NA': {
        'description': 'Narrative',
        'sub': {
            'NE': 'News reports / news blogs',
            'SR': 'Sports reports',
            'PB': 'Personal blog',
            'HA': 'Historical article',
            'FC': 'Fiction',
            'TB': 'Travel blog',
            'CB': 'Community blogs',
            'OA': 'Online article',
        }
    },
    'OP': {
        'description': 'Opinion',
        'sub': {
            'OB': 'Personal opinion blogs',
            'RV': 'Reviews',
            'RS': 'Religious blogs/sermons',
            'AV': 'Advice',
        }
    },
    'IN': {
        'description': 'Informational description',
        'sub': {
            'JD': 'Job description',
            'FA': 'FAQs',
            'DT': 'Description of a thing',
            'IB': 'Information blogs',
            'DP': 'Description of a person',
            'RA': 'Research articles',
            'LT': 'Legal terms / conditions',
            'CM': 'Course materials',
            'EN': 'Encyclopedia articles',
            'RP': 'Report',
        }
    },
    'ID': {
        'description': 'Interactive discussion',
        'sub': {
            'DF': 'Discussion forums',
            'QA': 'Question-answer forums',
        }
    },
    'HI': {
        'description': 'How-to/instructions',
        'sub': {
            'RE': 'Recipes',
        }
    },
    # NOTE(review): 'IP' and 'IG' carry the same description and 'IP' has no
    # sub-registers. Presumably both codes occur in the data files and are
    # aliases of one register -- confirm against the corpus documentation.
    'IP': {
        'description': 'Informational persuasion',
        'sub': {
        }
    },
    'IG': {
        'description': 'Informational persuasion',
        'sub': {
            'DS': 'Description with intent to sell',
            'EB': 'News-opinion blogs / editorials',
        }
    },
    'LY': {
        'description': 'Lyrical',
        'sub': {
            'PO': 'Poems',
            'SL': 'Songs',
        }
    },
    'SP': {
        'description': 'Spoken',
        'sub': {
            'IT': 'Interviews',
            'FS': 'Formal speeches',
        }
    },
    'OS': {
        'description': 'Others',
        'sub': {
            'MT': 'Machine-translated / generated texts',
        }
    }
}

In [0]:
# Main (high-level) register codes, in the order register_map declares them.
hl_reg = [*register_map]
# Sub-register codes of every main register, flattened into one list
# (main-register order first, then declaration order within each 'sub' dict).
sl_reg = sum((list(entry['sub']) for entry in register_map.values()), [])

In [0]:
class Transformer:
    """Word normaliser that wraps a Voikko instance created for "fi"."""

    def __init__(self):
        self.voikko = libvoikko.Voikko(u"fi")

    @staticmethod
    def _first_or_word(word, lookup):
        """Return lookup(); fall back to the unchanged word on IndexError."""
        try:
            return lookup()
        except IndexError:
            return word

    def as_baseform(self, word):
        """A lemmatized form of the word.

        Uses the 'BASEFORM' of the first analysis Voikko returns; the word
        itself is returned when there is no analysis.
        """
        return self._first_or_word(
            word, lambda: self.voikko.analyze(word)[0]['BASEFORM'])

    def suggest(self, word):
        """Spell-checked form of the word.

        Uses the first suggestion Voikko returns; the word itself is
        returned when there are no suggestions.
        """
        return self._first_or_word(
            word, lambda: self.voikko.suggest(word)[0])

    def transform(self, word):
        """Spell-correct the word first, then reduce it to its base form."""
        corrected = self.suggest(word)
        return self.as_baseform(corrected)

# Module-level Transformer instance; the DataFrame-generation cell calls
# transformer.transform() on every word of every text.
transformer = Transformer()

In [0]:
# Cache files for the preprocessed train / dev / test DataFrames.
df_train_file = Path('./df_train.p')
df_dev_file = Path('./df_dev.p')
df_test_file = Path('./df_test.p')

if df_train_file.is_file() and df_dev_file.is_file() and df_test_file.is_file():
    # NOTE(review): pickle.load can execute arbitrary code -- only load cache
    # files that were written by this notebook.
    with open(df_train_file, 'rb') as fh:
        df_train = pickle.load(fh)
    with open(df_dev_file, 'rb') as fh:
        df_dev = pickle.load(fh)
    with open(df_test_file, 'rb') as fh:
        df_test = pickle.load(fh)
    print("DataFrames loaded from files.")
else:
    print("Generating DataFrames. This WILL take time.")
    orig_columns = ['reg', 'text']

    # Read each split, shuffle it reproducibly and add one (still empty)
    # column per main register and sub-register code.
    df_train = pd.read_csv('fincore-train.tsv', sep='\t', names=orig_columns)
    df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
    df_train = df_train.reindex(columns=orig_columns + hl_reg + sl_reg)

    df_dev = pd.read_csv('fincore-dev.tsv', sep='\t', names=orig_columns)
    df_dev = df_dev.sample(frac=1, random_state=42).reset_index(drop=True)
    df_dev = df_dev.reindex(columns=orig_columns + hl_reg + sl_reg)

    df_test = pd.read_csv('fincore-test.tsv', sep='\t', names=orig_columns)
    df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
    df_test = df_test.reindex(columns=orig_columns + hl_reg + sl_reg)

    # Fetch the stop-word list once instead of once per word, and keep it in
    # a set so the membership test is a hash lookup.
    finnish_stopwords = set(stopwords.words('finnish'))

    for df in [df_train, df_dev, df_test]:
        # Still slow: every remaining word goes through Voikko.
        df['transformed'] = df['text'].apply(lambda text:
            ' '.join([transformer.transform(word) for word in text.split(' ')
                      if word not in finnish_stopwords]))
        # Normalise the label string to underscore-separated register codes.
        df['reg'] = df['reg'].apply(lambda x: x.strip(' ').replace(' ', '_'))
        # Multi-hot encode the register codes into their own columns.
        df[hl_reg + sl_reg] = 0
        # enumerate() yields true row positions, which is what iloc expects.
        for pos, reg_string in enumerate(df['reg'].tolist()):
            for reg in reg_string.split('_'):
                if not reg:
                    continue
                df.iloc[pos, df.columns.get_loc(reg)] = 1

    with open(df_train_file, 'wb') as fh:
        pickle.dump(df_train, fh)
    with open(df_dev_file, 'wb') as fh:
        pickle.dump(df_dev, fh)
    with open(df_test_file, 'wb') as fh:
        pickle.dump(df_test, fh)
    print("Done.")

DataFrames loaded from files.


In [0]:
# Show the first rows of the training frame as a quick sanity check.
df_train.head()

Unnamed: 0,reg,text,NA,OP,IN,ID,HI,IP,IG,LY,...,QA,RE,DS,EB,PO,SL,IT,FS,MT,transformed
0,CB_NA,lauantai Lauantai oli taas mukavan kylmä päiv...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,lauantai lauantai taas mukava kylmä Päivä ( +...
1,DT_IN,"Aurinkolämmöllä tarkoitetaan järjestelmää , j...",0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"aurinkolämpö tarkoittaa järjestelmä , käyttää..."
2,PB_NA,Sivut torstai 14. kesäkuuta 2012 Vaihde vapaa...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sivu torstai 14 kesäkuu 2012 vaihde vapaa kes...
3,HI,Valitse kieli : Hae rahoitusta EEP-rahoitusta...,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,valita kieliä : hakea rahoitus EEP-rahoitusta...
4,PB_NA,"Friday , 13 May 2011 Näin aika kuluu Vanhoja ...",1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Friday , 13 Kay 2011 nähdä aika kulua vanha k..."


## Milestone 1

In [0]:
# Keep only the dev / test rows whose label string also occurs in the
# training data: label_encoder below is fitted on the training labels only
# and is then applied to these filtered frames.
train_labels = set(df_train['reg'])
df_dev_ms1 = df_dev[df_dev['reg'].isin(train_labels)]
df_test_ms1 = df_test[df_test['reg'].isin(train_labels)]

# Raw ('normal') and preprocessed ('transformed') texts per split.
texts = {
    'normal': {
        'train': list(df_train['text']),
        'dev': list(df_dev_ms1['text']),
        'test': list(df_test_ms1['text']),
    },
    'transformed': {
        'train': list(df_train['transformed']),
        'dev': list(df_dev_ms1['transformed']),
        'test': list(df_test_ms1['transformed']),
    }
}

# Integer class ids for the full label strings.
label_encoder = LabelEncoder()
train_class_numbers = label_encoder.fit_transform(df_train['reg'])
dev_class_numbers = label_encoder.transform(df_dev_ms1['reg'])
test_class_numbers = label_encoder.transform(df_test_ms1['reg'])

ms1_data_file = Path('./ms1_data.p')
if ms1_data_file.is_file():
    # NOTE(review): pickle.load can execute arbitrary code -- only load a
    # cache file that was written by this notebook.
    with open(ms1_data_file, 'rb') as fh:
        ms1_data = pickle.load(fh)
    print("Loaded Milestone 1 data from a file.")
else:
    print("Generating Milestone 1 data.")
    # ms1_data[text_kind][max_features] -> fitted vectorizer + feature matrices
    ms1_data = {}
    for txt in ['normal', 'transformed']:
        ms1_data[txt] = {}
        for mf in [1000, 5000, 10000, 20000]:
            vectorizer = CountVectorizer(max_features=mf,
                                         binary=True,
                                         ngram_range=(1, 1))
            # The vocabulary is learned from the training texts only.
            train_feature_matrix = vectorizer.fit_transform(texts[txt]['train'])
            dev_feature_matrix = vectorizer.transform(texts[txt]['dev'])
            test_feature_matrix = vectorizer.transform(texts[txt]['test'])
            # Keyed by the loop variable `mf` (the previous key `fm` was an
            # undefined name and raised NameError on a cache miss).
            ms1_data[txt][mf] = {
                'vectorizer': vectorizer,
                'train_feature_matrix': train_feature_matrix,
                'dev_feature_matrix': dev_feature_matrix,
                'test_feature_matrix': test_feature_matrix,
            }
    with open(ms1_data_file, 'wb') as fh:
        pickle.dump(ms1_data, fh)
    print("Done.")

Loaded Milestone 1 data from a file.


### 1.0: Baseline (naive Bayes)

In [0]:
def mnb_compute(X, y, mnb):
    """Predict labels for X with `mnb` and score them against y.

    Returns a tuple (accuracy, predictions), where accuracy is the fraction
    of predictions that equal the corresponding entry of y.
    """
    predictions = mnb.predict(X)
    n_correct = np.sum(predictions == y)
    accuracy = n_correct / len(y)
    return accuracy, predictions

# bayes_results[text_kind][max_features] -> fitted model and its accuracies
bayes_results = {}

# Dedicated, fixed-seed generator so the shuffled-label sanity check below
# gives the same numbers on every run (the global NumPy RNG is unseeded).
perm_rng = np.random.RandomState(42)

for txt, sets in ms1_data.items():
    bayes_results[txt] = {}
    for max_feats, stats in sets.items():
        # Fit on the training bag-of-words features and the label strings.
        mnb = MultinomialNB()
        mnb.fit(stats['train_feature_matrix'], df_train['reg'])

        train_accuracy, _ = mnb_compute(
            stats['train_feature_matrix'], df_train['reg'], mnb)

        dev_accuracy, _ = mnb_compute(
            stats['dev_feature_matrix'], df_dev_ms1['reg'], mnb)

        test_accuracy, test_predictions = mnb_compute(
            stats['test_feature_matrix'], df_test_ms1['reg'], mnb)

        # Accuracy against randomly permuted test labels: a chance-level
        # reference for the real test accuracy.
        perm_accuracy, _ = mnb_compute(
            stats['test_feature_matrix'],
            perm_rng.permutation(df_test_ms1['reg']),
            mnb
        )

        bayes_results[txt][max_feats] = {
            'model': mnb,
            'train_accuracy': train_accuracy,
            'dev_accuracy': dev_accuracy,
            'test_accuracy': test_accuracy,
            'test_predictions': test_predictions,
            'perm_accuracy': perm_accuracy,
        }

In [0]:
# Print one line of accuracies per (text kind, max_features) configuration.
for txt, sets in bayes_results.items():
    print(f"\nResults for {txt} texts.")
    for mf, stats in sets.items():
        # Every field is its own print() argument; the comma after the
        # "Test accuracy" field was missing, which silently fused it with
        # the next field and dropped the separator the other fields get.
        print(f"Max features: {mf}",
              f"\tTrain Accuracy: {stats['train_accuracy']:.2f}",
              f"\tDev accuracy: {stats['dev_accuracy']:.2f}",
              f"\tTest accuracy: {stats['test_accuracy']:.2f}",
              f"\tPermutated testset accuracy: {stats['perm_accuracy']:.2f}")


Results for normal texts.
Max features: 1000 	Train Accuracy: 0.68 	Dev accuracy: 0.53 	Test accuracy: 0.50	Permutated testset accuracy: 0.07
Max features: 5000 	Train Accuracy: 0.79 	Dev accuracy: 0.59 	Test accuracy: 0.55	Permutated testset accuracy: 0.06
Max features: 10000 	Train Accuracy: 0.81 	Dev accuracy: 0.59 	Test accuracy: 0.56	Permutated testset accuracy: 0.09
Max features: 20000 	Train Accuracy: 0.81 	Dev accuracy: 0.56 	Test accuracy: 0.55	Permutated testset accuracy: 0.09

Results for transformed texts.
Max features: 1000 	Train Accuracy: 0.69 	Dev accuracy: 0.51 	Test accuracy: 0.51	Permutated testset accuracy: 0.06
Max features: 5000 	Train Accuracy: 0.76 	Dev accuracy: 0.56 	Test accuracy: 0.56	Permutated testset accuracy: 0.07
Max features: 10000 	Train Accuracy: 0.76 	Dev accuracy: 0.56 	Test accuracy: 0.54	Permutated testset accuracy: 0.08
Max features: 20000 	Train Accuracy: 0.73 	Dev accuracy: 0.53 	Test accuracy: 0.51	Permutated testset accuracy: 0.09


### 1.1: BOW

In [0]:
# bow_results[text_kind][max_features][optimizer] -> model, history, results
bow_results = {}

example_count = len(df_train)
class_count = len(label_encoder.classes_)

for txt, sets in ms1_data.items():
    bow_results[txt] = {}
    for max_feats, stats in sets.items():
        bow_results[txt][max_feats] = {}

        train_fm = stats['train_feature_matrix']
        dev_fm = stats['dev_feature_matrix']
        test_fm = stats['test_feature_matrix']

        # Densify once per configuration instead of once per optimizer and
        # again for every evaluation pass.
        train_dense = train_fm.toarray()
        dev_dense = dev_fm.toarray()
        test_dense = test_fm.toarray()

        # max_features is only an upper bound on the vocabulary size, so the
        # input width is taken from the matrix that is actually fed in.
        feature_count = train_dense.shape[1]

        for optimizer in ['sgd', 'adam']:
            print(f"Training BOW for {txt} text "
                  f"with params: optimizer = {optimizer}, "
                  f"count vectorizer max_features = {max_feats}") 

            # Single hidden layer feed-forward classifier over the BOW vector.
            inp = Input(shape=(feature_count,))
            hidden = Dense(300, activation="relu")(inp)
            outp = Dense(class_count, activation='softmax')(hidden)
            model = Model(inputs=[inp], outputs=[outp])

            model.compile(optimizer=optimizer,
                          loss='sparse_categorical_crossentropy',
                          metrics=['accuracy'])

            # NOTE(review): every configuration writes to the same checkpoint
            # path, so only the last trained model's checkpoint survives.
            mc = ModelCheckpoint(filepath='/tmp/bow_model.h5',
                                 monitor='val_loss',
                                 verbose=0,
                                 save_best_only=True,
                                 mode='auto')

            es = EarlyStopping(monitor='val_loss',
                               patience=10,
                               verbose=0,
                               restore_best_weights=True)

            hist = model.fit(train_dense,
                             train_class_numbers,
                             batch_size=32,
                             verbose=0,
                             epochs=100,
                             callbacks=[mc, es],
                             validation_data=(dev_dense, dev_class_numbers))

            bow_results[txt][max_feats][optimizer] = {
                'model': model,
                'hist': hist,
                'results': {}
            }

            # Accuracy = share of rows whose arg-max class equals the label.
            for name, (features, cls_num) in zip(
                    ['Train', 'Dev', 'Test'],
                    [(train_dense, train_class_numbers),
                     (dev_dense, dev_class_numbers),
                     (test_dense, test_class_numbers)]):
                preds = model.predict(features)
                acc = (np.sum(np.equal(cls_num, np.argmax(preds, axis=1)))
                       / len(cls_num))
                bow_results[txt][max_feats][optimizer]['results'][name] = {
                    'accuracy': acc,
                    'preds': preds
                }

print("Done.")

Training BOW for normal text with params: optimizer = sgd, count vectorizer max_features = 1000
Training BOW for normal text with params: optimizer = adam, count vectorizer max_features = 1000
Training BOW for normal text with params: optimizer = sgd, count vectorizer max_features = 5000
Training BOW for normal text with params: optimizer = adam, count vectorizer max_features = 5000
Training BOW for normal text with params: optimizer = sgd, count vectorizer max_features = 10000
Training BOW for normal text with params: optimizer = adam, count vectorizer max_features = 10000
Training BOW for normal text with params: optimizer = sgd, count vectorizer max_features = 20000
Training BOW for normal text with params: optimizer = adam, count vectorizer max_features = 20000
Training BOW for transformed text with params: optimizer = sgd, count vectorizer max_features = 1000
Training BOW for transformed text with params: optimizer = adam, count vectorizer max_features = 1000
Training BOW for tran

In [0]:
# Print one line of accuracies per (text kind, max_features, optimizer).
for txt, sets in bow_results.items():
    print(f"\nResults for {txt} texts.")
    for mf, optim_stats in sets.items():
        for optim, stats in optim_stats.items():
            split_results = stats['results']
            fields = [
                f"Max features: {mf}",
                f"\tOptimizer: {optim.ljust(4, ' ')}",
                f"\tTrain Accuracy: {split_results['Train']['accuracy']:.2f}",
                f"\tDev accuracy: {split_results['Dev']['accuracy']:.2f}",
                f"\tTest accuracy: {split_results['Test']['accuracy']:.2f}",
            ]
            # print() joins the fields with single spaces.
            print(*fields)


Results for normal texts.
Max features: 1000 	Optimizer: sgd  	Train Accuracy: 0.69 	Dev accuracy: 0.49 	Test accuracy: 0.50
Max features: 1000 	Optimizer: adam 	Train Accuracy: 0.83 	Dev accuracy: 0.51 	Test accuracy: 0.51
Max features: 5000 	Optimizer: sgd  	Train Accuracy: 0.82 	Dev accuracy: 0.55 	Test accuracy: 0.53
Max features: 5000 	Optimizer: adam 	Train Accuracy: 0.92 	Dev accuracy: 0.56 	Test accuracy: 0.55
Max features: 10000 	Optimizer: sgd  	Train Accuracy: 0.86 	Dev accuracy: 0.56 	Test accuracy: 0.54
Max features: 10000 	Optimizer: adam 	Train Accuracy: 0.96 	Dev accuracy: 0.57 	Test accuracy: 0.57
Max features: 20000 	Optimizer: sgd  	Train Accuracy: 0.87 	Dev accuracy: 0.56 	Test accuracy: 0.54
Max features: 20000 	Optimizer: adam 	Train Accuracy: 0.98 	Dev accuracy: 0.58 	Test accuracy: 0.56

Results for transformed texts.
Max features: 1000 	Optimizer: sgd  	Train Accuracy: 0.72 	Dev accuracy: 0.54 	Test accuracy: 0.52
Max features: 1000 	Optimizer: adam 	Train Acc

### 1.2: RNN

In [0]:
# Word-level tokenizer (lower-casing, split on single spaces); its vocabulary
# is fitted on the raw ('normal') training texts only.
tokenizer = Tokenizer(num_words=100000, lower=True, split=' ', char_level=False)
tokenizer.fit_on_texts(texts['normal']['train'])

# Convert every split to sequences of word indices, in train/dev/test order.
train_seq, dev_seq, test_seq = (
    tokenizer.texts_to_sequences(texts['normal'][split])
    for split in ('train', 'dev', 'test')
)

In [0]:
def _length_summary(lengths):
    """Return (max, min, mean truncated to int) of a list of lengths."""
    return max(lengths), min(lengths), int(np.mean(lengths))


# Token count of every sequence, per split.
train_lens = list(map(len, train_seq))
dev_lens = list(map(len, dev_seq))
test_lens = list(map(len, test_seq))

train_max_len, train_min_len, train_mean_len = _length_summary(train_lens)
dev_max_len, dev_min_len, dev_mean_len = _length_summary(dev_lens)
test_max_len, test_min_len, test_mean_len = _length_summary(test_lens)

In [0]:
# One row per statistic (max, min, mean); columns are train, dev, test.
for stat_row in (
    (train_max_len, dev_max_len, test_max_len),
    (train_min_len, dev_min_len, test_min_len),
    (train_mean_len, dev_mean_len, test_mean_len),
):
    print(*stat_row)

81038 13906 81724
0 19 0
584 530 564


In [0]:
# Shared padding settings: every sequence is cut or zero-padded at its end
# so that all of them have exactly 250 entries.
pad_options = dict(maxlen=250, padding='post', truncating='post', value=0)

train_seq = pad_sequences(train_seq, **pad_options)
dev_seq = pad_sequences(dev_seq, **pad_options)
test_seq = pad_sequences(test_seq, **pad_options)

In [0]:
def build_rnn_model(RNN_class, 
                    sequence_length, 
                    vocab_size,
                    num_classes,
                    embedding_dim=250,
                    rnn_units=50):
    """Build an (uncompiled) embedding -> recurrent layer -> softmax model.

    RNN_class       -- recurrent layer class to instantiate (e.g. LSTM)
    sequence_length -- length of each input sequence
    vocab_size      -- first argument passed to the Embedding layer
    num_classes     -- width of the softmax output layer
    embedding_dim   -- second argument passed to the Embedding layer
    rnn_units       -- number of units of the recurrent layer

    Returns a Model with one input and one output.
    """
    token_ids = Input(shape=(sequence_length,))
    embedded = Embedding(vocab_size, embedding_dim)(token_ids)
    # return_sequences=False: only the final recurrent output feeds the
    # classification layer (this is also the layer's default).
    encoded = RNN_class(rnn_units, return_sequences=False)(embedded)
    class_probs = Dense(num_classes, activation='softmax')(encoded)
    return Model(inputs=[token_ids], outputs=[class_probs])

In [0]:
# LSTM classifier over the padded index sequences; one output per distinct
# training label string.
lstm_model = build_rnn_model(
    RNN_class=LSTM,
    sequence_length=250,
    vocab_size=tokenizer.num_words,
    num_classes=len(train_labels),
    embedding_dim=250,
)
lstm_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# validation_split=0.1 holds out part of the training data for validation.
lstm_history = lstm_model.fit(
    train_seq,
    train_class_numbers,
    batch_size=1,
    epochs=3,
    validation_split=0.1,
)

Train on 4765 samples, validate on 530 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [0]:
# Disabled SimpleRNN experiment, kept as a bare string literal so it never
# executes; the trailing `pass` is the cell's only statement with no value.
# NOTE(review): the disabled code passes train_max_len as the sequence
# length, while train_seq was padded/truncated to 250 in an earlier cell --
# reconcile the two before re-enabling it.
"""
simple_model = build_rnn_model(SimpleRNN, train_max_len, tokenizer.num_words, len(train_labels))
simple_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
simple_history = simple_model.fit(train_seq, train_class_numbers, epochs=3, batch_size=1, validation_split=0.1, verbose=1)
"""
pass