In [139]:
import numpy as np
import json
import matplotlib.pyplot as plt

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split

from sklearn.model_selection import learning_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn import metrics

# Load the data

In [140]:
source_corpus = []
source_labels = []
files = [
    'kamerstukken_topics-multi_20160602_20161201.json',
    'kamerstukken_topics-multi_20160101_20160601.json',
    'kamerstukken_topics-multi_20170101_20170301.json',
    'kamerstukken_topics-multi_20170302_20170601.json',
    'kamerstukken_topics-multi_20170602_20180101.json',
    'kamerstukken_topics-multi_20180101_20180401.json'
]

for file in files:
    data = json.load(open('../../data_resources/topics/kamerstukken/{}'.format(file)))
    for obj in data:
        if len(obj['categories']) > 0:
            source_corpus.append(obj['content'])
            source_labels.append(obj['categories'])


print(len(source_corpus))
print(len(source_labels))

12160
12160


In [141]:
unlabeled_corpus = []
files = [
    'poliflw_target_unlabeled.json'
]

for file in files:
    data = json.load(open('../../data_resources/topics/kamerstukken/{}'.format(file)))
    for obj in data:
        unlabeled_corpus.append(obj['content'])


print(len(unlabeled_corpus))

2500


In [142]:
target_corpus = []
target_labels = []
files = [
    'poliflw_target_labeled_data_0.95.json'
]

for file in files:
    data = json.load(open('../../data_resources/topics/kamerstukken/{}'.format(file)))
    for obj in data:
#         source_corpus.append(obj['content'])
#         source_labels.append(obj['categories'])
        
        target_corpus.append(obj['content'])
        target_labels.append(obj['categories'])


print(len(target_corpus))
print(len(target_labels))

179
179


# Define helper functions and constants

In [148]:
def predict_multi_always(clf, x):
    y_pred = clf.predict(x)
    y_prob = clf.predict_proba(x)
    
    for sample_index in range(0,len(y_pred)):
        most_probable_label = np.where(y_prob[sample_index] == max(y_prob[sample_index]))[0][0]
        y_pred[sample_index][most_probable_label] = 1
        
    return y_pred

def train_clf(corpus, labels):
    transformer = TfidfVectorizer(smooth_idf=False, min_df=0.001, max_df=0.08, sublinear_tf=True, ngram_range=(1,1))
    mlb = MultiLabelBinarizer()
    
    X = transformer.fit_transform(corpus)
    y = mlb.fit_transform(labels)
    
    clf = SGDClassifier(loss='log', penalty='l1', alpha=1e-6, random_state=42, max_iter=10)
    clf = OneVsRestClassifier(clf).fit(X, y)
    
    return transformer, mlb, clf

def merge_training_data(original_corpus, unlabeled_corpus, original_labels, unlabeled_predictions):
    new_corpus = []
    new_labels = []
    
    for i in range(0,len(original_corpus)):
        new_corpus.append(original_corpus[i])
        new_labels.append(original_labels[i])
        
    for i in range(0,len(unlabeled_corpus)):
        new_corpus.append(unlabeled_corpus[i])
        
        labels = []
        for pred in unlabeled_predictions[i]:
            labels.append(pred)
        new_labels.append(labels)
    
    return new_corpus, new_labels

def evaluate_target_domain(transformer, mlb, clf, corpus, labels):
    X = transformer.transform(corpus)
    y = mlb.transform(labels)
    
    y_pred = clf.predict(X)
    
    p = precision_score(y, y_pred, average='micro')
    r = recall_score(y, y_pred, average='micro')
    f1 = f1_score(y, y_pred, average='micro')

    print('P score: {0:0.2f}'.format(p))
    print('R score: {0:0.2f}'.format(r))
    print('F1 score: {0:0.2f}'.format(f1))

# Train the classifier
1. Train a classifier on source corpus
2. Classify unlabeled data
3. Add uonlabeled corpus + predictions to labeled data
4. Retrain classifier

In [157]:
transformer_initial, mlb_initial, clf_initial = train_clf(source_corpus, source_labels)

evaluate_target_domain(transformer_initial, mlb_initial, clf_initial, target_corpus, target_labels)

P score: 0.44
R score: 0.08
F1 score: 0.14


In [163]:
y_pred_unlabeled = predict_multi_always(clf_initial, transformer_initial.transform(unlabeled_corpus))

y_pred_unlabeled_labels = mlb_initial.inverse_transform(y_pred_unlabeled)

print(y_pred_unlabeled_labels[-1])

('Natuur en milieu | Energie',)


In [164]:
new_corpus, new_labels = merge_training_data(source_corpus, unlabeled_corpus, source_labels, y_pred_unlabeled_labels)

In [165]:
transformer_new, mlb_new, clf_new = train_clf(new_corpus, new_labels)
evaluate_target_domain(transformer_new, mlb_new, clf_new, target_corpus, target_labels)

P score: 0.25
R score: 0.08
F1 score: 0.12


In [166]:
t, m, c = train_clf(source_corpus, source_labels)

y_pred = predict_multi_always(c, t.transform(unlabeled_corpus))
y_pred_labels = m.inverse_transform(y_pred_unlabeled)
new_c, new_l = merge_training_data(source_corpus, unlabeled_corpus, source_labels, y_pred_labels)

t, m, c = train_clf(new_c, new_l)

KeyboardInterrupt: 

In [167]:
t, m, c = train_clf(source_corpus, source_labels)
evaluate_target_domain(t, m, c, target_corpus, target_labels)

for i in range(0,5):
    y_pred = predict_multi_always(c, t.transform(unlabeled_corpus))
    y_pred_labels = m.inverse_transform(y_pred_unlabeled)
    
    new_c, new_l = merge_training_data(source_corpus, unlabeled_corpus, source_labels, y_pred_labels)
    
    t, m, c = train_clf(new_c, new_l)
    
    evaluate_target_domain(t, m, c, target_corpus, target_labels)

P score: 0.44
R score: 0.08
F1 score: 0.14
P score: 0.25
R score: 0.08
F1 score: 0.12
P score: 0.25
R score: 0.08
F1 score: 0.12
P score: 0.25
R score: 0.08
F1 score: 0.12


KeyboardInterrupt: 

# Evaluate classifier on test data

P score: 0.28
R score: 0.20
F1 score: 0.24


# OLD
# -------

In [53]:
y_source = mlb.fit_transform(source_labels)
y_target = mlb.transform(target_labels)

In [54]:

X_source = transformer.fit_transform(source_corpus)
X_target = transformer.transform(target_corpus)

print(np.shape(X_source))

(12160, 14113)


In [55]:
clf = SGDClassifier(loss='log', penalty='l1', alpha=1e-6, random_state=42, max_iter=10)
clf = OneVsRestClassifier(clf).fit(X_source, y_source)

In [56]:
X_s = transformer.transform(unlabeled_corpus[0:2])

res = clf.predict(X_s)

for obj in res:
    if 1 in obj:
        print(obj)

In [57]:
semi_corpus = source_

P score: 0.28
R score: 0.20
F1 score: 0.24
