# Pipeline for classifying texts

_To do: installation instructions_

* numpy
* scikit-learn
* joblib
* python-joblib
* scipy

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Plot styling: pandas' matplotlib theme, wide figures and a sans-serif font.
# NOTE(review): the `display.mpl_style` option may not exist in newer pandas
# releases -- confirm against the installed version.
pd.options.display.mpl_style = 'default'
plt.rc('figure', figsize=(18, 5))
plt.rc('font', family='sans-serif')

The classifier expects input text files containing one line per sentence, in the format:  
`sentence id[tab]sentence[tab]None`  
etc.

The sentence should be tokenized, and tokens should be separated by a space.

It is best to have a single file for each text for which labels should be predicted.

The text files should be put together in a single directory.

_To do: make notebook with pipeline for converting text files to files that can be used for prediction._

In [2]:
import os

# --- inputs ---------------------------------------------------------------
# directory holding the text files for which labels are predicted
data_dir = '/home/jvdzwaan/data/embem/txt/corpus_big-for_prediction/'

# --- outputs --------------------------------------------------------------
# directory that receives one result file per input text
out_dir = '/home/jvdzwaan/data/tmp/bla'

# create the output directory (including parents) the first time round
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# --- model ----------------------------------------------------------------
# pickled classifier that is loaded for prediction
classifier = '/home/jvdzwaan/data/classifier/classifier.pkl'

# training data; the set of label names is derived from this file
train_file = '/home/jvdzwaan/data/embem_ml/multilabel-normalized/all.txt'

In [3]:
# load utility functionality
from __future__ import print_function

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, hamming_loss, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC


def load_data(data_file):
    """Read a labelled sentence file into parallel tuples of texts and labels.

    Every non-blank line is split once, from the right, on whitespace: the
    last whitespace-separated token is the label string and everything
    before it is the text part (for the files described in this notebook
    that is ``sentence id[tab]sentence``). Blank lines are skipped.

    Parameters
    ----------
    data_file : str
        Path of the file to read.

    Returns
    -------
    X_data : tuple of str
        The text part of every line, in file order.
    Y_data : tuple of str
        The label string of every line, in file order.
        Both tuples are empty when the file has no non-blank lines.

    Raises
    ------
    ValueError
        If a non-blank line has no separate label field.
    """
    data = []
    # Use a context manager so the handle is closed deterministically; a bare
    # open() inside a comprehension leaves that to the garbage collector.
    with open(data_file) as f:
        for line_no, ln in enumerate(f, 1):
            fields = ln.rsplit(None, 1)
            if not fields:
                # blank (whitespace-only) line
                continue
            if len(fields) != 2:
                raise ValueError(
                    '{}, line {}: expected "text<whitespace>label", got {!r}'
                    .format(data_file, line_no, ln))
            data.append(fields)

    # zip(*[]) yields nothing, which cannot be unpacked into two names.
    if not data:
        return (), ()

    X_data, Y_data = zip(*data)

    return X_data, Y_data


def get_data(train_file, test_file):
    """Load a train and a test file and binarize their labels.

    Both files are read with ``load_data``. From the text part of each line
    the second tab-separated field (the sentence) is kept. Label strings are
    split on ``'_'`` into sets, the placeholder ``'None'`` is dropped, and
    the sets are encoded with a ``MultiLabelBinarizer`` that is fitted on the
    training labels only.

    Parameters
    ----------
    train_file : str
        Path of the training data file.
    test_file : str
        Path of the file to evaluate or predict.

    Returns
    -------
    X_train, X_test : list of str
        Sentences of the train and test file.
    Y_train, Y_test :
        Binarized label sets as returned by ``MultiLabelBinarizer``.
    classes_ :
        The ``classes_`` attribute of the fitted binarizer.
    """
    def sentences(rows):
        # each row is 'id<TAB>sentence'; keep the sentence
        return [row.split('\t')[1] for row in rows]

    def label_sets(label_strings):
        # 'A_B' -> {'A', 'B'}; the placeholder 'None' means "no label"
        return [set(labels.split('_')) - {'None'} for labels in label_strings]

    train_rows, train_labels = load_data(train_file)
    X_train = sentences(train_rows)
    test_rows, test_labels = load_data(test_file)
    X_test = sentences(test_rows)

    binarizer = MultiLabelBinarizer()
    train_sets = label_sets(train_labels)
    test_sets = label_sets(test_labels)
    Y_train = binarizer.fit_transform(train_sets)
    Y_test = binarizer.transform(test_sets)

    return X_train, X_test, Y_train, Y_test, binarizer.classes_


def split(s):
    """Return the whitespace-separated tokens of ``s`` as a list.

    NOTE(review): not called in the visible cells; presumably it has to exist
    under this name so the pickled classifier can resolve it -- confirm
    before renaming or removing.
    """
    tokens = s.split()
    return tokens

In [5]:
from sklearn.externals import joblib
import codecs

# load classifier
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only point `classifier` at a file you trust.
clf = joblib.load(classifier)

# The label names depend only on the training file, so derive them once here
# instead of re-reading the training data for every text. Label strings are
# parsed the same way get_data does it ('A_B' -> {'A', 'B'}, 'None' dropped).
train_labels = load_data(train_file)[1]
mlb = MultiLabelBinarizer()
mlb.fit([set(s.split('_')) - {'None'} for s in train_labels])
classes_ = mlb.classes_

text_files = [fi for fi in os.listdir(data_dir) if fi.endswith('.txt')]
for i, text_file in enumerate(text_files):
    in_file = os.path.join(data_dir, text_file)
    print('{} of {}'.format(i+1, len(text_files)), text_file)

    # load data: read the file once, keeping 'id<TAB>sentence' for the output
    # and the bare sentence for the classifier
    X_data_with_ids = load_data(in_file)[0]
    X_data = [x.split('\t')[1] for x in X_data_with_ids]

    # classify (nothing to predict for an empty input; the output file is
    # still created, just without lines)
    pred = clf.predict(X_data) if X_data else []

    # save results
    out_file = os.path.join(out_dir, text_file)

    with codecs.open(out_file, 'wb', 'utf8') as f:
        for x, y in zip(X_data_with_ids, pred):
            # Select labels with a boolean mask. Indexing with a 0/1 integer
            # row would be integer fancy indexing (picking classes_[0] and
            # classes_[1] over and over) and could never produce 'None'.
            # NOTE(review): assumes clf.predict returns one indicator row per
            # sentence -- confirm for the pickled classifier.
            labels = classes_[np.asarray(y, dtype=bool)]
            f.write(u'{}\t{}\n'.format(x.decode('utf8'),
                                       '_'.join(labels) or 'None'))

    # blank line between files (a bare `print` is a no-op with print_function)
    print()

1 of 149 corn001dood01.txt
2 of 149 plui001verl01.txt
3 of 149 bred001gria01.txt
4 of 149 koni001twee01.txt
5 of 149 vos_002aran03.txt
6 of 149 stee033beon01.txt
7 of 149 feit007patr01.txt
8 of 149 hoev003rech01.txt
9 of 149 zand008deme01.txt
10 of 149 raci001hest01.txt
11 of 149 aren001joan01.txt
12 of 149 wild007swer01.txt
13 of 149 hoev003isab01.txt
14 of 149 vond001pete01.txt
15 of 149 moli015bela01.txt
16 of 149 bran002vein02.txt
17 of 149 gaet001ontm01.txt
18 of 149 corn001cid_02.txt
19 of 149 haps002soph02.txt
20 of 149 scha003voor01.txt
21 of 149 boon045leid02.txt
22 of 149 bidl001fabi01.txt
23 of 149 lang020gava03.txt
24 of 149 moli015scho02.txt
25 of 149 stey002geve01.txt
26 of 149 zeer001eers01.txt
27 of 149 bidl001kare02.txt
28 of 149 wild007abra01.txt
29 of 149 hoev003dood02.txt
30 of 149 elst004jano01.txt
31 of 149 bidl001vert01.txt
32 of 149 foss005manl01.txt
33 of 149 tijs003merk01.txt
34 of 149 bern001athi01.txt
35 of 149 croi003meid01.txt
36 of 149 corn001hora02.txt
3

The next step is to look at the results!

_To do: pipeline for showing/visualizing results_