In [2]:
import os
import json
import random
CORPUS_PATH = "leipzig-corpus"
TRAIN_FILE = "run-on-train.json"
TEST_FILE = "run-on-test.json"

def iter_islast(iterable):
    it = iter(iterable)
    prev = it.__next__()
    for item in it:
        yield prev, False
        prev = item
    yield (prev[:-1], prev[-1]), True

def extract_tokens_file(filename):
    tokens = []
    if filename.endswith("eng-eu_web_2015_10K-sentences.txt"):
        print("Processing: {}".format(filename))
        with open(filename, "r") as f:

            text = f.read().split("\n")
            res = []
            run_on_mode = False
            for line in text:
                if line:
                    line = line.split("\t")[1]
                    for token, is_last in iter_islast(line.split(" ")):
                        if is_last:
                            token1, token2 = token
                            #around 70% that it is run on
                            is_run_on = random.randint(1, 10) < 7
                            if is_run_on:
                                res.append([token1, True])
                                run_on_mode = True
                            else:
                                #this is the end of sentence
                                res.append([token1, False])
                                res.append([token2, False])
                                tokens.append(res)
                                res = []
                        else:
                            if run_on_mode:
                                if random.randint(0, 1): #this is 50% that run on is lower
                                    res.append([token.lower(), False])
                                    continue
                                run_on_mode = False
                            res.append([token, False])
    return tokens

with open(TRAIN_FILE, "w") as f:
    tokens = [extract_tokens_file("{}/{}".format(CORPUS_PATH, fl)) for fl in os.listdir(CORPUS_PATH)]
    tokens = [item for sublist in tokens for item in sublist]
    print("Extracted {} sentences".format(len(tokens)))
    print(tokens[:2])
    res = json.dumps(tokens, indent=4)
    f.write(res)

Processing: leipzig-corpus/eng-eu_web_2015_10K-sentences.txt
Extracted 4058 tokens
[[['01.08.2011', False], ['Public', False], ['call', False], ['for', False], ['standard', False], ['projects', False], ['No.', False], ['02/2009', False], ['.', False]], [['•', False], ['05.00', False], ['Energy', False], ['Rating', False], ['Introduce', False], ['energy', False], ['rating', False], ['models', False], ['for', False], ['selected', False], ['technologies', False], ['based', False], ['on', False], ['high', False], ['resolution', False], ['databases', False], ['in', False], ['the', False], ['Photovoltaic', False], ['Geographic', False], ['Information', False], ['System', False], ['(PVGIS),', False], ['also', False], ['in', False], ['collaboration', False], ['with', False], ['ENEA', True], ['06.00', False], ['Estimating', False], ['annual', False], ['farming', False], ['GHG', False], ['emissions', False], ['(including', False], ['Nitrous', False], ['Oxide', False], ['(N2O)', False], ['from', 

In [3]:
def word2features(sent, i):
    word = sent[i][0]
    label = sent[i][1]

    features = {
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features, label

In [4]:
def get_tokens(f):
    features, labels = [], []
    with open(f, "r") as fl:
        for sent in  json.loads(fl.read()):
            for i in range(len(sent)):
                f, label = word2features(sent, i) 
                features.append(f)
                labels.append(label)
    return features, labels
train_features, train_labels = get_tokens(TRAIN_FILE)
test_features, test_labels = get_tokens(TEST_FILE)

In [5]:
train_features[:2]

[{'+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'public',
  'BOS': True,
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': '01.08.2011',
  'word[-2:]': '11',
  'word[-3:]': '011'},
 {'+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'call',
  '-1:word.istitle()': False,
  '-1:word.isupper()': False,
  '-1:word.lower()': '01.08.2011',
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': False,
  'word.lower()': 'public',
  'word[-2:]': 'ic',
  'word[-3:]': 'lic'}]

In [6]:
from collections import Counter
print("test labels {}".format(Counter(test_labels)))
print("train labels {}".format(Counter(train_labels)))

test labels Counter({False: 4542, True: 155})
train labels Counter({False: 199687, True: 5941})


In [7]:
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
x_train = vec.fit_transform(train_features).toarray()
print(x_train)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [8]:
from sklearn.linear_model import LogisticRegression
ls = LogisticRegression()
ls.fit(x_train[:5000], train_labels[:5000])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
x_test = vec.transform(test_features).toarray()
y_pred = ls.predict(x_test)

In [13]:
from sklearn.metrics import classification_report
print(classification_report(test_labels, y_pred))

             precision    recall  f1-score   support

      False       0.97      1.00      0.98      4542
       True       0.00      0.00      0.00       155

avg / total       0.94      0.97      0.95      4697



  'precision', 'predicted', average, warn_for)
