# Text classification


## Solution of https://github.com/apohllo/nlp/blob/master/8-classification.md

In [241]:
import regex
from os import listdir
from os.path import isfile, join
from typing import List, Set, Dict, Tuple, Optional
import shutil
from random import sample
from random import choice
import numpy as np
import math
import pandas as pd


In [152]:
def load_bills(directory_path: str):
    """Copy every regular file of ``directory_path`` into a class folder.

    Each file is read, passed through ``clean`` and tested with
    ``is_ammending``. Files for which the test succeeds are copied to
    ``classifications/amending/``; all others go to
    ``classifications/normal/``. The destination folders are not created
    here.
    """
    names = [entry for entry in listdir(directory_path) if isfile(join(directory_path, entry))]
    for name in names:
        source_path = f"{directory_path}/{name}"
        with open(source_path) as input_file:
            cleaned = clean(input_file.read())
        target_dir = "classifications/amending/" if is_ammending(cleaned) else "classifications/normal/"
        shutil.copy(source_path, target_dir, follow_symlinks=True)


In [153]:
def is_ammending(content: str):
    """Tell whether the part of ``content`` before the first ``Art.`` word
    passes the ``contains_ammendment`` check.
    """
    return contains_ammendment(get_first_part(content))
    

In [154]:
def get_first_part(content: str):
    """Return the words of ``content`` that precede the first ``Art.`` word.

    The text is split on single spaces and every kept word is written back
    followed by one space. When no word equals ``Art.`` exactly, the whole
    text is kept.
    """
    tokens = content.split(' ')
    try:
        cut = tokens.index("Art.")
    except ValueError:
        # No article marker: everything belongs to the first part.
        cut = len(tokens)
    return "".join(token + " " for token in tokens[:cut])

In [162]:
def get_second_part(content: str):
    """Return ``content`` from its first ``Art.`` word onwards.

    The text is split on single spaces; the ``Art.`` word and every word
    after it are written back, each followed by one space. An empty string
    is returned when no word equals ``Art.`` exactly.
    """
    tokens = content.split(' ')
    if "Art." not in tokens:
        return ""
    start = tokens.index("Art.")
    return "".join(token + " " for token in tokens[start:])

In [171]:
def clean(content: str):
    """Replace tabs, non-breaking spaces and literal ``\\s`` with spaces.

    Newlines are left untouched, so the line structure of the text is
    preserved.

    Args:
        content: Raw text.

    Returns:
        The text with each occurrence of the three sequences replaced by a
        single space character.
    """
    # '\\s' is the two-character sequence backslash + "s". The previous
    # spelling '\s' produced the same value through an invalid escape
    # sequence, which newer Python versions warn about; str.replace never
    # treated it as a whitespace class.
    for needle in ('\\s', '\t', '\xa0'):
        content = content.replace(needle, ' ')
    return content

In [168]:
def contains_ammendment(text:str):
    """Check whether the leading part of ``text`` contains "o zmianie ustawy".

    Only the words before the first ``Art.`` word (``get_first_part``) are
    searched. The three words may be separated by any amount of whitespace,
    including none; matching is case-sensitive.

    Returns:
        ``True`` when the phrase is found, ``False`` otherwise.
    """
    # Raw string so that \s reaches the regex engine without relying on an
    # invalid Python escape sequence.
    pattern = r'[\s]*o[\s]*zmianie[\s]*ustawy[\s]*'
    return regex.search(pattern, get_first_part(text)) is not None

In [172]:
def remove_title(content:str):
    """Drop everything before the first ``Art.`` word of ``content``.

    Thin alias for ``get_second_part``.
    """
    body = get_second_part(content)
    return body

In [173]:
# Module-level accumulator: label_bills appends one tuple per processed bill.
bills = []

In [209]:
def label_bills(directory_path: str, label:int):
    """Read every bill in ``directory_path`` and append a labelled record to ``bills``.

    Each file is passed through ``clean`` and reduced to the text starting
    at its first ``Art.`` word (``get_second_part``). Bodies with fewer than
    10 lines are printed and skipped, because ``get_10_lines`` samples 10
    lines.

    Record layout (tuple position, which becomes the DataFrame column once
    ``bills`` is wrapped in ``pd.DataFrame``):

        0  file name
        1  full body
        2  random 10% of the body's lines
        3  10 random lines
        4  1 random line
        5  ``label``

    The previous version stored only the four text fields and ignored
    ``label``, while the cells further down index columns 1-4 as text and
    column 5 as the label.

    Args:
        directory_path: Folder holding the bill files.
        label: Class label stored with every bill read from this folder.
    """
    for file in listdir(directory_path):
        path = join(directory_path, file)
        if not isfile(path):
            continue
        with open(path) as input_file:
            body = get_second_part(clean(input_file.read()))
        if len(body.split('\n')) < 10:
            # Too short to sample 10 lines from; show it and move on.
            print(body)
            continue
        bills.append((
            file,
            body,
            get_10_percent(body),
            get_10_lines(body),
            get_line(body),
            label,
        ))


In [210]:
def get_10_percent(content:str):
    """Return a random tenth of the lines of ``content``, newline-joined.

    The sample size is the line count floor-divided by 10, so a text with
    fewer than 10 lines yields an empty string. Lines are drawn without
    replacement and come back in random order.
    """
    all_lines = content.split('\n')
    picked = sample(all_lines, len(all_lines) // 10)
    return '\n'.join(picked)

In [211]:
def get_10_lines(content: str):
    """Return 10 randomly chosen lines of ``content``, newline-joined.

    Lines are drawn without replacement in random order. ``random.sample``
    raises ``ValueError`` when the text has fewer than 10 lines.
    """
    return '\n'.join(sample(content.split('\n'), 10))


In [212]:
def get_line(content: str):
    """Return one randomly chosen line of ``content``."""
    return choice(content.split('\n'))

In [242]:
# One row per tuple collected in `bills`; the cells below address the
# resulting columns by integer position.
df = pd.DataFrame(bills)

In [243]:
# Shuffle all rows, then cut them 60% / 20% / 20% into train / validate / test.
# Positional slicing replaces np.split, which sends a DataFrame through the
# deprecated DataFrame.swapaxes on recent pandas; the three frames are the same.
shuffled = df.sample(frac=1)
train_end, validate_end = int(.6 * len(df)), int(.8 * len(df))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

In [251]:
import pickle
def serialize(name, data):
    """Pickle ``data`` into the file ``name`` using the highest protocol.

    An existing file with that name is overwritten.
    """
    with open(name, mode='wb') as sink:
        pickle.dump(data, sink, pickle.HIGHEST_PROTOCOL)
def deserialize(name):
    """Load and return the object pickled in the file ``name``."""
    # NOTE: unpickling can execute arbitrary code; only open files that were
    # produced by a trusted source.
    with open(name, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [252]:
# Persist the three splits as pickle files named 'train', 'validate' and
# 'test' (paths are relative to the current working directory).
serialize('train', train)
serialize('validate', validate)
serialize('test', test)

In [312]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


In [277]:
def create_vectors(corpus):
    """Fit a default ``TfidfVectorizer`` on ``corpus`` and return the
    transformed matrix.

    The fitted vectorizer is not returned.
    """
    return TfidfVectorizer().fit_transform(corpus)

In [315]:
def grid_search(train_x, train_y, validate_x, validate_y, parameters, pipeline,
                cv=2, n_jobs=3, verbose=10):
    """Tune ``pipeline`` with ``GridSearchCV`` and predict on the validation inputs.

    Args:
        train_x: Training inputs passed to ``GridSearchCV.fit``.
        train_y: Training labels passed to ``GridSearchCV.fit``.
        validate_x: Inputs the best estimator predicts on.
        validate_y: Accepted for call-site symmetry; not read here.
        parameters: Parameter grid handed to ``GridSearchCV``.
        pipeline: Estimator to tune; its ``steps`` are printed afterwards.
        cv: Number of cross-validation folds (default 2, the value that
            used to be hard-coded).
        n_jobs: Parallel workers for the search (default 3, as before).
        verbose: ``GridSearchCV`` verbosity (default 10, as before).

    Returns:
        Tuple ``(best_estimator, predictions)`` where ``predictions`` are
        the best estimator's outputs for ``validate_x``.
    """
    grid_search_tune = GridSearchCV(pipeline, parameters, cv=cv, n_jobs=n_jobs, verbose=verbose)
    grid_search_tune.fit(train_x, train_y)
    best_clf = grid_search_tune.best_estimator_

    print("Best parameters set:")
    print(best_clf.steps)

    # The predictions below are made on the validation inputs supplied by the
    # caller; the printed message is kept verbatim to match recorded output.
    print("Applying best classifier on test data:")
    predictions = best_clf.predict(validate_x)
    return best_clf, predictions

In [342]:
def tf_idf_svm(train_text, train_label, validate_text, validate_label):
    """Grid-search a TF-IDF + one-vs-rest ``LinearSVC`` pipeline.

    The search runs through ``grid_search``. The fraction of validation
    predictions equal to ``validate_label`` is printed, and the best
    estimator is returned.
    """
    steps = [
        ('tfidf', TfidfVectorizer()),
        ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
    ]
    search_space = {
        'tfidf__max_df': (0.25, 0.5, 0.75),
        'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
        "clf__estimator__C": [0.01, 0.1, 1],
        "clf__estimator__class_weight": ['balanced', None],
    }
    best_model, predicted = grid_search(
        train_text, train_label, validate_text, validate_label,
        search_space, Pipeline(steps),
    )
    predicted_arr = np.array(predicted)
    expected_arr = np.array(validate_label)
    hits = predicted_arr == expected_arr
    # Share of validation items predicted correctly.
    print(sum(hits) / len(predicted_arr))
    return best_model

In [343]:
# Tune the TF-IDF/SVM pipeline on column 1 (presumably the full bill text)
# with column 5 as the label; the validation split provides the hold-out data.
clasiffier_full = tf_idf_svm(train[1], train[5], validate[1], validate[5])

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    4.7s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   29.7s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   50.2s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:  1.3min
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  2.4min
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:  3.7min
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  4.5min
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:  5.7min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  6.6min
[Parallel(n_jobs=3)]: Done 108 out of 108 | elapsed:  8.0min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=0.1, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))]
Applying best classifier on test data:
0.8914285714285715


In [376]:
from sklearn.metrics import precision_recall_fscore_support

In [377]:
def precision_recall_f(clasifier, data, labels=None):
    """Score ``clasifier`` on ``data`` with binary precision, recall and F-score.

    Args:
        clasifier: Fitted estimator exposing ``predict``.
        data: Inputs to predict on.
        labels: Ground-truth labels aligned with ``data``. When omitted, the
            module-level ``test[5]`` column is used, which is only correct if
            ``data`` comes from the ``test`` split.

    Returns:
        The tuple produced by ``precision_recall_fscore_support`` with
        ``average='binary'``.
    """
    if labels is None:
        # Backward-compatible default: the test-split label column.
        labels = test[5]
    return precision_recall_fscore_support(labels, clasifier.predict(data), average='binary')

In [378]:
# Score the column-1 SVM model on column 1 of the test split.
precision_recall_f(clasiffier_full, test[1]) 

(0.7912087912087912, 0.8674698795180723, 0.8275862068965518, None)

In [346]:
# Same pipeline, trained on column 2 (presumably the 10%-of-lines sample).
clasifier_10_precent = tf_idf_svm(train[2], train[5], validate[2], validate[5])

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.6s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    2.7s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    5.0s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:    7.3s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   10.2s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   13.7s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   17.0s
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:   21.2s
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:   26.2s
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:   31.4s
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:   36.4s
[Parallel(n_jobs=3)]: Done 108 out of 108 | elapsed:   42.9s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))]
Applying best classifier on test data:
0.7942857142857143


In [379]:
# Score the column-2 SVM model on column 2 of the test split.
precision_recall_f(clasifier_10_precent, test[2]) 

(0.6836734693877551, 0.8072289156626506, 0.7403314917127072, None)

In [348]:
# Same pipeline, trained on column 3 (presumably the 10-line sample).
clasifier_10 = tf_idf_svm(train[3], train[5], validate[3], validate[5])

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Batch computation too fast (0.1371s.) Setting batch_size=2.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.6s
[Parallel(n_jobs=3)]: Done  18 tasks      | elapsed:    1.4s
[Parallel(n_jobs=3)]: Done  32 tasks      | elapsed:    2.5s
[Parallel(n_jobs=3)]: Done  46 tasks      | elapsed:    3.7s
[Parallel(n_jobs=3)]: Done  64 tasks      | elapsed:    5.2s
[Parallel(n_jobs=3)]: Done  82 tasks      | elapsed:    6.6s
[Parallel(n_jobs=3)]: Done 108 out of 108 | elapsed:    8.8s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=0.01, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))]
Applying best classifier on test data:
0.7885714285714286


In [380]:
# Score the column-3 SVM model on column 3 of the test split.
precision_recall_f(clasifier_10, test[3]) 

(0.6637168141592921, 0.9036144578313253, 0.7653061224489797, None)

In [350]:
# Same pipeline, trained on column 4 (presumably the single-line sample).
clasifier_1 = tf_idf_svm(train[4], train[5], validate[4], validate[5])

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Batch computation too fast (0.0567s.) Setting batch_size=6.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    0.4s
[Parallel(n_jobs=3)]: Done  42 tasks      | elapsed:    0.8s


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.25, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))]
Applying best classifier on test data:
0.64


[Parallel(n_jobs=3)]: Done 108 out of 108 | elapsed:    1.7s finished


In [381]:
# Score the column-4 SVM model on column 4 of the test split.
precision_recall_f(clasifier_1, test[4]) 

(0.46496815286624205, 0.8795180722891566, 0.6083333333333333, None)

In [460]:
import fasttext

In [472]:
def fastText(train_text, train_label, validate_text, validate_label, name):
    """Write a fastText training file and train a supervised model on it.

    Every training text is written to the file ``name`` on its own line as
    ``__label__<label> <text>``, with newlines inside the text replaced by
    spaces. The model output prefix is ``model_text_<name>``.

    Args:
        train_text: Iterable of training texts.
        train_label: Labels aligned with ``train_text``.
        validate_text: Not read here.
        validate_label: Not read here.
        name: Path of the training file to create; also used in the model
            output name.

    Returns:
        The object returned by ``fasttext.supervised``.
    """
    # Convert the labels once instead of rebuilding the array per row.
    labels = np.array(train_label)
    with open(name, 'w+') as file:
        for label, content in zip(labels, train_text):
            # One example per line: embedded newlines must be flattened.
            flattened = content.replace('\n', " ")
            file.write(f"__label__{label} {flattened}\n")
    # NOTE(review): `supervised` looks like the legacy fasttext entry point;
    # newer releases expose `train_supervised` - confirm the installed version.
    return fasttext.supervised(name, f'model_text_{name}')

In [473]:
# Train a fastText model on column 1; the training file is 'full.train'.
clasifier_full_fasttext = fastText(train[1], train[5], validate[1], validate[5], 'full.train')

In [474]:
# Raw fastText predictions for test column 1; `predicted` is not read again
# in the cells shown here.
predicted = clasifier_full_fasttext.predict(test[1])

In [475]:
def precision_recall_f_fasttext(clasifier_f, data, labels=None):
    """Score a fastText model with binary precision, recall and F-score.

    Args:
        clasifier_f: Trained fastText model exposing ``predict``.
        data: Iterable of texts to classify.
        labels: Ground-truth labels aligned with ``data``. When omitted, the
            module-level ``test[5]`` column is used, which is only correct if
            ``data`` comes from the ``test`` split.

    Returns:
        The tuple produced by ``precision_recall_fscore_support`` with
        ``average='binary'``.
    """
    if labels is None:
        # Backward-compatible default: the test-split label column.
        labels = test[5]
    # The training file is written with newlines replaced by spaces (see
    # fastText); apply the same flattening so that prediction sees text in
    # the same one-example-per-line shape.
    texts = [text.replace('\n', " ") for text in data]
    predicted_f = clasifier_f.predict(texts)
    # Each prediction is a sequence of labels; the first entry is converted
    # to an int.
    predicted_numbers = [int(candidates[0]) for candidates in predicted_f]
    return precision_recall_fscore_support(labels, predicted_numbers, average='binary')
    

In [476]:
# Score the column-1 fastText model on column 1 of the test split.
precision_recall_f_fasttext(clasifier_full_fasttext, test[1])

(0.4715909090909091, 1.0, 0.640926640926641, None)

In [477]:
# Train a fastText model on column 2; the training file is '10percent.train'.
clasifier_10_percent_fasttext = fastText(train[2], train[5], validate[2], validate[5], '10percent.train')

In [478]:
# Score the column-2 fastText model on column 2 of the test split.
precision_recall_f_fasttext(clasifier_10_percent_fasttext, test[2])

(0.4715909090909091, 1.0, 0.640926640926641, None)

In [479]:
# Train a fastText model on column 3; the training file is '10.train'.
# NOTE(review): this rebinds `clasifier_10`, which previously held the
# TF-IDF/SVM model trained on the same column.
clasifier_10 = fastText(train[3], train[5], validate[3], validate[5], '10.train')

In [480]:
# Score the column-3 fastText model on column 3 of the test split.
precision_recall_f_fasttext(clasifier_10, test[3])

(0.4715909090909091, 1.0, 0.640926640926641, None)

In [481]:
# Train a fastText model on column 4; the training file is '1.train'.
# NOTE(review): this rebinds `clasifier_1`, which previously held the
# TF-IDF/SVM model trained on the same column.
clasifier_1 = fastText(train[4], train[5], validate[4], validate[5], '1.train')

In [482]:
# Score the column-4 fastText model on column 4 of the test split, i.e. the
# column it was trained on. The call previously passed test[3]; the recorded
# output below predates this correction.
precision_recall_f_fasttext(clasifier_1, test[4])

(0.4715909090909091, 1.0, 0.640926640926641, None)