In [None]:
import json
import os
from glob import glob
from typing import List, Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from joblib import dump, load
from sklearn import metrics
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn_pandas import DataFrameMapper
from spacy.pipeline.functions import merge_entities
from tqdm import tqdm

from mair.pdf_parsing import parse

# Registers tqdm's `progress_apply` on pandas objects (used in preprocess_text).
tqdm.pandas()

# Directory holding the cached text dumps, and the two JSON files inside it.
DATASET_PATH = '../data/legal_docs_recognition/'
LEGAL_DOCS_PATH, NON_LEGAL_DOCS_PATH = (
    os.path.join(DATASET_PATH, file_name)
    for file_name in ('legal_docs.json', 'nonlegal_docs.json')
)

In [None]:
# glob() yields matches in an arbitrary, filesystem-dependent order. That order
# flows through parsing and the JSON dump into the DataFrame row order, which in
# turn decides what the seeded train/test split selects -- so sort the paths to
# make the whole run reproducible across machines.
other_docs = sorted(glob('../data/policydemics/other/*/*.pdf'))
singapore_docs = sorted(glob('../data/policydemics/singapore/*/*.pdf'))

In [None]:
# Number of PDFs discovered per corpus, as (other, singapore).
tuple(len(paths) for paths in (other_docs, singapore_docs))

In [None]:
def get_texts_from_pdfs(paths: List[str]) -> Dict[str,str]:
    """Parse every PDF in ``paths`` and collect its full text.

    Args:
        paths: Paths of the PDF files to parse.

    Returns:
        Mapping of path -> extracted text. A file whose parsing raises is
        reported with ``print`` and left out of the mapping (best effort).
    """
    texts_by_path = {}
    for pdf_path in tqdm(paths):
        try:
            texts_by_path[pdf_path] = parse(pdf_path).full_text
        except Exception as error:
            # Deliberately broad: one unreadable PDF must not abort the batch.
            print(pdf_path, 'exception:', error)
    return texts_by_path

In [None]:
# Extract the text of both corpora in one pass ("other" first, then "singapore").
parsed_texts = get_texts_from_pdfs([*other_docs, *singapore_docs])

In [None]:
# Partition the parsed texts by the label marker embedded in each file path:
# "S_L" / "nS_L" paths are legal documents, "S_nL" / "nS_nL" paths are not.
# Paths carrying neither marker are dropped.
legal_docs, not_legal_docs = {}, {}
for original_path, text in parsed_texts.items():
    # File name without its 4-character extension (".pdf").
    name = os.path.basename(original_path)[:-4]
    # "nS_L" contains "S_L" and "nS_nL" contains "S_nL", so a single substring
    # test per class covers both spellings.
    if "S_L" in original_path:
        target = legal_docs
    elif "S_nL" in original_path:
        target = not_legal_docs
    else:
        continue
    # NOTE(review): keys are bare file names, so two PDFs with the same name in
    # different directories overwrite each other -- confirm names are unique.
    target[name] = text

In [None]:
# Cache the extracted texts so later runs can skip PDF parsing.
# The target directory is created on demand, and `with` guarantees each file is
# flushed and closed before load_dataset() reads it back.
os.makedirs(DATASET_PATH, exist_ok=True)
with open(LEGAL_DOCS_PATH, 'w') as legal_file:
    json.dump(legal_docs, legal_file)
with open(NON_LEGAL_DOCS_PATH, 'w') as nonlegal_file:
    json.dump(not_legal_docs, nonlegal_file)

In [None]:
def load_dataset(legal_docs_path=None, nonlegal_docs_path=None):
    """Build the labelled dataset from the two cached JSON dumps.

    Args:
        legal_docs_path: JSON file mapping document name -> text for legal
            documents. Defaults to ``LEGAL_DOCS_PATH``.
        nonlegal_docs_path: JSON file mapping document name -> text for
            non-legal documents. Defaults to ``NON_LEGAL_DOCS_PATH``.

    Returns:
        DataFrame with columns ``text`` and ``label`` (1 = legal, 0 = not
        legal); legal rows come first. The columns are present even when both
        files are empty.
    """
    if legal_docs_path is None:
        legal_docs_path = LEGAL_DOCS_PATH
    if nonlegal_docs_path is None:
        nonlegal_docs_path = NON_LEGAL_DOCS_PATH

    # Context managers close the files deterministically.
    with open(legal_docs_path, 'r', encoding='utf-8') as legal_file:
        legal_docs_texts = list(json.load(legal_file).values())
    with open(nonlegal_docs_path, 'r', encoding='utf-8') as nonlegal_file:
        nonlegal_docs_texts = list(json.load(nonlegal_file).values())

    records = (
        [{"text": text, "label": 1} for text in legal_docs_texts]
        + [{"text": text, "label": 0} for text in nonlegal_docs_texts]
    )
    # Explicit columns keep the schema stable for an empty dataset.
    return pd.DataFrame(records, columns=["text", "label"])

In [None]:
# Labelled corpus read from the cached JSON files: one row per document,
# with a `text` column and a `label` column (1 = legal, 0 = not legal).
df = load_dataset()

In [None]:
# Class balance: how many documents carry each label.
df["label"].hist()

In [None]:
# Stratified hold-out split with a fixed seed, so both parts keep the class
# ratio of the full dataset and the split is repeatable.
train, test = train_test_split(
    df,
    stratify=df["label"],
    random_state=123,
)

In [None]:
# spaCy's `en_core_web_sm` pipeline; preprocess_text applies it to every document.
nlp = spacy.load("en_core_web_sm")

def get_processed_words(doc):
    """Reduce a parsed document to a space-separated string of normalised tokens.

    Stop words are dropped, as are tokens that are neither alphabetic nor part
    of a named entity. A token with an entity type is replaced by that type;
    any other kept token is replaced by its lower-cased lemma.

    Args:
        doc: Iterable of tokens exposing ``is_stop``, ``is_alpha``,
            ``ent_type_`` and ``lemma_``.

    Returns:
        The kept tokens joined by single spaces ("" when nothing is kept).
    """
    kept = []
    for token in doc:
        if token.is_stop:
            continue
        if not (token.is_alpha or token.ent_type_):
            continue
        # Entity label when present, otherwise the lower-cased lemma.
        kept.append(token.ent_type_ or token.lemma_.lower())
    return ' '.join(kept)

def preprocess_text(texts):
    """Normalise raw document texts into strings ready for vectorisation.

    Newlines become spaces and runs of spaces are collapsed. Each text is then
    parsed with ``nlp``, its entities are merged into single tokens with
    ``merge_entities``, and the result is reduced by ``get_processed_words``.

    Args:
        texts: pandas Series of raw document strings.

    Returns:
        pandas Series of processed strings, aligned with ``texts``.
    """
    flattened = texts.str.replace('\n', ' ', regex=False)
    # ' +' is a regular expression. It must be requested explicitly: since
    # pandas 2.0 `str.replace` treats the pattern as a literal by default,
    # which would silently leave repeated spaces untouched.
    collapsed = flattened.str.replace(' +', ' ', regex=True)
    docs = collapsed.progress_apply(nlp)
    docs = docs.progress_apply(merge_entities)
    out_texts = docs.progress_apply(get_processed_words)
    return out_texts

# Wrap the preprocessing function so it can be used as a pipeline step.
text_preprocessing = FunctionTransformer(preprocess_text)

## Preprocessing, model training

In [None]:
# Stage 1 of feature selection: keep the 10,000 tokens scoring highest on
# mutual information with the label.
# NOTE(review): presumably the vocabulary has at least 10,000 tokens -- confirm,
# since k larger than the number of features is rejected or warned about,
# depending on the scikit-learn version.
mutual_info_selector = SelectKBest(mutual_info_classif, k=10000)

# Stage 2: cross-validated recursive feature elimination with an L1-penalised
# logistic regression, removing 10 features per step and keeping at least 20.
recurse_importance_selector = RFECV(
    estimator=LogisticRegression(
        penalty="l1",
        solver="saga",
        # saga shuffles the data while optimising; without a seed the selected
        # features change from run to run. Same seed as the train/test split.
        random_state=123,
    ),
    min_features_to_select=20,
    n_jobs=-1,
    verbose=True,
    step=10,
)

# Final model, trained on the selected features only.
classifier = LogisticRegression(penalty="l2")

# Step names are looked up again when the feature importances are extracted,
# so they must stay as they are.
pipeline = Pipeline(
    [
        (
            "preprocessing",
            Pipeline(
                [
                    ("text_processor", text_preprocessing),
                    ("count_vectorizer", CountVectorizer()),
                ]
            ),
        ),
        (
            "feature_selection",
            Pipeline(
                [
                    ("mutual_info_selector", mutual_info_selector),
                    ("recurse_importance_selector", recurse_importance_selector),
                ]
            ),
        ),
        ("classifier", classifier),
    ]
)

In [None]:
# Model inputs (raw text) and targets (0/1 label) for training.
X_train, y_train = train["text"], train["label"]

In [None]:
# Fits every stage in order: text preprocessing, count vectorisation,
# both feature selectors, and the final classifier.
pipeline.fit(X_train, y_train)

In [None]:
# Accuracy on the data the model was fitted on -- an optimistic sanity check,
# not an estimate of generalisation.
y_pred = pipeline.predict(X_train)

accuracy_score(y_true=y_train, y_pred=y_pred)

In [None]:
# Held-out inputs and targets, and the fitted pipeline's predictions for them.
X_test, y_test = test["text"], test["label"]

pred_test = pipeline.predict(X_test)

In [None]:
# Accuracy on the held-out test split.
accuracy_score(y_test,pred_test)

## Baseline

In [None]:
# Reference score from a classifier that ignores the input text.
# NOTE(review): DummyClassifier's default strategy depends on the scikit-learn
# version -- consider naming the strategy explicitly.
baseline = DummyClassifier()
baseline.fit(X_train, y_train)
baseline_pred = baseline.predict(X_test)
accuracy_score(y_test, baseline_pred)

## Importance

In [None]:
preprocessing = pipeline.named_steps["preprocessing"]
feature_selection = pipeline.named_steps["feature_selection"]
count_vectorizer = preprocessing.named_steps["count_vectorizer"]

# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; fall back to the old name on older installs.
if hasattr(count_vectorizer, "get_feature_names_out"):
    vocabulary = count_vectorizer.get_feature_names_out()
else:
    vocabulary = count_vectorizer.get_feature_names()

# Apply the two selectors' boolean masks in pipeline order, so the remaining
# names line up one-to-one with the classifier's coefficients.
feature_names = np.array(vocabulary)[
    feature_selection.named_steps["mutual_info_selector"].get_support()
][
    feature_selection.named_steps["recurse_importance_selector"].get_support()
]

data = {
    "feature_names": feature_names,
    "feature_importance": pipeline.named_steps["classifier"].coef_[0],
}

# Sort by decreasing coefficient. Re-assigning (instead of sorting in place)
# keeps the cell idempotent when it is re-run.
fi_df = pd.DataFrame(data).sort_values(by=["feature_importance"], ascending=False)

# Tall figure: one horizontal bar per selected feature.
plt.figure(figsize=(12, 18))
# Seaborn bar chart of coefficient per feature.
sns.barplot(x=fi_df["feature_importance"], y=fi_df["feature_names"])

In [None]:
# ROC curve on the test split, scored with the second probability column
# (the "legal" class when the labels are 0 and 1).
pred_proba_test = pipeline.predict_proba(X_test)
positive_scores = pred_proba_test[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, positive_scores)
plt.plot(fpr, tpr)

In [None]:
# Persist the fitted pipeline to the current working directory.
# NOTE(review): the pipeline wraps `preprocess_text`, which is defined in this
# notebook; pickle-based formats store functions by reference, so loading this
# file presumably needs that function (and `nlp`) defined under the same name
# -- confirm in the consuming code.
dump(pipeline, 'classification_pipeline.joblib') 

In [None]:
# Show the first misclassified test document.
# X_test keeps the shuffled row labels of the original frame, so `[0]` would
# look up the *label* 0 and normally raise KeyError; `.iloc[0]` selects by
# position instead. Evaluates to None when every test document is correct.
misclassified_texts = X_test[y_test != pred_test]
misclassified_texts.iloc[0] if len(misclassified_texts) else None