# Legalis SKOPS Predictor
#### Notebook used to create and publish an sckit-learn text classification modek for outcome prediction

## Libary and Dataset Import

In [27]:
# imports for utility
import pandas as pd
import matplotlib
import pickle
from pathlib import Path
from tempfile import mkdtemp, mkstemp
import os

#nltk imports for stop words
import nltk
nltk.download('stopwords')

#sklearn import for model creation
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

#imports to pull/push to/from huggingface hub and create an interactive pipeline
from sklearn.pipeline import Pipeline
import datasets as ds
from skops import card, hub_utils


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### preprocessing dataset again (stripping down and changing names)

In [2]:
dataset=ds.load_dataset("lennardzuendorf/legalis")
dataset=ds.concatenate_datasets([dataset['train'], dataset['test']])
dataset=dataset.remove_columns(['id', 'file_number', 'date', 'type', 'content', 'tenor','reasoning'])
dataset=dataset.rename_column('facts', 'text')
dataset=dataset.rename_column('winner', 'target')

print(dataset)

Downloading readme:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to C:/Users/lenna/.cache/huggingface/datasets/lennardzuendorf___parquet/lennardzuendorf--legalis-a2e01f8f34ef8796/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/79.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/2660 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/141 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to C:/Users/lenna/.cache/huggingface/datasets/lennardzuendorf___parquet/lennardzuendorf--legalis-a2e01f8f34ef8796/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'target', 'label'],
    num_rows: 2801
})


In [3]:
def count_dist(dataset):

    counter_zero=0
    counter_one=0

    for case in dataset:
        if case['label']==1:
            counter_one+=1
        elif case['label']==0:
            counter_zero+=1

    dist=[round(counter_zero/(counter_zero+counter_one),2),round(counter_one/(counter_zero+counter_one),2)]

    data={'Verklagte*r': [counter_zero, dist[0]], 'Klaeger*in': [counter_one, dist[1]]}
    index=['case nmb.', 'case dist.']

    return pd.DataFrame(data=data, index=index)

## Predifining Function to run several times with different dataset sizes
#### function for dataframe creation with variable size and splitting into test, train

In [3]:
def create_dataset(size, test_size):

    df_dataset=pd.DataFrame(dataset[:size])
    df_train, df_test = train_test_split(df_dataset, test_size=test_size)

    return df_train, df_test

#### function to vectorize words and get rid of german stopwords with nltk

In [4]:
from nltk.corpus import stopwords
stop_words=stopwords.words('german')

def vectorize_words_stop(df_train, df_test):

    vec = CountVectorizer(
        ngram_range=(1, 3), 
        stop_words=stop_words,
    )

    X_train = vec.fit_transform(df_train.text)
    X_test = vec.transform(df_test.text)

    y_train = df_train.target
    y_test = df_test.target

    return X_train, X_test, y_train, y_test

#### scikit-learn multimodal model run block and returning a classification report as dictionary

In [5]:
def run_multimodal(X_train, y_train, X_test, y_test):
    
    nb = MultinomialNB()
    nb.fit(X_train, y_train)

    predict = nb.predict(X_test)
    return classification_report(y_test, predict, output_dict=True)

In [37]:
def run_forest(X_train, y_train, X_test, y_test):

    rf = RandomForestClassifier(max_depth=2, random_state=0)
    rf.fit(X_train, y_train)

    predict = rf.predict(X_test)
    return  classification_report(y_test, predict, output_dict=True)

## Running Model with different dataset sizes
#### function to run model with different sizes or with/without stop words and extract stats from it

In [30]:
def model_runner(run_list, mode):
    
    output=[]

    for run in run_list:
        print("running with "+str(run)+" cases")

        df_train, df_test = create_dataset(run, 0.2)
        X_train, X_test, y_train, y_test = vectorize_words_stop(df_train, df_test)
        if mode == "forest":
            report = run_forest(X_train, y_train, X_test, y_test)
        elif mode == "multimodal":
            report = run_multimodal(X_train, y_train, X_test, y_test)
        else:
            raise Exception("mode not defined")
        output.append(report)
    
    return output

def stat_extractor(run_stats, run_nmbs):
    data={}
    index=['precision Klaeger*in', 'precision Verklagte*r', 'overall accuracy', 'macro avg', 'weighted avg']
    i=0

    while i<len(run_stats):
        df=pd.DataFrame.from_dict(run_stats[i])
        values = df.values[:1][0].round(decimals=3)
        run_data={"run with "+str(run_nmbs[i]): values}
        data.update(run_data)
        i=+1

    return pd.DataFrame(data=data, index=index)

In [44]:
run_nmbs=[2800]
run_stats=model_runner(run_nmbs, mode="forest")
run_stats=stat_extractor(run_stats, run_nmbs)

display(run_stats)

running with 2800 cases


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,run with 2800
precision Klaeger*in,0.0
precision Verklagte*r,0.595
overall accuracy,0.595
macro avg,0.297
weighted avg,0.354


## Making best model available on the huggingface hub
#### creating a pipeline with sklearn pipeline

In [7]:
model = Pipeline(
    [
        ("count", CountVectorizer(ngram_range=(1, 3), stop_words=stop_words,)),
        ("clf", MultinomialNB()),
    ]
)

df_train, df_test = create_dataset(2801, 0.05)

X_train, X_test, y_train, y_test = train_test_split(
    df_train.text, df_train.target, test_size=0.05, random_state=42
)

model.fit(X_train, y_train)

#### creating a model card and some statistics to go with it

In [8]:
_, pkl_name = mkstemp(prefix="skops-", suffix=".pkl")

with open(pkl_name, mode="bw") as f:
    pickle.dump(model, file=f)

local_repo = mkdtemp(prefix="skops-")

hub_utils.init(
    model=pkl_name,
    requirements=[f"scikit-learn={sklearn.__version__}"],
    dst=local_repo,
    task="text-classification",
    data=X_test,
)

TypeError: Object of type Series is not JSON serializable