In [None]:
import ast
import random
import re
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    matthews_corrcoef,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    MultiLabelBinarizer,
    StandardScaler,
)
from sklearn.svm import LinearSVC


In [None]:
# Court whose rulings are processed. remove_uncommon() reads this module-level
# value to choose the procedure keep-list; it recognises 'hr', 'rb' and 'rvs'.
court = 'hr' 


In [None]:
def get_date(x):
    """Parse a 'YYYY-MM-DD' string into a `datetime` (time set to midnight)."""
    return datetime.strptime(x, '%Y-%m-%d')

In [None]:
def get_court_citations_amount(x, year):
    """Count the references in `x` whose target ECLI is dated in or before `year`.

    Parameters
    ----------
    x : str or missing value
        String representation of a list of reference dicts, each holding a
        "target_ecli" key. The year is read from the fourth colon-separated
        field of that ECLI. A missing value (NaN / None) counts as no references.
    year : int
        Upper bound (inclusive) on the year of the cited ECLI.

    Returns
    -------
    int
        Number of matching references; 0 when `x` is missing.
    """
    # pd.isna also catches NaN objects that are not the `np.nan` singleton.
    if pd.isna(x):
        return 0

    # SECURITY: the cell content comes from a CSV file, so it is parsed with
    # ast.literal_eval (literals only) rather than eval (arbitrary code).
    references = ast.literal_eval(x)

    return sum(
        1
        for reference in references
        if int(reference["target_ecli"].split(':')[3]) <= year
    )
    


In [None]:
def get_formal_citations(x):
    """Count the references in `x` whose "type" contains "eerdereaanleg".

    Parameters
    ----------
    x : str or missing value
        String representation of a list of reference dicts, each holding a
        "type" key. A missing value (NaN / None) counts as no references.

    Returns
    -------
    int
        Number of matching references; 0 when `x` is missing.
    """
    # pd.isna also catches NaN objects that are not the `np.nan` singleton.
    if pd.isna(x):
        return 0

    # SECURITY: parse CSV text with ast.literal_eval (literals only), not eval.
    references = ast.literal_eval(x)

    return sum(1 for reference in references if "eerdereaanleg" in reference["type"])

In [None]:
#ADAPTED FROM Schepers et al. (2023)
def get_court_citations(x, chosen_court, year):
    """Count the references in `x` that target `chosen_court` and are dated in or before `year`.

    Parameters
    ----------
    x : str or missing value
        String representation of a list of reference dicts, each holding a
        "target_ecli" key. The court code is the third and the year the fourth
        colon-separated field of that ECLI. A missing value counts as no references.
    chosen_court : str
        Court code to match exactly (the callers in this notebook pass e.g. 'HR', 'RVS').
    year : int
        Upper bound (inclusive) on the year of the cited ECLI.

    Returns
    -------
    int
        Number of matching references; 0 when `x` is missing.
    """
    # No reference-type filter is applied here: matching is on court code and year only.
    # pd.isna also catches NaN objects that are not the `np.nan` singleton.
    if pd.isna(x):
        return 0

    # SECURITY: parse CSV text with ast.literal_eval (literals only), not eval.
    references = ast.literal_eval(x)

    count = 0
    for reference in references:
        fields = reference["target_ecli"].split(':')
        court_name = fields[2]
        year_ecli = int(fields[3])
        if court_name == chosen_court and year_ecli <= year:
            count += 1
    return count

In [None]:
def get_incoming_citations(x, year):
    """Count the references in `x` dated in or after `year`, excluding "latereaanleg" relations.

    Parameters
    ----------
    x : str or missing value
        String representation of a list of reference dicts, each holding
        "target_ecli" and "type" keys. The year is the fourth colon-separated
        field of the ECLI. A missing value counts as no references.
    year : int
        Lower bound (inclusive) on the year of the referenced ECLI.

    Returns
    -------
    int
        Number of matching references; 0 when `x` is missing.
    """
    # pd.isna also catches NaN objects that are not the `np.nan` singleton.
    if pd.isna(x):
        return 0

    # SECURITY: parse CSV text with ast.literal_eval (literals only), not eval.
    references = ast.literal_eval(x)

    count = 0
    for reference in references:
        year_ecli = int(reference["target_ecli"].split(':')[3])
        if "latereaanleg" not in reference["type"] and year_ecli >= year:
            count += 1
    return count



In [None]:
def get_outgoing_legislations(x):
    """Return the number of entries in the stringified collection `x`.

    Parameters
    ----------
    x : str or missing value
        String representation of a collection (the notebook passes the
        'legislations_cited' column). A missing value counts as empty.

    Returns
    -------
    int
        Number of entries; 0 when `x` is missing.
    """
    # pd.isna also catches NaN objects that are not the `np.nan` singleton.
    if pd.isna(x):
        return 0

    # SECURITY: parse CSV text with ast.literal_eval (literals only), not eval.
    return len(ast.literal_eval(x))

In [None]:
#ADAPTED FROM Schepers et al. (2023)

class MultiHotEncoder(BaseEstimator, TransformerMixin):
    """Wraps `MultiLabelBinarizer` in a form that can work with `ColumnTransformer`.

    One `MultiLabelBinarizer` is fitted per input column and the per-column
    indicator matrices are concatenated horizontally. Input X has to be a
    `pandas.DataFrame`.

    Each cell may hold an iterable of labels or a single string label. A plain
    string is treated as ONE label: `MultiLabelBinarizer` iterates every sample,
    so a bare string would otherwise be split into its individual characters.
    A missing scalar (None / NaN) is treated as an empty label set.
    """

    def __init__(self):
        self.mlbs = list()
        self.n_columns = 0
        self.categories_ = self.classes_ = list()

    @staticmethod
    def _as_label_sets(column):
        """Convert every cell of `column` into an iterable of labels."""
        def to_labels(value):
            if isinstance(value, str):
                return [value]
            if value is None or (isinstance(value, float) and np.isnan(value)):
                return []
            return value

        return column.map(to_labels)

    def fit(self, X: pd.DataFrame, y=None):
        """Fit one binarizer per column of X. Refitting discards any previous fit."""
        # Reset the fitted state so a second call to fit() does not append to
        # the first one (which would double n_columns and make transform() fail).
        self.mlbs = list()
        self.n_columns = 0
        self.categories_ = self.classes_ = list()

        for i in range(X.shape[1]):  # X can be of multiple columns
            mlb = MultiLabelBinarizer()
            mlb.fit(self._as_label_sets(X.iloc[:, i]))
            self.mlbs.append(mlb)
            self.classes_.append(mlb.classes_)
            self.n_columns += 1
        return self

    def transform(self, X: pd.DataFrame):
        """Return the horizontally concatenated indicator matrices for X.

        Raises
        ------
        ValueError
            If the transformer has not been fitted, or if X does not have the
            number of columns seen during fit.
        """
        if self.n_columns == 0:
            raise ValueError('Please fit the transformer first.')
        if self.n_columns != X.shape[1]:
            raise ValueError(f'The fit transformer deals with {self.n_columns} columns '
                             f'while the input has {X.shape[1]}.'
                            )
        result = [
            self.mlbs[i].transform(self._as_label_sets(X.iloc[:, i]))
            for i in range(self.n_columns)
        ]
        return np.concatenate(result, axis=1)

    def get_feature_names_out(self, input_features=None):
        """Return the list of per-column class arrays learned during fit.

        `input_features` is accepted for compatibility with the scikit-learn
        calling convention and is ignored.
        """
        return self.classes_


In [None]:
#ADAPTED FROM Schepers et al. (2023)
def remove_uncommon(values, col_name, court_name=None):
    """Combine a raw column value into a larger category.

    Parameters
    ----------
    values : str
        A single raw cell value; surrounding whitespace is stripped.
    col_name : str
        'procedure' or 'subject'. Any other name yields an empty string.
    court_name : str, optional
        Court whose procedure keep-list is used ('hr', 'rb' or 'rvs'). When
        omitted, the module-level `court` variable is used. The keep-lists
        contain all values that occur in more than 1% of the rows.

    Returns
    -------
    str
        * procedure: the (mapped) value if it is in the keep-list, an empty
          string for the '-' placeholder, otherwise 'Other procedure'.
        * subject: the part of the value before the first ';'.

    Raises
    ------
    AssertionError
        If the court is not one of the known courts (procedure only).
    """
    VALUE_MAP = {
        "Cassatie in het belang der wet": "Cassatie",
        "Voorlopige voorziening+bodemzaak": "Voorlopige voorziening",
    }
    KEEP_BY_COURT = {
        'hr': ['Cassatie', 'Cassatie in het belang der wet', 'Artikel 81 RO-zaken'],
        'rb': ['Eerste aanleg - enkelvoudig', 'Eerste aanleg - meervoudig'],
        'rvs': ['Hoger beroep', 'Eerste aanleg - meervoudig', 'Eerste aanleg - enkelvoudig', 'Voorlopige voorziening'],
    }
    value = values.strip()

    if col_name == 'procedure':
        if court_name is None:
            court_name = court  # fall back to the notebook-level setting
        if court_name not in KEEP_BY_COURT:
            raise AssertionError("Unknown court")

        converted = VALUE_MAP.get(value, value)
        if converted in KEEP_BY_COURT[court_name]:
            return converted
        if value == '-':
            return ""
        return 'Other procedure'

    if col_name == 'subject':
        # Only include the main part of the law area.
        # NOTE(review): the previous condition
        #   r != 'Internationaal publiekrecht' or r != '-'
        # was always true, so the 'other' fallback never ran. That behaviour is
        # kept on purpose: later cells drop the rows whose subject is '-' or
        # 'Internationaal publiekrecht', which only works if those values pass
        # through unchanged. Confirm before mapping them to 'other' here.
        return value.split(';')[0]

    return ""


In [None]:
def get_length(string):
    """Return the number of whitespace-separated tokens in `string`."""
    # Runs of newlines are collapsed to a single space before splitting on whitespace.
    return len(re.sub(r'\n+', ' ', string).split())

In [None]:
def get_first_citation_dif(x, year):
    """Return the number of years between `year` and the earliest qualifying reference in `x`.

    A reference qualifies when its "type" does not contain "latereaanleg" and
    the year of its "target_ecli" (fourth colon-separated field) is `year` or later.

    Parameters
    ----------
    x : str or missing value
        String representation of a list of reference dicts with "target_ecli"
        and "type" keys.
    year : int
        Year of the ruling the references belong to.

    Returns
    -------
    int or float
        `min(qualifying years) - year`, or NaN when `x` is missing or no
        reference qualifies.
    """
    # pd.isna also catches NaN objects that are not the `np.nan` singleton.
    if pd.isna(x):
        return np.nan

    # SECURITY: parse CSV text with ast.literal_eval (literals only), not eval.
    references = ast.literal_eval(x)

    ecli_years = []
    for reference in references:
        year_ecli = int(reference["target_ecli"].split(':')[3])
        if "latereaanleg" not in reference["type"] and year_ecli >= year:
            ecli_years.append(year_ecli)

    if not ecli_years:
        return np.nan
    return min(ecli_years) - year
        

In [None]:
# Load data 

start = time.time()
print('Loading dataframe...')
# Raw string literal: the Windows path contains backslashes that would otherwise
# be read as (invalid) escape sequences such as "\D" and "\T".
df = pd.read_csv(r"D:\DSS D-schijf\Thesis\data\HR_rechtspraak_metadata_citations_full.csv")
print(f'Done loading dataframe in {time.time() - start} seconds.')

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# (rows, columns) of the raw data, before any filtering.
df.shape

In [None]:
# Convert the 'YYYY-MM-DD' strings to datetimes.
# pd.to_datetime is vectorised and always produces a datetime64 column, whereas a
# row-wise strptime assigned through `.loc[:, col]` leaves the resulting dtype
# dependent on the installed pandas version.
df['date_decision'] = pd.to_datetime(df['date_decision'], format='%Y-%m-%d')

In [None]:
# Add a 'year' column holding the calendar year of each decision date.
df.loc[:,'year'] = df.loc[:,'date_decision'].apply(lambda x: x.year)

In [None]:
# Add 'first_cit_dif': years between the ruling's year and its earliest
# qualifying incoming citation (NaN when there is none).
df.loc[:,'first_cit_dif'] = df.apply(lambda x: get_first_citation_dif(x['citations_incoming'], x['year']), axis = 1)


In [None]:
# Plot the cumulative number of rulings by years until their first incoming citation.

# Copy the column of interest into a small plotting frame.
df_graph = pd.DataFrame()
df_graph['first_cit_dif'] = df.loc[:,'first_cit_dif']
# NOTE(review): with pd.cut's default right-closed bins the first bin is (0, 1],
# so a difference of exactly 0 (first cited in the ruling's own year) appears to
# fall outside every bin and be left out of the counts — confirm this is intended.
bins = [0, 1, 2, 3, 5, 7, 9 ,float('inf')]  # float('inf') represents infinity for the upper bound

df_graph['Category'] = pd.cut(df_graph['first_cit_dif'], bins=bins)


# Count the occurrences of each category, then accumulate them bin by bin
category_counts = df_graph['Category'].value_counts().sort_index()
category_counts = category_counts.cumsum()
category_counts = category_counts.reset_index()
category_counts.columns = ['Category', 'Count']  # Rename columns for clarity

# Create a bar plot using Seaborn
plt.figure(figsize=(10, 6))
sns.barplot(x='Category', y='Count', data=category_counts, color='#0a6ca8')

# Write each (cumulative) count just above its bar
for i, v in enumerate(category_counts['Count']):
    plt.text(i, v + 0.1, str(v), ha='center', va='bottom', fontsize=10)


plt.xlabel('Years since ruling date')
plt.ylabel('Count')
plt.title('Distribution of first incoming citation for Supreme Court rulings')
# One custom label per bin, in bin order
plt.gca().xaxis.set_ticklabels(['1', '2', '3', '4 - 5', '6 - 7', '8 - 9', '10 - inf'])  # Set custom labels

#plt.xticks(rotation=30)  # Rotate x labels for better visibility
plt.show()

# NOTE(review): if the figure is to be saved, savefig should be called before plt.show().
#plt.savefig("./figs/Distribution_tokens_HR")

In [None]:
# Sort the rulings chronologically by decision date.
df = df.sort_values(by=['date_decision'])

In [None]:
# Renumber the rows 0..n-1 in the new chronological order (old index discarded).
df.reset_index(drop=True, inplace=True)

In [None]:
# Keep only rulings decided before 1 April 2019 (rulings that have not yet
# existed for 5 years are removed). The frame was already sorted above.
df = df.loc[(df['date_decision'] < datetime(2019, 4, 1))]

In [None]:
# (rows, columns) after the date filter.
df.shape

In [None]:
# Display the filtered frame.
df

In [None]:
# Inspect the number of missing values per column.
df.isnull().sum()

In [None]:
# Inspect the raw 'subject' values and their frequencies.
df.loc[:,'subject'].value_counts()

In [None]:
# Reduce 'subject' to fewer categories: missing values become the '-' placeholder,
# then every value is passed through remove_uncommon(..., 'subject').
df.loc[:,'subject'] = df.loc[:,'subject'].replace(np.nan, '-')
df.loc[:,'subject'] = df.loc[:,'subject'].map(lambda x: remove_uncommon(x, 'subject'))

In [None]:
# Re-check the 'subject' frequencies after grouping.
df.loc[:,'subject'].value_counts()

In [None]:
# Remove rows whose subject is the '-' placeholder that was substituted for
# missing values (1 row at the time of writing).
df.drop(df[df['subject'] == '-'].index, inplace=True)

In [None]:
# Remove rows whose subject is 'Internationaal publiekrecht', treated here as a
# wrong value (1 row at the time of writing).
df.drop(df[df['subject'] == 'Internationaal publiekrecht'].index, inplace=True)

In [None]:
# Inspect the raw 'procedure' values and their frequencies.
df.loc[:,'procedure'].value_counts()

In [None]:
# Group the least frequent 'procedure' values together: missing values become
# the '-' placeholder, then every value is passed through
# remove_uncommon(..., 'procedure').
df.loc[:,'procedure'] = df.loc[:,'procedure'].replace(np.nan, '-')
df.loc[:,'procedure'] = df.loc[:,'procedure'].map(lambda x: remove_uncommon(x, 'procedure'))


In [None]:
# Re-check the 'procedure' frequencies after grouping.
df.loc[:,'procedure'].value_counts()

In [None]:
# Incoming citations: count per ruling, plus a binary "cited at least once" flag
# (split_data later uses 'cit_in_binary' as the prediction target).
df.loc[:,'cit_in_count'] = df.apply(lambda x: get_incoming_citations(x['citations_incoming'], x['year']), axis = 1)
df.loc[:,'cit_in_binary'] = np.where(df.loc[:,'cit_in_count'] > 0,1,0)

# Outgoing citations: total count of references dated in or before the ruling's year
df.loc[:,'cit_out_count'] = df.apply(lambda x: get_court_citations_amount(x['citations_outgoing'], x['year']), axis=1)

# Outgoing citations per court code (HR, RVS, CBB, CRVB)
df.loc[:,'cit_out_sc_count'] = df.apply(lambda x: get_court_citations(x['citations_outgoing'],'HR', x['year']), axis=1)
df.loc[:,'cit_out_cs_count'] = df.apply(lambda x: get_court_citations(x['citations_outgoing'],'RVS', x['year']), axis=1)
df.loc[:,'cit_out_cbb_count'] = df.apply(lambda x: get_court_citations(x['citations_outgoing'],'CBB', x['year']), axis=1)
df.loc[:,'cit_out_crvb_count'] = df.apply(lambda x: get_court_citations(x['citations_outgoing'],'CRVB', x['year']), axis=1)


# Sum of the four per-court counts above, and the remainder of the total
df.loc[:,'cit_out_supremes_count'] = df.loc[:,'cit_out_cbb_count'] + df.loc[:,'cit_out_crvb_count'] + df.loc[:,'cit_out_cs_count'] + df.loc[:,'cit_out_sc_count']
df.loc[:,'cit_out_not_supremes_count'] = df.loc[:,'cit_out_count'] - df.loc[:,'cit_out_supremes_count']

# Formal relations: references to court code 'PHR'
df.loc[:,'cit_phr_count'] = df.apply(lambda x: get_court_citations(x['citations_outgoing'],'PHR', x['year']), axis=1)

# References whose type contains "eerdereaanleg", and that count minus the PHR count.
# NOTE(review): the two counts use different filters (type vs. court code and year),
# so the difference may not be a strict subset size — confirm it cannot go negative.
df.loc[:,'cit_formal_count'] = df.loc[:,'citations_outgoing'].apply(lambda x: get_formal_citations(x))
df.loc[:,'cit_not_phr_count'] = df.loc[:,'cit_formal_count'] - df.loc[:,'cit_phr_count']

# Number of entries in 'legislations_cited'
df.loc[:,'legislation_count'] = df['legislations_cited'].apply(lambda x: get_outgoing_legislations(x))

In [None]:
# Column dtypes and non-null counts after adding the derived features.
df.info()

In [None]:
# Class balance of the binary "cited at least once" flag.
df.loc[:,'cit_in_binary'].value_counts()

In [None]:
# Frequencies of the legislation counts.
df.loc[:,'legislation_count'].value_counts()

In [None]:
# Collapse every run of newlines (and the whitespace following each) into ' \n'.
# Summary
df.loc[:,'summary'] = df.loc[:,'summary'].apply(lambda x: re.sub(r'(\n\s*)+', ' \n', x))

# Decision (full text)
df.loc[:,'full_text'] = df.loc[:,'full_text'].apply(lambda x: re.sub(r'(\n\s*)+', ' \n', x))


In [None]:
# Token counts (whitespace-separated) of both text fields.
# Summary
df.loc[:,'summary_length'] = df.loc[:,'summary'].apply(lambda x: get_length(x))

# Decision (full text)
df.loc[:,'full_text_length'] = df.loc[:,'full_text'].apply(lambda x: get_length(x))

In [None]:
# Summary statistics with every cell formatted to three decimals.
# Column-wise Series.map is used instead of DataFrame.applymap, which is
# deprecated in recent pandas releases; the result is the same element-wise formatting.
df.describe().apply(lambda col: col.map(lambda x: f"{x:0.3f}"))

In [None]:
# Checkpoint the processed frame. Raw string literal so the backslashes in the
# Windows path are not read as (invalid) escape sequences.
df.to_csv(r"D:\DSS D-schijf\Thesis\data\HR_full_processed.csv", index=False)

In [None]:
start = time.time()
print('Loading dataframe...')
# Reload the processed checkpoint. Raw string literal so the backslashes in the
# Windows path are not read as (invalid) escape sequences.
# Note: after the CSV round trip 'date_decision' holds strings again.
df = pd.read_csv(r"D:\DSS D-schijf\Thesis\data\HR_full_processed.csv")
print(f'Done loading dataframe in {time.time() - start} seconds.')

In [None]:
def get_date(x):
    """Parse a 'YYYY-MM-DD' string into a `datetime` (repeat of the earlier definition)."""
    return datetime.strptime(x, '%Y-%m-%d')

In [None]:
def get_incoming_citations(x, year):
    """Count the references in `x` dated in or after `year`, excluding "latereaanleg" relations.

    Parameters
    ----------
    x : str or missing value
        String representation of a list of reference dicts, each holding
        "target_ecli" and "type" keys. The year is the fourth colon-separated
        field of the ECLI. A missing value counts as no references.
    year : int
        Lower bound (inclusive) on the year of the referenced ECLI.

    Returns
    -------
    int
        Number of matching references; 0 when `x` is missing.
    """
    # pd.isna also catches NaN objects that are not the `np.nan` singleton.
    if pd.isna(x):
        return 0

    # SECURITY: parse CSV text with ast.literal_eval (literals only), not eval.
    references = ast.literal_eval(x)

    count = 0
    for reference in references:
        year_ecli = int(reference["target_ecli"].split(':')[3])
        if "latereaanleg" not in reference["type"] and year_ecli >= year:
            count += 1
    return count



In [None]:
# Plot the distribution of incoming-citation counts per ruling.

# Copy the column of interest into a small plotting frame.
df_graph = pd.DataFrame()
df_graph['cit_in_count'] = df.loc[:,'cit_in_count']
# The first two edges (-0.01, 0.99) isolate a count of exactly 0 in its own bin.
bins = [-0.01,0.99,2, 5, 10, 15, 20, 25, float('inf')]  # float('inf') represents infinity for the upper bound

df_graph['Category'] = pd.cut(df_graph['cit_in_count'], bins=bins)


# Count the occurrences of each category
category_counts = df_graph['Category'].value_counts().sort_index()
category_counts = category_counts.reset_index()
category_counts.columns = ['Category', 'Count'] # Rename columns for clarity

# Create a bar plot using Seaborn
plt.figure(figsize=(10, 6))
sns.barplot(x='Category', y='Count', data=category_counts, color='#0a6ca8')

# Write each count just above its bar
for i, v in enumerate(category_counts['Count']):
    plt.text(i, v + 0.1, str(v), ha='center', va='bottom', fontsize=10)


plt.xlabel('Incoming citations')
plt.ylabel('Count')
plt.title('Distribution of incoming citations for Supreme Court rulings')

# Customize x-axis labels: one custom label per bin, in bin order
#plt.xticks(rotation=30)
plt.gca().xaxis.set_ticklabels(['0', '1 - 2', '3 - 5', '6 - 10', '11 - 15', '16 - 20', '21 - 25', '26 - inf'])  # Set custom labels

plt.show()

# NOTE(review): if the figure is to be saved, savefig should be called before plt.show().
#plt.savefig("./figs/Distribution_tokens_HR")


In [None]:
#ADAPTED FROM Schepers et al. (2023)
def sample_together(n, X, y):
    """Draw the same `n` random rows from X and y so they stay aligned during undersampling.

    Parameters
    ----------
    n : int
        Number of rows to draw (without replacement).
    X : pandas.DataFrame
    y : pandas.Series
        Must have at least as many rows as X is sampled from.

    Returns
    -------
    tuple
        (X rows, y rows) selected by the same positional indices.

    Raises
    ------
    ValueError
        If `n` is larger than the number of rows in X (raised by `sample`).
    """
    # A dedicated generator seeded with 0 yields the same draw on every call
    # without resetting the global `random` state as a side effect.
    rng = random.Random(0)
    rows = rng.sample(range(len(X.index)), n)

    return X.iloc[rows], y.iloc[rows]


In [None]:
#ADAPTED FROM Schepers et al. (2023)
def undersample(X, y, under=0):
    """Balance the data by shrinking the other class(es) to the size of class `under`.

    Rows labelled `under` are all kept; from the remaining rows as many are
    drawn (via `sample_together`) as there are `under` rows. The sampled rows
    come first in the returned objects, followed by the `under` rows.
    """
    kept_y, other_y = y[y == under], y[y != under]
    kept_X = X.filter(kept_y.index, axis=0)
    other_X = X.filter(other_y.index, axis=0)

    sampled_X, sampled_y = sample_together(len(kept_y.index), other_X, other_y)

    return pd.concat([sampled_X, kept_X]), pd.concat([sampled_y, kept_y])


In [None]:
#ADAPTED FROM Schepers et al. (2023)
def split_data(data, features, dev, balance_test_set=False):
    """Create chronological train, validation (dev) and test sets.

    The rows are sorted by 'date_decision' and split without shuffling, so the
    test set holds the most recent 12% of the rows. When `dev` is true, the
    last 13.6% of the remaining training rows become the validation set.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'date_decision', 'cit_in_binary' (the target) and all `features`.
    features : list of str
        Columns to keep in the X splits.
    dev : bool
        Whether to carve a validation set out of the training data.
    balance_test_set : bool, optional
        Currently unused: class balancing is disabled in this function.

    Returns
    -------
    tuple
        (X_train, y_train, X_val, y_val, X_test, y_test). X_val and y_val are
        None when `dev` is false.
    """

    print('________________________ Creating train and test data __________________________')

    data = data.sort_values(by=['date_decision'])

    # Get train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data[features],
        data['cit_in_binary'], test_size=0.12, shuffle=False)

    # Defined up front so that the function also works without a validation set
    # (previously dev=False ended in a NameError on X_val).
    X_val, y_val = None, None
    if dev:
        # Split train in dev and train
        X_train, X_val, y_train, y_val = train_test_split(
            X_train[features],
            y_train, test_size=0.136, shuffle=False)

    print('Training data:', X_train.shape, '\n', y_train.value_counts(), '\n')
    if dev:
        print('Validation data:', X_val.shape, '\n', y_val.value_counts(), '\n')
    print("Test data:", X_test.shape, '\n', y_test.value_counts(), '\n')
    print()

    return X_train, y_train, X_val, y_val, X_test, y_test


In [None]:
from sklearn.base import TransformerMixin, BaseEstimator


class Debug(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints the shape of the data it receives."""

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Print `X.shape` and hand X back unchanged."""
        print(X.shape)
        return X

In [None]:
#ADAPTED FROM Schepers et al. (2023)
def create_pipeline(categorical_features, numerical_features, decision_features, summary_features):
    """Build the preprocessing + LinearSVC pipeline.

    Each truthy argument switches on one transformer: multi-hot encoding for
    the categorical columns, standard scaling for the numerical columns, and a
    word-trigram TF-IDF vectorizer for the 'full_text' and/or 'summary' column.
    Columns not claimed by any transformer are dropped. The vectorizer, n-gram
    length and analyzer can be changed here.
    """
    # (name, transformer, columns, switch) — only entries with a truthy switch are used.
    candidates = [
        ('cat', MultiHotEncoder(), categorical_features, categorical_features),
        ('num', Pipeline(steps=[('scaler', StandardScaler())]), numerical_features, numerical_features),
        ('full_text', TfidfVectorizer(analyzer='word', ngram_range=(3,3)), 'full_text', decision_features),
        ('summary', TfidfVectorizer(analyzer='word', ngram_range=(3,3)), 'summary', summary_features),
    ]
    selected = [
        (name, transformer, columns)
        for name, transformer, columns, switch in candidates
        if switch
    ]

    preprocessor = ColumnTransformer(transformers=selected, remainder='drop')
    classifier = LinearSVC(dual= True, random_state=0, max_iter=1000)

    # The "debug" step prints the shape of the preprocessed matrix.
    return Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ("debug", Debug()),
            ('classifier', classifier),
        ]
    )

In [None]:
# Drop metadata columns that are not used further on (in place).
unused_columns = ['creator', 'zaaknummer', 'issued', 'inhoudsindicatie', "hasVersion"]
df.drop(columns=unused_columns, inplace=True)

In [None]:
#ADAPTED FROM Schepers et al. (2023)
# Select features and create list to extract from dataframe. Uncomment a feature to use it.

# Numerical features (scaled by the pipeline).
features_num = [
    'full_text_length',
    'summary_length',
    'legislation_count',
    'cit_out_not_supremes_count',
    'cit_out_supremes_count',
    'cit_not_phr_count',
    'cit_phr_count',
    'cit_formal_count',
]

# Categorical features (multi-hot encoded by the pipeline).
features_cat = ['procedure', 'subject']

# Set to True to use the corresponding text field in the model.
features_decision = True
features_summary = True

# 'date_decision' and 'ecli' are carried along in the splits; no transformer
# claims them, so the pipeline (remainder='drop') leaves them out of the model.
feature_list = [*features_num, *features_cat, 'date_decision', 'ecli']
if features_decision:
    feature_list += ['full_text']
if features_summary:
    feature_list += ['summary']

print('Selected features:', feature_list)
print()
start = time.time()

# Create train, validation and test data (create_dev_data switches the validation split on).
create_dev_data = True
X_train, y_train, X_val, y_val, X_test, y_test = split_data(df, feature_list, create_dev_data)

print('Time passed:', time.time() - start)
print()



In [None]:
# Latest decision date in the train, validation and test split, in that order.
for split in (X_train, X_val, X_test):
    print(split["date_decision"].max())

In [None]:
# Persist the splits. Raw string literals so the backslashes in the Windows
# paths are not read as (invalid) escape sequences.
X_train.to_csv(r"D:\DSS D-schijf\Thesis\data\HR_X_train.csv", index=False)
X_val.to_csv(r"D:\DSS D-schijf\Thesis\data\HR_X_val.csv", index=False)
X_test.to_csv(r"D:\DSS D-schijf\Thesis\data\HR_X_test.csv", index=False)

y_train.to_csv(r"D:\DSS D-schijf\Thesis\data\HR_y_train.csv", index=False)
y_val.to_csv(r"D:\DSS D-schijf\Thesis\data\HR_y_val.csv", index=False)
y_test.to_csv(r"D:\DSS D-schijf\Thesis\data\HR_y_test.csv", index=False)


In [None]:
# Also store the training features as a NumPy file. Raw string literal so the
# backslashes in the Windows path are not read as (invalid) escape sequences.
np.save(r"D:\DSS D-schijf\Thesis\data\HR_X_train.npy", X_train)

In [None]:
#ADAPTED FROM Schepers et al. (2023)

# Create pipeline
clf = create_pipeline(features_cat, features_num, features_decision, features_summary)

# Start train-test phase
start = time.time()

print('Training...')
print(X_train.shape)
clf.fit(X_train, y_train)

# Predictions are made on the validation split
print('Testing...')
print(X_val.shape)
y_pred = clf.predict(X_val)

print('_____________________ Classification Report ___________________________')
class_report = classification_report(y_val, y_pred, labels=[0, 1])
print(class_report)
print()
print('\n_____________________ Confusion Matrix _______________________________')
conf_matrix = confusion_matrix(y_val, y_pred)
print(conf_matrix)
print("\n_____________________ Matthew's Correlation Coefficient ______________")
matt_coef = matthews_corrcoef(y_val, y_pred)
print(matt_coef)
timing = time.time() - start
print('Total time passed:', timing)

# Write to output file (appended, so earlier runs are kept)

out_str = 'Features used:' + str(feature_list) + '\n'

# Raw string literal so the backslashes in the Windows path are not read as
# (invalid) escape sequences.
with open(r"D:\DSS D-schijf\Thesis\HR_baseline.txt", 'a') as out:
    out.write(court)
    out.write('\n')
    out.write(out_str)
    out.write('Development Data:')
    out.write(str(create_dev_data))
    out.write('\n')
    out.write('X_train:')
    out.write(str(X_train.shape))
    out.write('\n')
    out.write(str(y_train.value_counts()))
    out.write('\n')
    # NOTE(review): the size and class counts written here are those of the test
    # split, while the metrics below were computed on the validation split —
    # confirm which split the report is meant to describe.
    out.write('X_test:')
    out.write(str(X_test.shape))
    out.write('\n')
    out.write(str(y_test.value_counts()))
    out.write('\n')
    out.write(str(class_report))
    out.write('\n')
    out.write(str(conf_matrix))
    out.write('\n')
    out.write('Matt coeff: ' + str(matt_coef))
    out.write('\n')
    out.write('total time:' + str(timing))
    out.write('\n_______________________________________________________')
    out.write('\n')

    out.write('\n\n')