Project by Jeremy Bouhi & Lucas Trevalinet

# Classification Document


## Imports

In [1]:
import numpy as np
import pandas as pd
import os
import xml.etree.ElementTree as ET
from pathlib import Path
import re
import pyprind

## STEP 1 : Grab Data

In [2]:
# We consider you had already unzip the dataset file and that this file is in the parent folder

GLOBAL_PATH = os.path.join("Freemium_cass_global_20180315-170000","20180315-170000", "juri","cass","global") #Lucas' Path
#GLOBAL_PATH = os.path.join("..","20180315-170000", "juri","cass","global") #Jeremy's Path

# We need to do that for excluding data from Juri_path
CIVILE_PATH = os.path.join(GLOBAL_PATH,"civile")
COMMERCIALE_PATH = os.path.join(GLOBAL_PATH,"commerciale")
CRIMINELLE_PATH = os.path.join(GLOBAL_PATH,"criminelle")
SOCIALE_PATH = os.path.join(GLOBAL_PATH,"sociale")

In [3]:
from nltk.tokenize import sent_tokenize

ident = []
text = []
division = []
#DATA_PATH = [SOCIALE_PATH] # For faster test
DATA_PATH = [CIVILE_PATH, COMMERCIALE_PATH, CRIMINELLE_PATH, SOCIALE_PATH]

for DIVISION_PATH in DATA_PATH :
    xml_files = list(Path(DIVISION_PATH).glob('**/*.xml'))
    
    for xml_file in xml_files:
    
        with open(xml_file, 'r', encoding="utf-8") as content:

            etree = ET.parse(content) #create an ElementTree object 
            root = etree.getroot()
            
            # For getting the ID
            for child in root.iter('META_COMMUN'):
                id = child.find('ID').text
                ident.append(id)

            for child in root.iter('BLOC_TEXTUEL'):
                contenu = "".join(child.itertext())
                text.append(contenu)
                
            # For getting the division    
            for child in root.iter('META_JURI_JUDI'):
                formation = re.sub('CHAMBRE|_|[0-9]', '', child.find('FORMATION').text)
                division.append(formation)


## STEP 2 : Create DataFrame to manipulate easily data

In [4]:
d = {'id': ident, 'text': text, 'division': division}
df = pd.DataFrame(data = d)

# save for the next time
df.to_pickle('3FD8KA7.pkl')
    

In [108]:
df = pd.read_pickle('3FD8KA7.pkl')

In [109]:
# remove all empty texts
df = df[df['text']!=""]

In [110]:
fast_execution = True #to run code faster

if(fast_execution) : 
    df = df.sample(500)
    print('You chose to run using fast_execution')
    print('data size: ', df.shape[0])

else :
    print('You chose to run with the whole dataset')
    print('data size: ', df.shape[0])

You chose to run using fast_execution
data size:  500


## STEP 3 : Clean Data

In [112]:
# Preprocessing step pull off all the htlm tag, "\n", and any special characters
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # remove HTML tags
    text = re.sub('[,\/#!$%\^&\*:{}=\_`~()«»°–]','', text) # remove special characters except ; and . for spliting in sentences
    text = text.replace('\n','').replace('\t','').replace('\'',' ')
    text = re.sub(r'\s{2,}', ' ', text) # remove extra space
    return text

df['text'] = df['text'].apply(preprocessor)
df = df[df['text']!=""] #permet de reclean la DataFrame afin d'enlever les textes vide

Unnamed: 0,id,text,division
44182,JURITEXT000007042257,Attendu que le litige concerne les conditions ...,CIVILE
72385,JURITEXT000007029889,Sur le moyen unique Attendu selon l arrêt défé...,COMMERCIALE
22197,JURITEXT000006994967,VU LEUR CONNEXITE JOINT LES POURVOIS N 74-11 2...,CIVILE
83453,JURITEXT000007059699,STATUANT SUR LE POURVOI DE - X... JEAN JACQUES...,CRIMINELLE
71120,JURITEXT000007023327,Sur le moyen unique pris en ses deux branches...,COMMERCIALE
37001,JURITEXT000007028231,. Sur le moyen unique pris en sa première bra...,CIVILE
115274,JURITEXT000007005221,SUR LES DEUX MOYENS REUNIS VU LES ARTICLES L....,SOCIALE
115795,JURITEXT000007006475,VU LA CONNEXITE JOINT LES POURVOIS N 80-60.17...,SOCIALE
22580,JURITEXT000006995698,SUR LE MOYEN UNIQUE PRIS EN SA DEUXIEME BRANCH...,CIVILE
77868,JURITEXT000028575127,LA COUR DE CASSATION CHAMBRE COMMERCIALE a ren...,COMMERCIALE


In [113]:
# We clean data on X, we don't modify directly the DataFrame to display the right sentences (after calculated whiches ones are the most influent by using X)

X = df['text']
y = df['division']

In [114]:
# The '.'' from lews articles (for example article L. 1256) are bothering for seperate our text in sentences
def clean_references_to_law_articles(text) :
    text = re.sub(r'((?:\b[A-Z][\.-]? ?)?(?:\d+-?\d+\b))', 'N', text)
    return text

In [115]:
from nltk.tokenize import sent_tokenize

def clean_before_tokenize(text) :
    text = clean_references_to_law_articles(text)
    text = re.sub('\.\.\.','', text) # remove ...
    text = re.sub(r'\bM\. \b','M ', text) # explicit replacement of M. (because of the .)
    text = text.replace(' ;','. ') # some sentences are split by ;
    return text

def sent_tokenizer(text) : 
    text = clean_before_tokenize(text)
    text = sent_tokenize(text)
    return text

In [116]:
X = X.apply(clean_before_tokenize)
df['text'] = df['text'].apply(sent_tokenizer)

# X and df['text'] are exactly the same here unless one thing : X[i] is a signle string, df['text'][i] is an array and each element represents an array

In [117]:
def clean_references_to_law_codes(text) : 
    return re.sub('(?<=code )civil|de l action sociale et des familles|de l artisanat|des assurances|de l aviation civile|du cinéma et de l image animée|de commerce|des communes( de la Nouvelle-Calédonie)?|de la consommation', '', text)

X = X.apply(clean_references_to_law_codes)

In [118]:
X = X.apply(clean_references_to_law_articles)

In [119]:
# Remove all numbers

def remove_numbers(text) :
    text = re.sub(r'[0-9]*', '', text)
    return text

X = X.apply(remove_numbers)

In [120]:
# Set to lower case

def to_lower_case(s) :
    return s.lower()

X = X.apply(to_lower_case)


In [121]:
# We decided to replace any corresponding labels to a unique special one : *_* 

def clean_words_corresponding_to_labels(text) :
    return re.sub(r'\b(criminel(le)?|commercial(e)?|social(e)?|civil(e))\b', '*_*', text)

X = X.apply(clean_words_corresponding_to_labels)

In [122]:
# Remove all words we don't care :

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def remove_stop_words(text):
    
    french_stopwords = set(stopwords.words('french'))
    tokens = word_tokenize(text, language='french')
    
    content_tokens = ""
    for token in tokens:
        if token not in french_stopwords:
            content_tokens += token
            content_tokens += " "
    return(content_tokens) 

X = X.apply(remove_stop_words)

In [123]:
X.head()

44182    attendu litige concerne les conditions utilisa...
72385    moyen unique attendu selon arrêt déféré riom o...
22197    vu connexite joint les pourvois ; moyen unique...
83453    statuant pourvoi - x jean jacquescontre arret ...
71120    moyen unique pris deux branches . attendu selo...
Name: text, dtype: object

## Most influential words

In [124]:
X_civile = X.where(y=='CIVILE')
X_commerciale = X.where(y=='COMMERCIALE')
X_criminelle = X.where(y=='CRIMINELLE')
X_sociale = X.where(y=='SOCIALE')

#to remove NaN values
X_civile = X_civile[~X_civile.isnull()]
X_commerciale = X_commerciale[~X_commerciale.isnull()]
X_criminelle = X_criminelle[~X_criminelle.isnull()]
X_sociale = X_sociale[~X_sociale.isnull()]

In [125]:
# so they all have differents shapes
print(X_civile.shape[0])
print(X_commerciale.shape[0])
print(X_criminelle.shape[0])
print(X_sociale.shape[0])

212
76
66
132


In [126]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def get_tfidf_matrix(X) : 
    
    count_vect = CountVectorizer()
    freq_term_matrix = count_vect.fit_transform(X)

    tfidf = TfidfTransformer(norm="l2")
    doc_tfidf_matrix = tfidf.fit_transform(freq_term_matrix)
    
    return (doc_tfidf_matrix, count_vect)

In [127]:
def get_influential_words(X, top=10) :
    
    tfidf_matrix, vec = get_tfidf_matrix(X)
    
    tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]
    feature_array = np.array(vec.get_feature_names())
    top_n = feature_array[tfidf_sorting][:top]
    
    return top_n

In [128]:
top_civile = get_influential_words(X_civile)
top_commerciale = get_influential_words(X_commerciale)
top_criminelle = get_influential_words(X_criminelle)
top_sociale = get_influential_words(X_sociale)

In [129]:
per_division = {'civile': top_civile, 'commerciale': top_commerciale, 'criminelle': top_criminelle, 'sociale': top_sociale}
words_df = pd.DataFrame(data = per_division)

words_df

Unnamed: 0,civile,commerciale,criminelle,sociale
0,vente,soins,proximité,résiliation
1,redressement,services,michel,autorisation
2,adoption,enregistrement,indemnisation,judiciaire
3,surendettement,similitude,juridiction,licenciement
4,atelier,beauté,angers,employeur
5,immeuble,animaux,relaxe,administrative
6,logement,marque,présentée,monsieur
7,préalable,les,statuer,johnny
8,griefs,annulation,pénale,travail
9,mesures,appréciation,procédure,manquements


## Build document summarizer

Once you'll build the tf-idf matrix for the text corpus, you will use the tf-idf of each word to compute a value for each sentence. The n top sentences for a document will be used to represent the document.

In [130]:
#if fast_execution was chosen, find the 1st right index of X (because we only took a sample)

index = 0
while (index not in X) :
    index += 1
print(index)

508


In [139]:
X_tokenized = X.apply(sent_tokenize)
tfidf_matrix, vec = get_tfidf_matrix(X_tokenized[index])

In [140]:
def get_top_sentences_for_document(formatted_X, real_X, index, top=5) :

    top_sentences = []
    sentence_score = []
    tfidf_matrix, vec = get_tfidf_matrix(formatted_X[index])

    for i in range(tfidf_matrix.shape[0]) :
        mat = tfidf_matrix[i].toarray()

        # The score of the sentence corresponds to the sum of the tf-idf of each word / number of words
        sentence_score.append(np.sum(mat) / len(mat[mat > 0]))

    print('score of each sentence : ', sentence_score) 
    top_sentence_score = np.argsort(sentence_score)[::-1][:top]
    print('index of the top sentences which can summarize the document: ' ,top_sentence_score)

    for i in range(top) : 
        top_sentences.append(real_X[index][top_sentence_score[i]])
    

    return top_sentences

In [141]:
top_sentences = get_top_sentences_for_document(X_tokenized, df['text'], index)

score of each sentence :  [0.045055665544014384]
index of the top sentences which can summarize the document:  [0]


IndexError: index 1 is out of bounds for axis 0 with size 1

In [51]:
pd.DataFrame(data = {'top sentences of the document' : top_sentences})

Unnamed: 0,top sentences of the document
0,Remploi N.N €.
1,Dépréciation du surplus N.N €.
2,D où il suit que le moyen n est pas fondé.
3,PAR CES MOTIFS REJETTE le pourvoi Condamne Mme...
4,que le juge doit au besoin d office s assurer ...


In [42]:
# To display entirely
print('number of sentences of this document: ', len(X_tokenized[index]))
print('top sentences: ', top_sentences)

number of sentences of this document:  24
top sentences:  ['Remploi N.N €.', 'Dépréciation du surplus N.N €.', 'D où il suit que le moyen n est pas fondé.', 'PAR CES MOTIFS REJETTE le pourvoi Condamne Mmes H et Nicole X aux dépens.', 'que le juge doit au besoin d office s assurer que ces dispositions ont été respectées.']


## Classifier

In [83]:
from sklearn.model_selection import train_test_split

print("Y shape", type(y),y.shape)
print("X shape", type(X),X.shape)
print(X.head())

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size = 0.2 ,random_state=42 )

#print(X_train.head())
print(len(X_train))

Y shape <class 'pandas.core.series.Series'> (500,)
X shape <class 'pandas.core.series.Series'> (500,)
83706    statuant pourvoi - x antoine partie *_* contre...
27608    premier moyen vu article nouveau code procedur...
40355    attendu société maison idéale a souscrit avril...
35628    . moyen unique vu article code . attendu résul...
50739    nom peuple francais cour cassation troisieme c...
Name: text, dtype: object
400


In [84]:
### A RAJOUTER #####
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
type(X_train_counts)
#print(X_train_counts)
tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_transformed = tf_transformer.transform(X_train_counts)
#print(type(X_train_transformed))
#print(X_train_transformed)

X_test_counts = count_vect.transform(X_test)
X_test_transformed = tf_transformer.transform(X_test_counts)
#print(type(X_test_transformed))
#print(X_test_transformed)

labels = LabelEncoder()
y_train_labels_fit = labels.fit(y_train)
y_train_labels_trf = labels.transform(y_train)


print(labels.classes_)

['CIVILE' 'COMMERCIALE' 'CRIMINELLE' 'SOCIALE']
CIVILE


In [85]:
#### A RAJOUTER ####
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score

linear_svc = LinearSVC()
clf = linear_svc.fit(X_train_transformed,y_train_labels_trf)

calibrated_svc = CalibratedClassifierCV(base_estimator = linear_svc, cv = "prefit")
calibrated_svc.fit(X_train_transformed,y_train_labels_trf)
predicted = calibrated_svc.predict(X_test_transformed)
print(X_test_transformed.shape)

print("pred",predicted)
print("",y_test.shape)
y_test = labels.transform(y_test) #a rajouter
print("test",y_test[0:100])

acc = accuracy_score(y_test, predicted)
print("accuracy of classsifation",acc)

(100, 12085)
pred [0 0 0 1 0 0 0 0 0 0 0 0 0 0 3 1 0 2 0 0 3 0 0 1 0 2 1 2 0 1 3 0 3 3 0 0 0
 3 2 0 0 0 1 1 0 1 2 2 3 3 0 3 0 0 3 0 2 0 0 3 0 1 0 0 2 3 3 3 0 2 2 3 3 0
 2 0 3 0 0 2 1 1 3 2 0 3 1 0 3 0 3 0 3 3 3 3 1 3 2 2]
 (100,)
test [0 0 0 1 0 0 0 0 0 1 1 1 0 0 3 1 0 2 0 1 3 0 0 1 0 2 2 2 3 1 3 0 3 3 0 0 0
 3 2 0 0 0 1 1 0 0 2 2 3 0 0 3 0 0 3 0 2 0 0 3 3 1 0 0 2 0 3 3 0 2 2 3 3 0
 2 0 3 0 0 2 1 1 3 2 0 0 0 0 0 0 3 1 1 3 3 3 1 3 2 2]
accuracy of classsifation 0.85


In [86]:
# a rajouter
""""
Permet de convertir le y_test qui est une array de chiffre en une array de string avec les bonnes classes : 
'CIVILE' = 0
'COMMERCIALE' = 1 
'CRIMINELLE' = 2 
'SOCIALE' = 3 
"""
def classification_int_to_text (y) :
    y2=[]
    print("y2 :",len(y2))
    print("y :",len(y))
    for i in range (0,len(y)):
        if y[i] == 0 :
            y2.append(labels.classes_[0])
        elif y[i] == 1 : 
            y2.append(labels.classes_[1])
        elif y[i] == 2 : 
            y2.append(labels.classes_[2])
        elif y[i] == 3 : 
            y2.append(labels.classes_[3])
    return y2

In [87]:
# a rajouter
Predicted2 = []
y_test2 = []
print("pred",predicted)
Predicted2 = classification_int_to_text (predicted)
print("pred",Predicted2)

pred [0 0 0 1 0 0 0 0 0 0 0 0 0 0 3 1 0 2 0 0 3 0 0 1 0 2 1 2 0 1 3 0 3 3 0 0 0
 3 2 0 0 0 1 1 0 1 2 2 3 3 0 3 0 0 3 0 2 0 0 3 0 1 0 0 2 3 3 3 0 2 2 3 3 0
 2 0 3 0 0 2 1 1 3 2 0 3 1 0 3 0 3 0 3 3 3 3 1 3 2 2]
y2 : 0
y : 100
pred ['CIVILE', 'CIVILE', 'CIVILE', 'COMMERCIALE', 'CIVILE', 'CIVILE', 'CIVILE', 'CIVILE', 'CIVILE', 'CIVILE', 'CIVILE', 'CIVILE', 'CIVILE', 'CIVILE', 'SOCIALE', 'COMMERCIALE', 'CIVILE', 'CRIMINELLE', 'CIVILE', 'CIVILE', 'SOCIALE', 'CIVILE', 'CIVILE', 'COMMERCIALE', 'CIVILE', 'CRIMINELLE', 'COMMERCIALE', 'CRIMINELLE', 'CIVILE', 'COMMERCIALE', 'SOCIALE', 'CIVILE', 'SOCIALE', 'SOCIALE', 'CIVILE', 'CIVILE', 'CIVILE', 'SOCIALE', 'CRIMINELLE', 'CIVILE', 'CIVILE', 'CIVILE', 'COMMERCIALE', 'COMMERCIALE', 'CIVILE', 'COMMERCIALE', 'CRIMINELLE', 'CRIMINELLE', 'SOCIALE', 'SOCIALE', 'CIVILE', 'SOCIALE', 'CIVILE', 'CIVILE', 'SOCIALE', 'CIVILE', 'CRIMINELLE', 'CIVILE', 'CIVILE', 'SOCIALE', 'CIVILE', 'COMMERCIALE', 'CIVILE', 'CIVILE', 'CRIMINELLE', 'SOCIALE', 'SOCIALE', 'SOCIALE',

In [88]:
# a rajouter
#FINAL DATA FRAME : PREDICTION DATAFRAME 
prediction_data = {'Law text': X_test, 'Classification Prediction': Predicted2}
Prediction_DF = pd.DataFrame(data = prediction_data)

Prediction_DF.head()

Unnamed: 0,Law text,Classification Prediction
56832,cour cassation troisième chambre *_* a rendu a...,CIVILE
31877,moyen unique pris diverses branches attendu se...,CIVILE
17632,premier moyen attendu ressort arret attaque ma...,CIVILE
61558,premier moyen pris diverses branches attendu s...,COMMERCIALE
37741,. moyen unique vu article code civil ensemble ...,CIVILE
