Project by Jeremy Bouhi & Lucas Trevalinet

# Classification Document


## Imports

In [1]:
import numpy as np
import pandas as pd
import os
import xml.etree.ElementTree as ET
from pathlib import Path
import re
import pyprind

## STEP 1 : Grab Data

In [3]:
# We consider you had already unzip the dataset file and that this file is in the parent folder

#GLOBAL_PATH = os.path.join("Freemium_cass_global_20180315-170000","20180315-170000", "juri","cass","global") #Lucas' Path
GLOBAL_PATH = os.path.join("..","20180315-170000", "juri","cass","global") #Jeremy's Path

# We need to do that for excluding data from Juri_path
CIVILE_PATH = os.path.join(GLOBAL_PATH,"civile")
COMMERCIALE_PATH = os.path.join(GLOBAL_PATH,"commerciale")
CRIMINELLE_PATH = os.path.join(GLOBAL_PATH,"criminelle")
SOCIALE_PATH = os.path.join(GLOBAL_PATH,"sociale")

In [4]:
from nltk.tokenize import sent_tokenize

ident = []
text = []
division = []
#DATA_PATH = [SOCIALE_PATH] # For faster test
DATA_PATH = [CIVILE_PATH, COMMERCIALE_PATH, CRIMINELLE_PATH, SOCIALE_PATH]

for DIVISION_PATH in DATA_PATH :
    xml_files = list(Path(DIVISION_PATH).glob('**/*.xml'))
    
    for xml_file in xml_files:
    
        with open(xml_file, 'r', encoding="utf-8") as content:

            etree = ET.parse(content) #create an ElementTree object 
            root = etree.getroot()
            
            # For getting the ID
            for child in root.iter('META_COMMUN'):
                id = child.find('ID').text
                ident.append(id)

            for child in root.iter('BLOC_TEXTUEL'):
                contenu = "".join(child.itertext())
                text.append(contenu)
                
            # For getting the division    
            for child in root.iter('META_JURI_JUDI'):
                formation = re.sub('CHAMBRE|_|[0-9]', '', child.find('FORMATION').text)
                division.append(formation)


## STEP 2 : Create DataFrame to manipulate easily data

In [21]:
d = {'id': ident, 'text': text, 'division': division}
df = pd.DataFrame(data = d)

# save for the next time
df.to_pickle('3FD8KA7.pkl')
    

In [12]:
df = pd.read_pickle('3FD8KA7.pkl')

In [15]:
# remove all empty texts
df = df[df['text']!=""]

In [16]:
fast_execution = True #to run code faster

if(fast_execution) : 
    df = df.sample(500)
    print('You chose to run using fast_execution')
    print('data size: ', df.shape[0])

else :
    print('You chose to run with the whole dataset')
    print('data size: ', df.shape[0])

You chose to run using fast_execution
data size:  500


## STEP 3 : Clean Data

In [17]:
# Preprocessing step pull off all the htlm tag, "\n", and any special characters
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # remove HTML tags
    text = re.sub('[,\/#!$%\^&\*:{}=\_`~()«»°–]','', text) # remove special characters except ; and . for spliting in sentences
    text = text.replace('\n','').replace('\t','').replace('\'',' ')
    text = re.sub(r'\s{2,}', ' ', text) # remove extra space
    return text

df['text'] = df['text'].apply(preprocessor)

In [18]:
# We clean data on X, we don't modify directly the DataFrame to display the right sentences (after calculated whiches ones are the most influent by using X)

X = df['text']
y = df['division']

In [19]:
# The '.'' from lews articles (for example article L. 1256) are bothering for seperate our text in sentences
def clean_references_to_law_articles(text) :
    text = re.sub(r'((?:\b[A-Z][\.-]? ?)?(?:\d+-?\d+\b))', 'N', text)
    return text

In [20]:
from nltk.tokenize import sent_tokenize

def clean_before_tokenize(text) :
    text = clean_references_to_law_articles(text)
    text = re.sub('\.\.\.','', text) # remove ...
    text = re.sub(r'\bM\. \b','M ', text) # explicit replacement of M. (because of the .)
    text = text.replace(' ;','. ') # some sentences are split by ;
    return text

def sent_tokenizer(text) : 
    text = clean_before_tokenize(text)
    text = sent_tokenize(text)
    return text

In [21]:
X = X.apply(clean_before_tokenize)
df['text'] = df['text'].apply(sent_tokenizer)

# X and df['text'] are exactly the same here unless one thing : X[i] is a signle string, df['text'][i] is an array and each element represents an array

In [24]:
def clean_references_to_law_codes(text) : 
    return re.sub('(?<=code )civil|de l action sociale et des familles|de l artisanat|des assurances|de l aviation civile|du cinéma et de l image animée|de commerce|des communes( de la Nouvelle-Calédonie)?|de la consommation', '', text)

X = X.apply(clean_references_to_law_codes)

In [25]:
X = X.apply(clean_references_to_law_articles)

In [26]:
# Remove all numbers

def remove_numbers(text) :
    text = re.sub(r'[0-9]*', '', text)
    return text

X = X.apply(remove_numbers)

In [27]:
# Set to lower case

def to_lower_case(s) :
    return s.lower()

X = X.apply(to_lower_case)


In [28]:
# We decided to replace any corresponding labels to a unique special one : *_* 

def clean_words_corresponding_to_labels(text) :
    return re.sub(r'\b(criminel(le)?|commercial(e)?|social(e)?|civil(e))\b', '*_*', text)

X = X.apply(clean_words_corresponding_to_labels)

In [29]:
# Remove all words we don't care :

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def remove_stop_words(text):
    
    french_stopwords = set(stopwords.words('french'))
    tokens = word_tokenize(text, language='french')
    
    content_tokens = ""
    for token in tokens:
        if token not in french_stopwords:
            content_tokens += token
            content_tokens += " "
    return(content_tokens) 

X = X.apply(remove_stop_words)

## Most influential words

In [31]:
X_civile = X.where(y=='CIVILE')
X_commerciale = X.where(y=='COMMERCIALE')
X_criminelle = X.where(y=='CRIMINELLE')
X_sociale = X.where(y=='SOCIALE')

#to remove NaN values
X_civile = X_civile[~X_civile.isnull()]
X_commerciale = X_commerciale[~X_commerciale.isnull()]
X_criminelle = X_criminelle[~X_criminelle.isnull()]
X_sociale = X_sociale[~X_sociale.isnull()]

In [32]:
# so they all have differents shapes
print(X_civile.shape[0])
print(X_commerciale.shape[0])
print(X_criminelle.shape[0])
print(X_sociale.shape[0])

225
78
69
128


In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def get_tfidf_matrix(X) : 
    
    count_vect = CountVectorizer()
    freq_term_matrix = count_vect.fit_transform(X)

    tfidf = TfidfTransformer(norm="l2")
    doc_tfidf_matrix = tfidf.fit_transform(freq_term_matrix)
    
    return (doc_tfidf_matrix, count_vect)

In [34]:
def get_influential_words(X, top=10) :
    
    tfidf_matrix, vec = get_tfidf_matrix(X)
    
    tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]
    feature_array = np.array(vec.get_feature_names())
    top_n = feature_array[tfidf_sorting][:top]
    
    return top_n

In [35]:
top_civile = get_influential_words(X_civile)
top_commerciale = get_influential_words(X_commerciale)
top_criminelle = get_influential_words(X_criminelle)
top_sociale = get_influential_words(X_sociale)

In [36]:
per_division = {'civile': top_civile, 'commerciale': top_commerciale, 'criminelle': top_criminelle, 'sociale': top_sociale}
words_df = pd.DataFrame(data = per_division)

words_df

Unnamed: 0,civile,commerciale,criminelle,sociale
0,pension,acte,fils,frimigacci
1,alimentaire,omissions,père,blamable
2,no,inexactitudes,mère,legerete
3,visite,promesse,mineur,loi
4,femme,epoux,parentale,lachize
5,ex,acquerir,enfant,nuire
6,les,les,civilement,manifeste
7,epoux,exploit,assises,preuve
8,conjugal,non,responsable,intention
9,enfant,ete,autorité,etabli


## Build document summarizer

Once you'll build the tf-idf matrix for the text corpus, you will use the tf-idf of each word to compute a value for each sentence. The n top sentences for a document will be used to represent the document.

In [37]:
#if fast_execution was chosen, find the 1st right index of X (because we only took a sample)

index = 0
while (index not in X) :
    index += 1
print(index)

437


In [38]:
X_tokenized = X.apply(sent_tokenize)
tfidf_matrix, vec = get_tfidf_matrix(X_tokenized[index])

In [49]:
def get_top_sentences_for_document(formatted_X, real_X, index, top=5) :

    top_sentences = []
    sentence_score = []
    tfidf_matrix, vec = get_tfidf_matrix(formatted_X[index])

    for i in range(tfidf_matrix.shape[0]) :
        mat = tfidf_matrix[i].toarray()

        # The score of the sentence corresponds to the sum of the tf-idf of each word / number of words
        sentence_score.append(np.sum(mat) / len(mat[mat > 0]))

    print('score of each sentence : ', sentence_score) 
    top_sentence_score = np.argsort(sentence_score)[::-1][:top]
    print('index of the top sentences which can summarize the document: ' ,top_sentence_score)

    for i in range(top) : 
        top_sentences.append(real_X[index][top_sentence_score[i]])
    

    return top_sentences

In [50]:
top_sentences = get_top_sentences_for_document(X_tokenized, df['text'], index)

score of each sentence :  [0.15623339920194115, 0.1567667271658211, 0.3725512846138429, 0.1996243184851927, 0.18134733021715574, 0.1349693587617249, 0.17665500618206795, 0.17365259381537232, 0.13303555231114095, 0.4963229708433413, 0.3754581885089165, 0.2137402467354794, 0.23039986428680045, 0.06569902189108899, 1.0, 0.7071067811865476, 0.08481913551305212, 0.21157385472203713, 0.3725512846138429, 0.037003182971877616, 0.04675415271709843, 0.15862067097787377, 0.17665500618206795, 0.17341708628973831]
index of the top sentences which can summarize the document:  [14 15  9 10  2]


In [51]:
pd.DataFrame(data = {'top sentences of the document' : top_sentences})

Unnamed: 0,top sentences of the document
0,Remploi N.N €.
1,Dépréciation du surplus N.N €.
2,D où il suit que le moyen n est pas fondé.
3,PAR CES MOTIFS REJETTE le pourvoi Condamne Mme...
4,que le juge doit au besoin d office s assurer ...


In [42]:
# To display entirely
print('number of sentences of this document: ', len(X_tokenized[index]))
print('top sentences: ', top_sentences)

number of sentences of this document:  24
top sentences:  ['Remploi N.N €.', 'Dépréciation du surplus N.N €.', 'D où il suit que le moyen n est pas fondé.', 'PAR CES MOTIFS REJETTE le pourvoi Condamne Mmes H et Nicole X aux dépens.', 'que le juge doit au besoin d office s assurer que ces dispositions ont été respectées.']


## Classifier

In [44]:
from sklearn.model_selection import train_test_split

print("Y shape", type(y),y.shape)
print("X shape", type(X),X.shape)
print(X.head())

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size = 0.2 ,random_state=42 )

print(X_train.head())

Y shape <class 'pandas.core.series.Series'> (500,)
X shape <class 'pandas.core.series.Series'> (500,)
84492    rejet pourvoi x alexandre contre arret chambre...
57150    attendu arret confirmatif attaque accueillant ...
44718    moyen unique pris divers griefs attendu honore...
68077    moyen unique pris quatre branches attendu mme ...
58659    recevabilite pourvoi attendu voie cassation ou...
Name: text, dtype: object
72984     moyen unique pris deux branches attendu fait g...
100422    cour cassation chambre *_* a rendu arrêt suiva...
94673     statuant pourvoi forme - procureur general pre...
1318      cour cassation deuxième chambre *_* a rendu ar...
113347    moyen unique vu les articles n- code travail ....
Name: text, dtype: object


In [45]:
### A RAJOUTER #####
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
type(X_train_counts)
#print(X_train_counts)
tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_transformed = tf_transformer.transform(X_train_counts)
#print(type(X_train_transformed))
#print(X_train_transformed)

X_test_counts = count_vect.transform(X_test)
X_test_transformed = tf_transformer.transform(X_test_counts)
#print(type(X_test_transformed))
#print(X_test_transformed)

labels = LabelEncoder()
y_train_labels_fit = labels.fit(y_train)
y_train_labels_trf = labels.transform(y_train)


print(labels.classes_)

['CIVILE' 'COMMERCIALE' 'CRIMINELLE' 'SOCIALE']


In [46]:
#### A RAJOUTER ####
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score

linear_svc = LinearSVC()
clf = linear_svc.fit(X_train_transformed,y_train_labels_trf)

calibrated_svc = CalibratedClassifierCV(base_estimator = linear_svc, cv = "prefit")
calibrated_svc.fit(X_train_transformed,y_train_labels_trf)
predicted = calibrated_svc.predict(X_test_transformed)
print(X_test_transformed.shape)

print("pred",predicted)
print("",y_test.shape)
print("test",y_test[0:100])

acc = accuracy_score(y_test, predicted)
print("accuracy",acc)

(100, 13155)
pred [0 2 0 0 0 1 0 0 0 1 1 3 0 0 0 3 2 0 3 1 0 3 0 0 0 3 0 2 0 3 0 2 0 3 0 0 2
 3 1 0 0 3 2 0 2 0 2 0 3 2 2 0 3 0 2 1 3 0 3 2 0 0 1 0 0 3 3 0 0 3 1 1 0 0
 1 3 0 2 0 0 0 0 0 0 0 0 2 0 0 0 0 3 2 0 0 2 0 0 3 0]
 (100,)
test 116253        SOCIALE
95765      CRIMINELLE
34941          CIVILE
68607     COMMERCIALE
38022          CIVILE
74404     COMMERCIALE
14571          CIVILE
71617     COMMERCIALE
31935          CIVILE
62215     COMMERCIALE
70576     COMMERCIALE
125993        SOCIALE
13480          CIVILE
52490          CIVILE
26071          CIVILE
128313        SOCIALE
90547      CRIMINELLE
25663          CIVILE
129043        SOCIALE
72619     COMMERCIALE
31827          CIVILE
117049        SOCIALE
28879          CIVILE
28045          CIVILE
45163          CIVILE
107247        SOCIALE
118193        SOCIALE
84376      CRIMINELLE
58338          CIVILE
122670        SOCIALE
             ...     
66577     COMMERCIALE
72789     COMMERCIALE
4028           CIVILE
9181           CI