Project by Jeremy Bouhi & Lucas Trevalinet

# Classification Document


## Imports

In [1]:
import numpy as np
import pandas as pd
import os
import xml.etree.ElementTree as ET
from pathlib import Path
import re
import pyprind

## STEP 1 : Grab Data

In [2]:
# We consider you had already unzip the dataset file and that this file is in the parent folder

GLOBAL_PATH = os.path.join("Freemium_cass_global_20180315-170000","20180315-170000", "juri","cass","global") #Lucas' Path
#GLOBAL_PATH = os.path.join("..","20180315-170000", "juri","cass","global") #Jeremy's Path

# We need to do that for excluding data from Juri_path
CIVILE_PATH = os.path.join(GLOBAL_PATH,"civile")
COMMERCIALE_PATH = os.path.join(GLOBAL_PATH,"commerciale")
CRIMINELLE_PATH = os.path.join(GLOBAL_PATH,"criminelle")
SOCIALE_PATH = os.path.join(GLOBAL_PATH,"sociale")

In [3]:
from nltk.tokenize import sent_tokenize

ident = []
text = []
division = []
DATA_PATH = [SOCIALE_PATH, COMMERCIALE_PATH] # For faster test
#DATA_PATH = [CIVILE_PATH, COMMERCIALE_PATH, CRIMINELLE_PATH, SOCIALE_PATH]

for DIVISION_PATH in DATA_PATH :
    xml_files = list(Path(DIVISION_PATH).glob('**/*.xml'))
    
    for xml_file in xml_files:
    
        with open(xml_file, 'r', encoding="utf-8") as content:

            etree = ET.parse(content) #create an ElementTree object 
            root = etree.getroot()
            
            # For getting the ID
            for child in root.iter('META_COMMUN'):
                id = child.find('ID').text
                ident.append(id)

            
            for child in root.iter('BLOC_TEXTUEL'):
                contenu = "".join(child.itertext())
                text.append(contenu)
                
            # For getting the division    
            for child in root.iter('META_JURI_JUDI'):
                formation = re.sub('CHAMBRE|_|[0-9]', '', child.find('FORMATION').text)
                division.append(formation)


## STEP 2 : Create DataFrame to manipulate easily data

In [4]:
d = {'id': ident, 'text': text, 'division': division}
df = pd.DataFrame(data = d)
    
# remove all NaN values from df
df.dropna()

# save for the next time
df.to_pickle('3FD8KA7.pkl')
    

In [6]:
df = pd.read_pickle('3FD8KA7.pkl')

In [10]:
df.head()

Unnamed: 0,id,text,division
0,JURITEXT000006951657,\n\n,SOCIALE
1,JURITEXT000006951658,\n\n,SOCIALE
2,JURITEXT000006951659,\n\n,SOCIALE
3,JURITEXT000006951660,\n\n,SOCIALE
4,JURITEXT000006951661,\n\n,SOCIALE


In [29]:
fast_execution = True #to run code faster
mini_df = df.sample(500)
print(mini_df)

if(fast_execution) : 
    X = mini_df['text']
    y = mini_df['division']
    print('You chose to run using fast_execution')
    print('data size: ', X.shape[0])
    print('output size: ', y.shape[0]) #rajouter
else :
    X = df['text']
    y = df['division']
    print('You chose to run with the whole dataset')
    print('data size: ', X.shape[0])

                         id  \
45070  JURITEXT000007012532   
19285  JURITEXT000007010918   
15478  JURITEXT000007001943   
32738  JURITEXT000021855224   
42239  JURITEXT000006996243   
20703  JURITEXT000007015055   
9693   JURITEXT000006983505   
44962  JURITEXT000007011966   
48922  JURITEXT000007032632   
40903  JURITEXT000006987479   
52495  JURITEXT000018204049   
49907  JURITEXT000007037852   
30692  JURITEXT000007048147   
44226  JURITEXT000007008195   
27071  JURITEXT000007035075   
22406  JURITEXT000007019247   
15883  JURITEXT000007002900   
3718   JURITEXT000006963426   
22796  JURITEXT000007020293   
37708  JURITEXT000006967364   
31467  JURITEXT000007051672   
12349  JURITEXT000006992288   
39956  JURITEXT000006980569   
30468  JURITEXT000007046981   
2488   JURITEXT000006959640   
11243  JURITEXT000006988548   
38617  JURITEXT000006972545   
4710   JURITEXT000006966581   
50062  JURITEXT000007038881   
4897   JURITEXT000006967258   
...                     ...   
49025  J

## STEP 4 : Clean Data

In [61]:
def clean_references_to_law_codes(text) : 
    return re.sub('(?<=Code )civil|de l\'action sociale et des familles|de l\'artisanat|des assurances|de l\'aviation civile|du cinéma et de l image animée|de commerce|des communes( de la Nouvelle-Calédonie)?|de la consommation', '', ''.join(text))


X = X.apply(clean_references_to_law_codes)

In [62]:
def clean_references_to_law_articles(text) :
    return re.sub(r'((?:\b[A-Z][\.-]? ?)?(?:\d+-?\d+\b))', 'N', ''.join(text))

X = X.apply(clean_references_to_law_articles)

In [63]:
# Preprocessing step pull off all the htlm tag, "\n", and any special characters
def preprocessor(text):
    text = re.sub('<[^>]*>', '', ''.join(text)) # remove HTML tags
    text = re.sub('[,\/#!$%\^&\*:{}=\_`~()«»°–]','', ''.join(text)) # remove special characters except ; and . for spliting in sentences
    text = re.sub('\.\.\.','', ''.join(text)) # remove ...
    text = re.sub('\bM\. \b','M ', ''.join(text)) # explicit replacement of M. (because of the .)
    text = text.replace(';','.') # some sentences are split by ;
    text = text.replace('\n','').replace('\'',' ')
    text.replace('\s{2,}',' ')
    return text

X = X.apply(preprocessor)

In [64]:
# Set to lower case
def to_lower_case(s) :
    return s.lower()

X = X.apply(to_lower_case)

In [65]:
# We decided to replace any corresponding labels to a unique special label : *_* 
def clean_words_corresponding_to_labels(text) :
    return re.sub('criminel(le)?|commercial(e)?|social(e)?|civil(e)?', '*_*', ''.join(text))

X = X.apply(clean_words_corresponding_to_labels)

In [66]:
#Stop_words :
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def remove_stop_words(text):
    
    french_stopwords = set(stopwords.words('french'))
    tokens = word_tokenize(text, language='french')
    
    content_tokens = ""
    for token in tokens:
        if token not in french_stopwords:
            content_tokens += token
            content_tokens += " "
    return(content_tokens) 

X = X.apply(remove_stop_words)

In [67]:
X.head()

45070    moyen unique pris premiere branche vu les arti...
19285    premier moyen vu article code securite*_* les ...
15478    moyen unique vu article code travail .attendu ...
32738    cour cassation chambre *_* a rendu arrêt suiva...
42239    moyen unique pris trois branches attendu fait ...
Name: text, dtype: object

In [18]:
text1 = "coucou c'est moi les amis je suis super content étais etait"  

In [20]:
remove_stop_words(text1)

"coucou c'est les amis super content etait "

## Most influential words

In [68]:
X_civile = X.where(y=='CIVILE')
X_commerciale = X.where(y=='COMMERCIALE')
X_criminelle = X.where(y=='CRIMINELLE')
X_sociale = X.where(y=='SOCIALE')

#to remove NaN values
X_civile = X_civile[~X_civile.isnull()]
X_commerciale = X_commerciale[~X_commerciale.isnull()]
X_criminelle = X_criminelle[~X_criminelle.isnull()]
X_sociale = X_sociale[~X_sociale.isnull()]

In [69]:
#so they all have differents shapes
print(X_civile.shape[0])
print(X_commerciale.shape[0])
print(X_criminelle.shape[0])
print(X_sociale.shape[0])

0
179
0
321


In [70]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def get_influential_words(X, top=10) :
    count_vect = CountVectorizer()
    count_vect = count_vect.fit(X)
    freq_term_matrix = count_vect.transform(X)

    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    
    doc_freq_term = count_vect.transform(X)
    doc_tfidf_matrix = tfidf.transform(doc_freq_term)
    
    tfidf_sorting = np.argsort(doc_tfidf_matrix.toarray()).flatten()[::-1]
    feature_array = np.array(count_vect.get_feature_names())
    top_n = feature_array[tfidf_sorting][:top]
    
    return top_n

In [71]:
#top_civile = get_influential_words(X_civile)
top_commerciale = get_influential_words(X_commerciale)
#top_criminelle = get_influential_words(X_criminelle)
top_sociale = get_influential_words(X_sociale)

In [72]:
#per_division = {'civile': top_civile, 'commerciale': top_commerciale, 'criminelle': top_criminelle, 'sociale': top_sociale}
per_division = {'commerciale': top_commerciale,'sociale': top_sociale}
words_df = pd.DataFrame(data = per_division)

words_df

Unnamed: 0,commerciale,sociale
0,intervilles,convention
1,marque,collective
2,guy,bonsecours
3,jeux,notre
4,denomination,mas
5,depot,nationale
6,notoriete,etablissements
7,titre,inadaptee
8,fraude,moulins
9,appelation,enfance


In [75]:
X[45070]

"moyen unique pris premiere branche vu les articles code *_* .attendu selon arret defere societe `` consortium industriel travaux publics batiment '' cib a contrat juin donne `` nantissement '' a barclays bank banque marche conclu sci `` passerelle '' sci contrat intitule `` nantissement marche prive - cession creance - articles code *_* '' dont unique exemplaire remis a banque stipulait nantissement intervenait comme garantie subsidiaire . liquidation biens cib a ete prononcee septembre octobre sci reglement travaux executes anterieurement jugement declaratif a verse directement a banque somme syndic a ulterieurement demande rapporter a masse .attendu condamner banque a restituer syndic somme versee sci arret enonce contrat juin `` cession creance nantissement marche prive est-a-dire gage confere a beneficiaire droit propriete celle-ci '' .attendu determinant ainsi alors contrat enoncait nantissement marche representait garantie subsidiaire banque beneficiait cession creance reguliere

## Build document summarizer

Once you'll build the tf-idf matrix for the text corpus, you will use the tf-idf of each word to compute a value for each sentence. The n top sentences for a document will be used to represent the document.

In [119]:
from nltk.tokenize import sent_tokenize

print(df['text'][52075])
print(X[52075])

sent_tokenize(X[52075])




   SUR LE MOYEN UNIQUE PRIS EN SES DEUX BRANCHES :
    ATTENDU QU'IL EST FAIT GRIEF A L'ARRET CONFIRMATIF ATTAQUE D'AVOIR DECLARE COMMERCIAL LE BAIL LITIGIEUX, CONSTATE L'ABSENCE D'UNE EXPLOITATION ARTISANALE OU COMMERCIALE ET LE CHANGEMENT DE DESTINATION DES LIEUX, VALIDE LE CONGE DELIVRE PAR DAME Y..., PROPRIETAIRE, A MATHIEU, PRENEUR, ET PRONONCE L'EXPULSION DE CELUI-CI, ALORS, SELON LE MOYEN, QUE, D'UNE PART, LEDIT PRENEUR AVAIT, DANS SES CONCLUSIONS D'APPEL DEMEUREES SANS REPONSE, SOULIGNE QUE L'ALLUSION AU CARACTERE COMMERCIAL D'UNE PARTIE DES LOCAUX ETAIT "SANS INCIDENCE" DES LORS QUE LE CONTRAT NE COMPORTAIT AUCUNE DES CLAUSES USUELLES D'UN BAIL COMMERCIAL, SPECIALEMENT LA NECESSITE D'EXPLOITER DANS LES LIEUX UN FONDS DE COMMERCE ET DE GARNIR LES LOCAUX LOUES DE MATERIEL ET DE MARCHANDISES SUFFISANTS POUR GARANTIR LE PAYEMENT DES LOYERS ET QUE, D'AUTRE PART, LA VOLONTE DES PARTIES DE CONCLURE UN BAIL COMMERCIAL "NE SUFFISANT PAS A LA LEGISLATION SUR LES BAUX COMMERCIAUX", LES 

["moyen unique pris deux branches attendu fait grief a arret confirmatif attaque avoir declare *_* bail litigieux constate absence exploitation artisanale *_* changement destination lieux valide conge delivre dame proprietaire a mathieu preneur prononce expulsion celui-ci alors selon moyen part ledit preneur conclusions appel demeurees sans reponse souligne allusion caractere *_* partie locaux etait `` sans incidence '' lors contrat comportait aucune clauses usuelles bail *_* specialement necessite exploiter les lieux fonds commerce garnir les locaux loues materiel marchandises suffisants garantir payement loyers autre part volonte parties conclure bail *_* `` suffisant a legislation les baux commerciaux '' les juges fond devaient rechercher quelles ete fait intention parties nature utilisation lieux .",
 'attendu caractere location determine non usage locataire a pu faire chose louee destination les parties entendu donner lors conclusion contrat .',
 "adoption motifs cour appel consta

In [68]:
count_vect = CountVectorizer()
count_vect = count_vect.fit(X)
freq_term_matrix = count_vect.transform(X)

tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)
    
doc_freq_term = count_vect.transform(X)
doc_tfidf_matrix = tfidf.transform(doc_freq_term)

['\nSUR LE PREMIER MOYEN, PRIS DE LA VIOLATION DES ARTICLES 541 DU CODE DE PROCEDURE CIVILE, 24-A DU LIVRE 1ER DU CODE DU TRAVAIL ET 7 DE LA LOI DU 20 AVRIL 1810, DEFAUT DE MOTIFS, DEFAUT DE REPONSE AUX CONCLUSIONS, MANQUE DE BASE LEGALE ;\n\nATTENDU QU\'IL RESULTE DU JUGEMENT ATTAQUE QUE OTT, AU SERVICE DE LA COMPAGNIE INTERNATIONALE DES WAGONS-LITS DEPUIS LE 12 MARS 1959, ARRETA SON TRAVAIL POUR MALADIE LE 7 FEVRIER 1968 ;\n\n QU\'EXAMINE PAR LE MEDECIN DE L\'ENTREPRISE LE 7 MARS SUIVANT IL FUT DECLARE DEFINITIVEMENT INAPTE A SON EMPLOI ;\n\n QUE LE MEME JOUR LA COMPAGNIE INFORMA OTT QUE LE CONTRAT DE TRAVAIL SE TROUVAIT ROMPU ET QUE SES SERVICES PRENAIENT FIN LE 6 MARS 1968 AU SOIR ;\n\n QUE LES 15 MARS ET 9 AVRIL 1968 OTT SIGNA DEUX "RECUS POUR SOLDE DE TOUS COMPTES", L\'UN POUR LES SALAIRES ET INDEMNITES AFFERENTES A LA PERIODE DU 1ER AU 6 MARS 1968, L\'AUTRE POUR L\'INDEMNITE DE CONGEDIEMENT CALCULEE SELON LA CONVENTION COLLECTIVE APPLICABLE ;\n\n QUE OTT ASSIGNA LA COMPAGNIE EN 

## STEP 3 : Split Data

In [76]:
from sklearn.model_selection import train_test_split

print("Y shape", type(y),y.shape)
print("X shape", type(X),X.shape)
print(X.head())

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size = 0.2 ,random_state=42 )

print(X_train.head())

Y shape <class 'pandas.core.series.Series'> (500,)
X shape <class 'pandas.core.series.Series'> (500,)
45070    moyen unique pris premiere branche vu les arti...
19285    premier moyen vu article code securite*_* les ...
15478    moyen unique vu article code travail .attendu ...
32738    cour cassation chambre *_* a rendu arrêt suiva...
42239    moyen unique pris trois branches attendu fait ...
Name: text, dtype: object
6554     premier moyen pris violation fausse applicatio...
49350    attendu selon les énonciations arrêt confirmat...
37708    moyen unique pris diverses branches attendu ar...
28547    vu connexité joint les pourvois ns n.n etn.n ....
33179    cour cassation chambre *_* a rendu arrêt suiva...
Name: text, dtype: object


## STEP 5 : Train the model

In [108]:
#just for testing 
mini_X_train = X_train.sample(400)
mini_y_train = y_train.sample(400)
print(mini_X_train.shape)
print(mini_y_train.shape)
mini_X_train.head()
#mini_y_train.head()

ValueError: Cannot take a larger sample than population when 'replace=False'

In [78]:
#just for testing 
mini_X_train.loc[4710]

'moyen unique pris violation articles code *_* ainsi article 7 loi avril defaut contradiction motifs manque base legale . attendu fait grief a arret attaque statuant demande rappel salaires formee dame abadie contre societe a p avoir decide faute employeur avoir fourni delai mois toutes explications toutes justifications utiles sujet mode calcul utilise indemnites dues a dame abadie les mois janvier fevrier nil devrait payer forfaitairement somme f a titre complement salaire alors dame abadie indique les bases calcul remuneration sorte cour appel operant renversement fardeau preuve a fixe equite salaire peut resulter accord parties texte legislatif reglementaire . attendu les juges fond relevent embauchee septembre surveiller clientele magasins prisunic bordeaux libourne bouscat recu employeur temoignage satisfaction dame abadie a ete congediee fevrier peu apres avoir demande fevrier verifications diminution expliquait montant salaire mois janvier . societe laisse sans reponse les crit

In [112]:
### A RAJOUTER #####
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
type(X_train_counts)
#print(X_train_counts)
tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_transformed = tf_transformer.transform(X_train_counts)
#print(type(X_train_transformed))
#print(X_train_transformed)

X_test_counts = count_vect.transform(X_test)
X_test_transformed = tf_transformer.transform(X_test_counts)
#print(type(X_test_transformed))
#print(X_test_transformed)

labels = LabelEncoder()
y_train_labels_fit = labels.fit(y_train)
y_train_labels_trf = labels.transform(y_train)


print(labels.classes_)

['COMMERCIALE' 'SOCIALE']


In [115]:
#### A RAJOUTER ####
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score

linear_svc = LinearSVC()
clf = linear_svc.fit(X_train_transformed,y_train_labels_trf)

calibrated_svc = CalibratedClassifierCV(base_estimator = linear_svc, cv = "prefit")
calibrated_svc.fit(X_train_transformed,y_train_labels_trf)
predicted = calibrated_svc.predict(X_test_transformed)
print(X_test_transformed.shape)

print("pred",predicted)
print("",y_test.shape)
print("test",y_test[0:100])

acc = accuracy_score(y_test, predicted)
print("accuracy",acc)

(100, 12114)
pred [1 0 0 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 1 0 1 1 1 0 0 1 0 0 0 1 1 1
 1 0 1 1 1 1 0 1 1 0 0 0 0 1 1 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1
 0 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 0 0 1 1 1]
 (100,)
test [1 0 0 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 0 1 0 0 0 1 1 1
 1 0 1 1 1 1 0 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1
 0 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 0 0 1 1 1]
accuracy 0.98


In [52]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [107]:
""" rnd_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rnd_clf.fit(mini_X_train, mini_y_train)
#y_pred = rnd_clf.predict(X_test) """

' rnd_clf = RandomForestClassifier(random_state=42, n_estimators=100)\nrnd_clf.fit(mini_X_train, mini_y_train)\n#y_pred = rnd_clf.predict(X_test) '

In [153]:
rnd_clf = RandomForestClassifier(random_state=42, n_estimators=100)

ada_clf= AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=100,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42)

gdb_clf = GradientBoostingClassifier(random_state=42, n_estimators=100)

voting_clf = VotingClassifier(
    estimators=[('rfc', rnd_clf), ('abc', ada_clf), ('gbr', gdb_clf)],
    voting='hard')

In [106]:
"""for clf in (rnd_clf, ada_clf, gdb_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred)) """

'for clf in (rnd_clf, ada_clf, gdb_clf, voting_clf):\n    clf.fit(X_train, y_train)\n    y_pred = clf.predict(X_test)\n    print(clf.__class__.__name__, accuracy_score(y_test, y_pred)) '