## Packages

In [3]:
from typing import List
from lxml import etree
from preTraitements.xml import get_X_Y_from_root
from preTraitements.xml import get_tree_root_from_file

### Récupération des corpus

In [5]:
tree_train, root_train = get_tree_root_from_file("./corpus/train_deft09_parlement_appr.xml/deft09_parlement_appr_fr.xml")
X_train, y_train = get_X_Y_from_root(root_train)

## Charger les données

In [21]:
from sklearn.datasets import fetch_20newsgroups

categories = [
    "sci.crypt",
    "sci.electronics",
    "sci.med",
    "sci.space",
]

data_train = fetch_20newsgroups(
    subset="train",
    categories=categories,
    shuffle=True,
)

data_test = fetch_20newsgroups(
    subset="test",
    categories=categories,
    shuffle=True,
)

## Extraction des features 
### Nombre motset nombre phrases

In [22]:
posts = data_train.data
count_word_sent = [{'length': len(text),
                 'num_sentences': text.count('.')}
                for text in posts]

### Vectoriser les dictionnaires
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html#sklearn.feature_extraction.DictVectorizer

In [67]:
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.},
]

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()

vec.fit_transform(measurements).toarray()




vec.get_feature_names_out()

array(['city=Dubai', 'city=London', 'city=San Francisco', 'temperature'],
      dtype=object)

In [102]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
array_word_sent = v.fit_transform(count_word_sent)

### Vectorizer

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer()
data_vec=vectorizer.fit_transform(data_train.data)
vectorizer.get_feature_names_out()
#data_vec.toarray()

array(['00', '000', '0000', ..., 'ête', 'íålittin', 'ýé'], dtype=object)

### TF-IDF

In [81]:
from sklearn.feature_extraction.text import TfidfTransformer

### Features Union


In [117]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""
    def __init__(self, value=None):
        TransformerMixin.__init__(self)
        self.value = value
        
    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{'length': len(text),
                 'num_sentences': text.count('.')}
                for text in posts]


In [119]:
combined_features = FeatureUnion([("nb_word_sent", TextStats(data_train.data)),("vec",CountVectorizer())])
combined_features

## Trouver les meilleurs hyperparamètres pour chaque modèle

### Naïves Bayes

In [120]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
clf = GaussianNB()
# Do grid search over k, n_components and C:

pipeline_gaussianNB = Pipeline([("features", combined_features), ("gnb", clf)])
#("tfdif",TfidfTransformer(smooth_idf=False))
param_grid = dict(
    features__vec__ngram_range=((1, 1), (1, 2)),
    tfidf__use_idf=(True,False),
    gnb__var_smoothing=[0.001, 0.01, 0.1, 0.9, 10, 100, 1000],
)

grid_search = GridSearchCV(pipeline_gaussianNB, param_grid=param_grid, verbose=10)
grid_search.fit(data_train.data, data_train.target)
print(grid_search.best_estimator_)

Fitting 5 folds for each of 28 candidates, totalling 140 fits
[CV 1/5; 1/28] START features__vec__ngram_range=(1, 1), gnb__var_smoothing=0.001, tfidf__use_idf=True


ValueError: Invalid parameter 'tfidf' for estimator Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('nb_word_sent',
                                                 TextStats(value=['From: '
                                                                  'al@escom.com '
                                                                  '(Al '
                                                                  'Donaldson)\n'
                                                                  'Subject: '
                                                                  'Re: Once '
                                                                  'tapped, '
                                                                  'your code '
                                                                  'is no good '
                                                                  'any more.\n'
                                                                  'Reply-To: '
                                                                  'al@escom.COM '
                                                                  '(Al '
                                                                  'Donaldson)\n'
                                                                  'Organization: '
                                                                  'ESCOM '
                                                                  'Corp., '
                                                                  'Oakton VA '
                                                                  '(USA)\n'
                                                                  'Distribution: '
                                                                  'na\n'
                                                                  'Lines: 16\n'
                                                                  '\n'
                                                                  'amolitor@nmsu.edu '
                                                                  '(Andrew '...
                                                                  '----------------------------------------------------------------------------\n'
                                                                  'Gordon '
                                                                  'Banks  '
                                                                  'N3JXP      '
                                                                  '| '
                                                                  '"Skepticism '
                                                                  'is the '
                                                                  'chastity of '
                                                                  'the '
                                                                  'intellect, '
                                                                  'and\n'
                                                                  'geb@cadre.dsl.pitt.edu   '
                                                                  '|  it is '
                                                                  'shameful to '
                                                                  'surrender '
                                                                  'it too '
                                                                  'soon." \n'
                                                                  '----------------------------------------------------------------------------\n', ...])),
                                                ('vec', CountVectorizer())])),
                ('gnb', GaussianNB())]). Valid parameters are: ['memory', 'steps', 'verbose'].

array([0, 1, 3, ..., 3, 3, 1])

### Linear SV 

## Comparer les deux modèles

In [4]:

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

iris = load_iris()

X, y = iris.data, iris.target

# This dataset is way too high-dimensional. Better do PCA:
pca = PCA(n_components=2)

# Maybe some original features were good, too?
selection = SelectKBest(k=1)

# Build estimator from PCA and Univariate selection:

combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

# Use combined features to transform dataset:
#X_features = combined_features.fit(X, y).transform(X)
#print("Combined space has", X_features.shape[1], "features")

svm = SVC(kernel="linear")

# Do grid search over k, n_components and C:

pipeline = Pipeline([("features", combined_features), ("svm", svm)])

param_grid = dict(
    features__pca__n_components=[1, 2, 3],
    features__univ_select__k=[1, 2],
    svm__C=[0.1, 1, 10],
)

grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5; 1/18] START features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1
[CV 1/5; 1/18] END features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1;, score=0.933 total time=   0.0s
[CV 2/5; 1/18] START features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1
[CV 2/5; 1/18] END features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1;, score=0.933 total time=   0.0s
[CV 3/5; 1/18] START features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1
[CV 3/5; 1/18] END features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1;, score=0.867 total time=   0.0s
[CV 4/5; 1/18] START features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1
[CV 4/5; 1/18] END features__pca__n_components=1, features__univ_select__k=1, svm__C=0.1;, score=0.933 total time=   0.0s
[CV 5/5; 1/18] START features__pca__n_components=1, features__univ_select__k=1, svm__C=

[CV 4/5; 14/18] END features__pca__n_components=3, features__univ_select__k=1, svm__C=1;, score=0.967 total time=   0.0s
[CV 5/5; 14/18] START features__pca__n_components=3, features__univ_select__k=1, svm__C=1
[CV 5/5; 14/18] END features__pca__n_components=3, features__univ_select__k=1, svm__C=1;, score=1.000 total time=   0.0s
[CV 1/5; 15/18] START features__pca__n_components=3, features__univ_select__k=1, svm__C=10
[CV 1/5; 15/18] END features__pca__n_components=3, features__univ_select__k=1, svm__C=10;, score=1.000 total time=   0.0s
[CV 2/5; 15/18] START features__pca__n_components=3, features__univ_select__k=1, svm__C=10
[CV 2/5; 15/18] END features__pca__n_components=3, features__univ_select__k=1, svm__C=10;, score=1.000 total time=   0.0s
[CV 3/5; 15/18] START features__pca__n_components=3, features__univ_select__k=1, svm__C=10
[CV 3/5; 15/18] END features__pca__n_components=3, features__univ_select__k=1, svm__C=10;, score=0.933 total time=   0.0s
[CV 4/5; 15/18] START feature

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_iris

pipeline = Pipeline([
    ('dim_reduction', PCA()),
    ('clf', LogisticRegression()),
])
parameters = [
    {
        'clf': (LogisticRegression(),),
        'clf__C': (0.001,0.01,0.1,1,10,100)
    }, {
        'clf': (RandomForestClassifier(),),
        'clf__n_estimators': (10, 30),
    }
]
grid_search = GridSearchCV(pipeline, parameters)

# some example dataset
X, y = load_iris(return_X_y=True)
X_train, X_tes, y_train, y_test = train_test_split(X, y)
grid_search.fit(X_train, y_train)