# Dimensionality reduction

In [1]:
from sklearn.model_selection import cross_validate
from sklearn.base import BaseEstimator
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2, RFECV, RFE
from sklearn.feature_selection._base import SelectorMixin
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np 
import utils

## Helper functions

In [2]:
def sparsity_scorer(clf, X=None, y=None, scale_coef=1.0):
    """
    Measure how sparse a fitted logistic regression is.

    Works on a bare LogisticRegression or on a Pipeline that contains one.
    For a Pipeline, the fraction of features retained by every
    feature-selection step (any SelectorMixin) is multiplied into the
    fraction of non-zero coefficients, so that features discarded by a
    selector count as unused dimensions too.

    Parameters:
    -----------
        clf: a fitted LogisticRegression or a fitted Pipeline
        X: ndarray (unused, only forwarded on the recursive call)
        y: ndarray (unused, only forwarded on the recursive call)
        scale_coef: float, factor applied to the fraction of non-zero
                    coefficients (used internally to carry the fraction
                    of features kept by upstream selectors)

    Returns
    -------
        sparsity: float in [0., 1.]
                  0. means every dimension is used,
                  1. means no dimension is used

    Raises
    ------
        RuntimeError: if clf is neither a Pipeline nor a
                      LogisticRegression, or if the Pipeline holds no
                      LogisticRegression step
    """
    if not isinstance(clf, Pipeline):
        if not isinstance(clf, LogisticRegression):
            raise RuntimeError(f"Cannot estimate the sparsity of a {type(clf)}")
        # Base case: fraction of non-zero weights, rescaled by what the
        # upstream selectors already removed.
        used_mask = clf.coef_.ravel() != 0
        return 1.0 - used_mask.sum() / used_mask.size * scale_coef

    kept_fraction = scale_coef
    logreg = None
    for step in clf.named_steps.values():
        if isinstance(step, SelectorMixin):
            support = step.get_support()
            kept_fraction *= support.sum() / support.size
        elif isinstance(step, LogisticRegression):
            # If several are present, the last one wins.
            logreg = step

    if logreg is None:
        raise RuntimeError("Cannot estimate the sparsity of the pipeline")
    return sparsity_scorer(logreg, X, y, kept_fraction)


class LinearPipeline(BaseEstimator):
    """
    Wrap a pipeline so that it exposes the ``coef_`` of one of its steps.

    Selectors that rank features from ``coef_`` need that attribute on the
    estimator they are given; a Pipeline does not expose the coefficients
    of its inner linear model, so this wrapper forwards them.

    Parameters
    ----------
        pipeline: the Pipeline to wrap
        clf_key: name of the step (in ``pipeline.named_steps``) whose
                 ``coef_`` must be exposed
    """

    def __init__(self, pipeline, clf_key):
        super().__init__()
        self.pipeline = pipeline
        self.clf_key = clf_key

    def fit(self, X, y=None):
        """Fit the wrapped pipeline and return this wrapper.

        Returning ``self`` (rather than the inner pipeline) follows the
        scikit-learn estimator convention, so the object obtained from
        ``fit`` still exposes ``coef_``.
        """
        self.pipeline.fit(X, y)
        return self

    def predict(self, X):
        """Delegate prediction to the wrapped pipeline."""
        return self.pipeline.predict(X)

    def transform(self, X):
        """Delegate ``transform`` to the wrapped pipeline."""
        return self.pipeline.transform(X)

    def score(self, X, y):
        """Delegate scoring to the wrapped pipeline."""
        return self.pipeline.score(X, y)

    @property
    def coef_(self):
        """Coefficients of the step named ``clf_key`` in the pipeline."""
        return self.pipeline.named_steps[self.clf_key].coef_

## Easy filter template

In [None]:
def simple_model(ngram_range=(1, 2), min_df=2, num_folds=4, num_jobs=-1):
    """
    A simple baseline model with bag-of-words on n-grams and
    a logistic regression

    Parameters
    ----------
        ngram_range: tuple (min_n, max_n) given to the CountVectorizer
        min_df: minimum document frequency given to the CountVectorizer
        num_folds: number of cross-validation folds
        num_jobs: number of parallel jobs used for the cross-validation

    Side effects
    ------------
        Prints the cross-validation and test accuracies and writes the
        vocabulary, one term per line, to ``words_simple.txt``.
    """

    print("===> [BEGIN] Simple model")

    # NOTE(review): these strings are placeholders; replace them with the
    # loaded datasets (objects exposing .data and .target) before running,
    # otherwise the attribute accesses below fail.
    traindata, unsupdata, testdata = "data_train", "data_unsup", "data_test"  # load the data you need !
    pipe = Pipeline(
        [
            ("vectorizer", CountVectorizer(ngram_range=ngram_range, min_df=min_df)),
            ("Max_abs", MaxAbsScaler()),
            ("Logistic_reg", LogisticRegression(solver="liblinear")),
        ]
    )
    # Honour the num_folds / num_jobs arguments instead of hard-coded values.
    scores = cross_validate(
        pipe,
        traindata.data,
        traindata.target,
        cv=num_folds,
        n_jobs=num_jobs,
        return_train_score=True,
    )
    pipe.fit(traindata.data, traindata.target)
    print(scores['test_score'])

    print(pipe.score(testdata.data, testdata.target))

    print("Saving the vocabulary in words_simple.txt")
    vocabulary = pipe["vectorizer"].get_feature_names_out()
    # Actually write the file announced by the message above.
    with open("words_simple.txt", "w", encoding="utf-8") as vocab_file:
        vocab_file.write("\n".join(vocabulary))

    print("===> [END] Simple model")

## Filter 

In [None]:
def imdb_filter(num_features, C, ngram_range, min_df, num_jobs=-1):
    """
    Univariate filtering

    Keeps the ``num_features`` best features according to the chi2 score
    and trains a logistic regression on them.

    Parameters
    ----------
        num_features: number of features kept by SelectKBest
        C: inverse regularization strength of the LogisticRegression
        ngram_range: tuple (min_n, max_n) given to the CountVectorizer
        min_df: minimum document frequency given to the CountVectorizer
        num_jobs: number of parallel jobs used for the cross-validation

    Side effects
    ------------
        Prints the train/test accuracies, the cross-validated score and
        the selected terms ordered by increasing chi2 score.
    """
    print("===> [BEGIN] Filter")
    # NOTE(review): these strings are placeholders; replace them with the
    # loaded datasets (objects exposing .data and .target) before running.
    traindata, unsupdata, testdata = "data_train", "data_unsup", "data_test"  # load the data you need !
    pipe = Pipeline(
        [
            (
                "vectorizer",
                CountVectorizer(ngram_range=ngram_range, min_df=min_df, binary=True),
            ),
            ("filter", SelectKBest(chi2, k=num_features)),
            ("Max_abs", MaxAbsScaler()),
            # C was previously accepted but never forwarded to the classifier.
            ("Logistic_reg", LogisticRegression(C=C, solver="liblinear")),
        ]
    )
    pipe.fit(traindata.data, traindata.target)

    acc_train = pipe.score(traindata.data, traindata.target)
    acc_test = pipe.score(testdata.data, testdata.target)

    print("acc_train= ", acc_train, "acc_test = ", acc_test)

    scores = cross_val_score(
        pipe,
        traindata.data,
        traindata.target,
        n_jobs=num_jobs,
        verbose=0,
    )
    print(f"Real risk by {scores.size}-fold CV : {scores.mean():.2} (+/- {scores.std():.2})")

    # Reminder: chi2 scores each term against the class label. SelectKBest
    # keeps the terms with the highest scores, which reduces the
    # dimension of the input (the number of features).
    counter = pipe["vectorizer"]
    chi2_filter = pipe["filter"]

    # Boolean mask with one entry per feature, True for the kept ones.
    selected_dims = chi2_filter.get_support()
    # chi2 scores of the kept features only.
    selected_scores = chi2_filter.scores_[selected_dims]
    # Terms of the vocabulary that were kept.
    selected_terms = np.array(counter.get_feature_names_out())[selected_dims]

    # Report the kept terms by increasing chi2 score.
    sorted_idx = np.argsort(selected_scores)
    print(selected_terms[sorted_idx])
    print("===> [Done] Filter")


## Embedded

In [None]:
def imdb_embedded(ngram_range=(1, 2), min_df=2, C=0.5, n_folds=5, num_jobs=-1):
    """
    LogisticRegression with L1 penalty

    The L1 penalty drives some coefficients to exactly zero, so the
    features with a non-zero weight form the selected vocabulary.

    Parameters
    ----------
        ngram_range: tuple (min_n, max_n) given to the CountVectorizer
        min_df: minimum document frequency given to the CountVectorizer
        C: inverse regularization strength of the LogisticRegression
        n_folds: currently unused
        num_jobs: currently unused

    Side effects
    ------------
        Prints the sparsity, the train/test accuracies, the vocabulary
        sizes and the selected terms with their weights, ordered by
        increasing weight.
    """
    print("===> [BEGIN] Embedded")
    # NOTE(review): these strings are placeholders; replace them with the
    # loaded datasets (objects exposing .data and .target) before running.
    traindata, unsupdata, testdata = "data_train", "data_unsup", "data_test"  # load the data you need !
    pipe = Pipeline(
        [
            (
                "vectorizer",
                CountVectorizer(ngram_range=ngram_range, min_df=min_df, binary=True),
            ),
            ("Max_abs", MaxAbsScaler()),
            ("Logistic_reg", LogisticRegression(C=C, penalty="l1", solver="liblinear")),
        ]
    )
    print("Fitting the classifier")
    pipe.fit(traindata.data, traindata.target)

    # Sparsity as returned by sparsity_scorer: 0 means every dimension of
    # the vocabulary is used, 1 means none is.
    sparsity = sparsity_scorer(pipe)

    print(f"Sparsity (fraction of zeros) : {100*sparsity:.2f}%")

    acc_train = pipe.score(traindata.data, traindata.target)
    acc_test = pipe.score(testdata.data, testdata.target)
    print(
        f"""Train acc : {100*acc_train:.2f}%
                  Test acc : {100*acc_test:.2f}%"""
    )
    # Extract the selected vocabulary. The classifier step is registered
    # as "Logistic_reg" in the pipeline above (looking it up as "clf"
    # raised a KeyError).
    vocabulary = np.array(pipe.named_steps["vectorizer"].get_feature_names_out())
    all_weights = pipe.named_steps["Logistic_reg"].coef_.ravel()
    selected_dims = all_weights != 0
    selected_terms = vocabulary[selected_dims]
    weights = all_weights[selected_dims]
    sorted_idx = np.argsort(weights)

    print(f"Original vocabulary size : {len(vocabulary)}")
    print(f"Selected vocabulary size : {len(weights)}")
    print(selected_terms[sorted_idx], weights[sorted_idx])
    print("===> [Done] Embedded")

## Wrapper

In [None]:
def imdb_wrapper(num_features=10000, step=10000, ngram=2, num_jobs=-1):
    """
    Recursive feature elimination with logistic regression as estimator

    Parameters
    ----------
        num_features: number of features RFE must end up with
        step: value forwarded to RFE's ``step`` argument
        ngram: upper bound of the n-gram range, the range is (1, ngram)
        num_jobs: forwarded to ``preprocess_imdb`` when the cache is missing

    Side effects
    ------------
        Reads the vectorized data from the ``imdb_wrapper.pkz`` cache, or
        builds and saves it when ``utils.load_cache`` raises RuntimeError.
        Prints the accuracies, the vocabulary sizes and the selected
        terms with their weights, ordered by increasing weight.
    """
    print("===> [BEGIN] imdb_wrapper")
    cache_name = "imdb_wrapper.pkz"
    try:
        X_train, y_train, X_test, y_test, vocabulary = utils.load_cache(
            cache_name, ["X_train", "y_train", "X_test", "y_test", "vocabulary"]
        )
    except RuntimeError:
        # Cache miss: rebuild the vectorized data and store it.
        # NOTE(review): preprocess_imdb is not defined in the visible part
        # of this file; confirm it is available before running.
        traindata, _, testdata = preprocess_imdb(num_jobs=num_jobs)

        print("Vectorizing the data")
        vectorizer = CountVectorizer(ngram_range=(1, ngram), min_df=2)
        X_train = vectorizer.fit_transform(traindata.data)
        y_train = traindata.target
        X_test = vectorizer.transform(testdata.data)
        y_test = testdata.target
        vocabulary = np.array(vectorizer.get_feature_names_out())

        utils.save_cache(
            cache_name,
            {
                "X_train": X_train,
                "y_train": y_train,
                "X_test": X_test,
                "y_test": y_test,
                "vocabulary": vocabulary,
            },
        )
    print(f"Original vocabulary size : {len(vocabulary)}")

    # RFE ranks features from the estimator's coef_, which a bare Pipeline
    # does not expose; LinearPipeline forwards the "clf" step's coef_.
    classifier = Pipeline(
        [("scaler", MaxAbsScaler()), ("clf", LogisticRegression(solver="liblinear"))]
    )
    classifier = LinearPipeline(classifier, "clf")

    selector = RFE(classifier, n_features_to_select=num_features, step=step, verbose=1)
    print("Performing the recursive feature elimination")
    selector.fit(X_train, y_train)

    # Compute its metrics
    acc_train = selector.score(X_train, y_train)
    acc_test = selector.score(X_test, y_test)
    print(
        f"""Train acc : {100*acc_train:.2f}%
                  Test acc : {100*acc_test:.2f}%"""
    )

    # The final estimator was fitted on the selected features only, so its
    # weights line up with the selected terms.
    selected_dims = selector.get_support()
    selected_terms = vocabulary[selected_dims]
    weights = selector.estimator_.pipeline.named_steps["clf"].coef_.ravel()
    sorted_idx = np.argsort(weights)

    print(f"Original vocabulary size : {len(vocabulary)}")
    print(f"Selected vocabulary size : {len(weights)}")
    # Report the retained terms with their weights, as imdb_embedded does.
    print(selected_terms[sorted_idx], weights[sorted_idx])
    print("===> [END] imdb_wrapper")