In [1]:
import pandas as pd
import os

In [5]:
# Number of rows randomly sampled from the Sentiment140 training frame in data().
NB_DATA_140 = 1000

In [6]:
def data(stop_words, lemmatization, negation):
    """
    Load a sample of the Sentiment140 dataset and the web-scraped tweets.

    The three flags select which pre-processed pickle is read: each truthy
    flag appends its tag ("_stop", "_lemm", "_neg", in that order) to the
    base file name, for both the Sentiment140 file and the web file.

    Returns a tuple ``(X_140, y_140, X_web)``: the ``text`` and ``sentiment``
    columns of the sampled Sentiment140 frame and the ``Text`` column of the
    web frame, each converted to a list.
    """
    # The same suffix applies to both files, so build it once.
    suffix = ""
    for enabled, tag in (
        (stop_words, "_stop"),
        (lemmatization, "_lemm"),
        (negation, "_neg"),
    ):
        if enabled:
            suffix += tag

    path_140 = os.path.join("data", "sentiment140", "train" + suffix + ".bz2")
    # Fixed random_state keeps the sample identical from one run to the next.
    sample_140 = pd.read_pickle(path_140).sample(NB_DATA_140, random_state=1234)
    sample_140 = sample_140.reset_index(drop=True)

    path_web = os.path.join("data", "web", "web" + suffix + ".bz2")
    web_frame = pd.read_pickle(path_web)

    texts_140 = sample_140.text.to_list()
    labels_140 = sample_140.sentiment.to_list()
    texts_web = web_frame.Text.to_list()

    return texts_140, labels_140, texts_web

In [None]:
class Modelisation:
    """Fit a vectorizer + classifier pipeline and evaluate it on two datasets.

    On construction the vectorizer is fitted on the labeled and unlabeled
    texts together, the labeled data is split 80/20 into train/test, the model
    is fitted on the train part, and predictions are computed for the labeled
    test part and for the hand-labeled subset ``X_unlabeled_cat``.

    Parameters
    ----------
    X_labeled, y_labeled : list
        Labeled texts and their targets.
    X_unlabeled : list
        Unlabeled texts, used only to fit the vectorizer vocabulary.
    X_unlabeled_cat, y_unlabeled_cat
        Texts and targets of the second evaluation set.
    vectorizer
        Object exposing ``fit`` and ``transform``.
    model
        Estimator exposing ``fit`` and ``predict`` (``predict_proba`` is also
        required by ``show_conf_matrix``).
    scaling : bool
        When true, features are scaled with ``StandardScaler(with_mean=False)``.
    """

    def __init__(self, X_labeled, y_labeled, X_unlabeled, X_unlabeled_cat, y_unlabeled_cat, vectorizer, model, scaling=True):
        # The vocabulary is learned on both corpora (list concatenation).
        vectorizer.fit(X_labeled + X_unlabeled)
        X_labeled = vectorizer.transform(X_labeled)

        scaler = None
        if scaling:
            # NOTE(review): the scaler is fitted before the train/test split, so
            # test rows contribute to the scaling statistics. Kept as-is to
            # preserve existing results; consider fitting on the train part only.
            # with_mean=False: presumably because the vectorizer output is sparse.
            scaler = StandardScaler(with_mean=False)
            X_labeled = scaler.fit_transform(X_labeled)

        X_train_labeled, X_test_labeled, y_train_labeled, y_test_labeled = train_test_split(
            X_labeled, y_labeled, train_size=0.80, random_state=1234
        )

        model.fit(X_train_labeled, y_train_labeled)
        y_pred_labeled = model.predict(X_test_labeled)

        # Apply the same transformations to the second evaluation set.
        X_unlabeled_cat = vectorizer.transform(X_unlabeled_cat)
        if scaling:
            X_unlabeled_cat = scaler.transform(X_unlabeled_cat)
        y_pred_unlabeled_cat = model.predict(X_unlabeled_cat)

        # The train split is stored so that get_data() can return it.
        self.X_train_labeled = X_train_labeled
        self.y_train_labeled = y_train_labeled
        self.X_test_labeled = X_test_labeled
        self.y_test_labeled = y_test_labeled
        self.y_pred_labeled = y_pred_labeled
        self.X_unlabeled_cat = X_unlabeled_cat
        self.y_unlabeled_cat = y_unlabeled_cat
        self.y_pred_unlabeled_cat = y_pred_unlabeled_cat
        self.vectorizer = vectorizer
        self.model = model
        self.scaling = scaling
        # Always defined; None when scaling is disabled.
        self.scaler = scaler

    def get_data(self):
        """Return ``(X_train, X_test, y_train, y_test)`` of the labeled split."""
        return (
            self.X_train_labeled,
            self.X_test_labeled,
            self.y_train_labeled,
            self.y_test_labeled,
        )

    def show_conf_matrix(self, X_test, y_test, y_pred):
        """Plot the confusion matrix and print accuracy, balanced accuracy and ROC AUC.

        ``X_test`` must already be vectorized (and scaled, if scaling is on).
        ``y_pred`` are the model predictions matching ``y_test``.
        """
        # `metrics` is assumed to be sklearn.metrics. plot_confusion_matrix was
        # removed in scikit-learn 1.2, so prefer the display class when it exists
        # and fall back to the legacy function on older versions.
        display_cls = getattr(metrics, "ConfusionMatrixDisplay", None)
        if display_cls is not None and hasattr(display_cls, "from_estimator"):
            display_cls.from_estimator(self.model, X_test, y_test, cmap='Blues')
        else:
            metrics.plot_confusion_matrix(self.model, X_test, y_test, cmap='Blues')
        plt.show()

        # Score functions take (y_true, y_pred); balanced accuracy is not
        # symmetric, so the order matters.
        sc_accuracy = metrics.accuracy_score(y_test, y_pred)
        sc_balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_pred)
        # Column 1 of predict_proba is used as the score (binary target assumed).
        sc_roc_auc = metrics.roc_auc_score(y_test, self.model.predict_proba(X_test)[:, 1])

        print(f"Accuracy : {sc_accuracy:.4f}")
        print(f"Balanced accuracy : {sc_balanced_accuracy:.4f}")
        print(f"ROC AUC : {sc_roc_auc:.4f}")

    def show_conf_matrix_labeled(self):
        """Show the metrics for the labeled test split."""
        print("\nDonnées labellisées de test (Sentiment140)")
        self.show_conf_matrix(self.X_test_labeled, self.y_test_labeled, self.y_pred_labeled)

    def show_conf_matrix_unlabeled(self):
        """Show the metrics for the second (hand-labeled) evaluation set."""
        print("\nDonnées non labellisées (Webscraping labellisé à la main)")
        self.show_conf_matrix(self.X_unlabeled_cat, self.y_unlabeled_cat, self.y_pred_unlabeled_cat)

    def predict(self, X):
        """Vectorize (and scale, if enabled) raw texts ``X`` and return the model predictions."""
        X = self.vectorizer.transform(X)
        if self.scaling:
            X = self.scaler.transform(X)
        return self.model.predict(X)