# Projet Data mining 
## Text mining et NLP
## Université Paris Saclay
## Master Innovation, Marchés et Science des Données
### Promotion 2019-2020

# Importation des packages

In [1]:
import random
import pandas as pd
import numpy as np
import re
import string

from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report,confusion_matrix, accuracy_score
from sklearn.preprocessing import FunctionTransformer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn import pipeline

from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection.univariate_selection import chi2, SelectKBest
from sklearn.externals import joblib

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier 
from sklearn import preprocessing 

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV


  from numpy.core.umath_tests import inner1d


In [3]:
#Initialiser des stopwords
np.random.seed(500)
stopWords = set(stopwords.words('english'))
stopwords_forCloud = set(STOPWORDS)
stopwords_forCloud.update(['flight', 'flights', 'Flightled', 'AmericanAir', 'VirginAmerica'])

# Importation du datasets

In [5]:

airline_kaggle = pd.read_csv('.\\Tweets_kaggle_processed.csv')
airline_kaggle.head(5)

Unnamed: 0.1,Unnamed: 0,airline_sentiment,airline,negativereason,text,label,label_doc2vec,processed_text,processed_text_length
0,0,neutral,Virgin America,,@VirginAmerica What @dhepburn said.,0,1,what say,2
1,1,positive,Virgin America,,@VirginAmerica plus you've added commercials t...,1,2,plus youve added commercial to the experience ...,8
2,2,neutral,Virgin America,,@VirginAmerica I didn't today... Must mean I n...,0,1,i didnt today must mean i need to take another...,11
3,3,negative,Virgin America,Bad Flight,@VirginAmerica it's really aggressive to blast...,-1,0,it really aggressive to blast obnoxious entert...,16
4,4,negative,Virgin America,Can't Tell,@VirginAmerica and it's a really big bad thing...,-1,0,and it a really big bad thing about it,9


In [24]:
airline_scrappe = pd.read_csv(".\\Tweet_scrape_processed.csv")
airline_scrappe = airline_scrappe.loc[:,["text","processed_text"]]
airline_scrappe = airline_scrappe.dropna()
airline_scrappe.head(5)

Unnamed: 0,text,processed_text
0,We hear you and understand the concerns you ha...,we hear you and understand the concern you hav...
1,Thank you for helping @united!,thank you for help
2,I filed a claim with United. As I out in my mi...,i file a claim with united a i out in my milea...
3,@msdunn_says,say
4,Delta is waiving change fee on tickets booked ...,delta be waive change fee on ticket book befor...


# Sélection des colonnes qui seront utilisées pour l'analyse

In [7]:

airline_sub = airline_kaggle.loc[:, ['airline_sentiment', 'airline', 'negativereason', 'processed_text']]
airline_sub.head(10)   

Unnamed: 0,airline_sentiment,airline,negativereason,processed_text
0,neutral,Virgin America,,what say
1,positive,Virgin America,,plus youve added commercial to the experience ...
2,neutral,Virgin America,,i didnt today must mean i need to take another...
3,negative,Virgin America,Bad Flight,it really aggressive to blast obnoxious entert...
4,negative,Virgin America,Can't Tell,and it a really big bad thing about it
5,negative,Virgin America,Can't Tell,seriously would pay a flight for seat that did...
6,positive,Virgin America,,yes nearly every time i fly vx this ear worm w...
7,neutral,Virgin America,,really miss a prime opportunity for men withou...
8,positive,Virgin America,,well i didnt but now i do d
9,positive,Virgin America,,it be amaze and arrive an hour early youre too...


# Fonction pour la création des graphiques de nuage de mots

In [12]:

def show_wordcloud(data):
    wordcloud = WordCloud(
        background_color = 'white',
        #contour_width = 3, 
        #contour_color = 'steelblue',
        stopwords = stopwords_forCloud,
        max_words = 80,
        max_font_size = 50, 
        scale = 3,
        random_state = 1 # chosen at random by flipping a coin; it was heads
    ).generate(str(data))

    plt.figure(1, figsize = (10, 12))
    plt.axis('off')
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.show()

# Fonction pour lemmatiser les texts 

In [13]:

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
def lemmatize_text(text):    
    lmtzr = WordNetLemmatizer().lemmatize
    text = word_tokenize(str(text))   # Init the Wordnet Lemmatizer    
    word_pos = pos_tag(text)    
    lemm_words = [lmtzr(sw[0], get_wordnet_pos(sw[1])) for sw in word_pos]
    return (' '.join(lemm_words))

# Fonction de nettoyage de données

In [14]:

def pre_process(text):       
    emoji_pattern = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"
                       "]+", flags=re.UNICODE)    

    text = emoji_pattern.sub(r'', text)                       # remove emojis       
    text = text.lower()                                       # lower all letters   
    text = re.sub(r'@[A-Za-z0-9]+', '', text)                 # remove user mentions such as @VirginAmerica    
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)       # remove URL

    text = ''.join([t for t in text if t not in string.punctuation])   # remove punctuations       
    text = ''.join([t for t in text if not t.isdigit()])   # remove numeric digits     
    text = re.sub("[^a-zA-Z0-9]", " ", text)   # only leave letters
    text = lemmatize_text(text)   # use Wordnet(lexical database) to lemmatize text     
    return text

##Test one of the messages
pre_process(airline_sub["processed_text"][89])

'why be the site down when will it be back up'

# Créaction de la colonne label qui est la colonne cible en le valorisant -1,0,1 qui représentent Negative: -1, neutral: 0, positive 1

In [4]:
#créer la colonne label qui est la colonne cible en le valorisant -1,0,1 qui représentent Negative: -1, neutral: 0, positive 1
sentiment_count = airline_sub['airline_sentiment'].value_counts()  # negative 9178, neutral 3099, positive 2363
print("airline_sub sentiment_count: ", sentiment_count)

airline_sub['label'] = airline_sub['airline_sentiment'].map({'negative': -1, 'neutral': 0, 'positive': 1}) 

#airline_sub['processed_text'] =  airline_sub['text'].apply(pre_process)  
#airline_sub['processed_text_length'] = airline_sub['processed_text'].apply(lambda x: len(word_tokenize(x)))

airline_sub.head()

airline_sub sentiment_count:  negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64


Unnamed: 0,airline_sentiment,airline,negativereason,processed_text,label
0,neutral,Virgin America,,what say,0
1,positive,Virgin America,,plus youve added commercial to the experience ...,1
2,neutral,Virgin America,,i didnt today must mean i need to take another...,0
3,negative,Virgin America,Bad Flight,it really aggressive to blast obnoxious entert...,-1
4,negative,Virgin America,Can't Tell,and it a really big bad thing about it,-1


# Fonction pour la visualisation des données

In [15]:
#La fonction de visualiser nos données
def Airline_Tweet_Visualization():
    
    # numbers of neutral, positive and neguative reviews.
    sns.set(style="darkgrid")
    sns.countplot(x = 'airline_sentiment', data = airline_sub, order = airline_sub['airline_sentiment'].value_counts().index, palette = 'Set1')
    plt.xlabel('Sentiment')
    plt.ylabel('Frequency')
    plt.show()
    
    # numbers of each airline for the frequency of sentiment
    sns.set(style="darkgrid")
    sns.countplot(x = 'airline', data = airline_sub, hue = 'airline_sentiment', order = airline_sub['airline'].value_counts().index, palette = 'Set2')
    plt.xlabel('Airline')
    plt.ylabel('Frequency')
    plt.legend().set_title('Sentiment')
    plt.show()
            
    # frequency of the different negative reasons          
    sns.set(style="darkgrid")
    sns.countplot(y = 'negativereason', data = airline_sub, order = airline_sub['negativereason'].value_counts().index, palette = 'Set3')
    plt.xlabel('Frequency')
    plt.ylabel('Negative Reason')
    plt.show()
    
    # distribution of negative reasons on each airlines
    plt.figure(figsize=(12, 6))
    sns.countplot(x = 'airline', data = airline_sub, hue = 'negativereason', palette = 'Set3', saturation = True)
    plt.xlabel('Airline companies')
    plt.ylabel('Frequency')
    plt.legend(bbox_to_anchor = (1.01, 1), loc = 2, borderaxespad = 0.1)
    plt.show() 
    
    # review length of distribution over neutral, positive and negative sentiment
    sns.boxplot(x = 'airline_sentiment', y = 'processed_text_length', data = airline_sub)    
    plt.xlabel('Sentiment')
    plt.ylabel('Processed Text Length')
    plt.ylim(0, 50)
    plt.show()   
    
    # wordcloud for negative, neutral and positive tweets
    airline_sub_neg = airline_sub.loc[airline_sub['airline_sentiment'] == 'negative']   
    airline_sub_neu = airline_sub.loc[airline_sub['airline_sentiment'] == 'neutral']    
    airline_sub_pos = airline_sub.loc[airline_sub['airline_sentiment'] == 'positive']   

    # show world cloud with negative sentiment
    print('========================world cloud with negative sentiment==================================')
    show_wordcloud(airline_sub_neg['processed_text'])   
    # show world cloud with neutral sentiment
    print('========================world cloud with neutral sentiment==================================')
    show_wordcloud(airline_sub_neu['processed_text'])     
    # show world cloud with positive sentiment  
    print('========================world cloud with positive sentiment==================================')
    show_wordcloud(airline_sub_pos['processed_text'])     
  

# rééchatillonnage des données (sur-échantillonnage et sous-échatillonage)

In [16]:
def oversampling(train_X):
    df_major_neg = train_X[train_X['label'] == -1]
    df_minor_neu = train_X[train_X['label'] == 0]
    df_minor_pos = train_X[train_X['label'] == 1]        
    major_count = len(df_major_neg)
 
    # oversample minority class
    df_minor_neu_oversampled = resample(df_minor_neu, 
                                 replace = True,              # sample with replacement
                                 n_samples = major_count,     # to match majority class 
                                 random_state = 1000)    

    df_minor_pos_oversampled = resample(df_minor_pos, 
                                 replace = True,             
                                 n_samples = major_count,   
                                 random_state = 1000)      
         
    trainX = pd.concat([df_major_neg, df_minor_neu_oversampled, df_minor_pos_oversampled])   # Combine majority class with oversampled minority class
    print("Train dataset calss distribution: \n", trainX.label.value_counts())
    trainX = shuffle(trainX, random_state = 200) 
    return trainX

def undersampling(train_X):
    df_major_neg = train_X[train_X['label'] == -1]
    df_minor_neu = train_X[train_X['label'] == 0]
    df_minor_pos = train_X[train_X['label'] == 1]        
    minor_count = len(df_minor_pos)
 
    # undersample minority class
    df_major_neg_undersampled = resample(df_major_neg, 
                                 replace = True,              # sample with replacement
                                 n_samples = minor_count,     # to match minority class
                                 random_state = 1000)    

    df_minor_neu_undersampled = resample(df_minor_neu, 
                                 replace = True,             
                                 n_samples = minor_count,   
                                 random_state = 1000)      
         
    trainX = pd.concat([df_major_neg_undersampled, df_minor_neu_undersampled, df_minor_pos])   # Combine majority class with oversampled minority class
    print("Train dataset calss distribution: \n", trainX.label.value_counts())
    trainX = shuffle(trainX, random_state = 200) 
    return trainX

# Naive Bayes

In [19]:
def airline_NB(df, feature, ngram, sample_method):    
    random.seed(999)
        
    if feature == "TF":
        vector = CountVectorizer(analyzer = 'word', ngram_range=(1, ngram)) 
    elif feature == "TFIDF":        
        vector = TfidfVectorizer(analyzer = 'word', ngram_range=(1, ngram))

    train_X, test_X, train_y, test_y = train_test_split(df, df['label'], test_size = 0.3, random_state = 222)
        
    if sample_method == "undersampling":
        train_X = undersampling(train_X)
    
    elif sample_method == "oversampling":    
        train_X = oversampling(train_X)   
          
    pipe = make_pipeline(vector, MultinomialNB(alpha = 1.0, fit_prior = True))
    pipe.fit(train_X['processed_text'], train_X['label'])          
    
    test_y_hat = pipe.predict(test_X['processed_text'])
    
    df_result = test_X.copy()
    df_result['prediction'] = test_y_hat.tolist()  
    
    df_prob = pd.DataFrame(pipe.predict_proba(test_X['processed_text']), columns = pipe.classes_)
    df_prob.index = df_result.index
    df_prob.columns = ['probability_negative', 'Probability_neutral', 'probability_positive']

    df_final = pd.concat([df_result, df_prob], axis = 1)
    
    #file_name = 'NB_' + str(ngram) + '_' + sample_method 
    #df_final.to_csv(file_name + '.csv')       
    
    print("-----------------------------------------")
    print("NB classification report -- ", "feature: %s/" %feature, "ngram: %d/" %ngram, "sample_method: %s/" %sample_method)
    print(pd.crosstab(test_y.ravel(), test_y_hat, rownames = ['True'], colnames = ['Predicted'], margins = True))      

    print("-----------------------------------------")
    print(classification_report(test_y, test_y_hat))    
    print('Macro F1 Score: {:.2f}'.format(f1_score(test_y_hat, test_y, average = 'macro')))  
    print('Weighted F1 Score: {:.2f}'.format(f1_score(test_y_hat, test_y, average = 'weighted')))    
    joblib.dump(pipe, "NB_"+"train_model.m")

# SVM

In [20]:
def airline_SVM(df, feature, ngram, sample_method):    
    random.seed(888)
        
    if feature == "TF":
        vector = CountVectorizer(analyzer = 'word', ngram_range=(1, ngram)) 
    elif feature == "TFIDF":        
        vector = TfidfVectorizer(analyzer = 'word', ngram_range=(1, ngram))
    
    train_X, test_X, train_y, test_y = train_test_split(df, df['label'], test_size = 0.2, random_state = 123)

    if sample_method == "undersampling":
        train_X = undersampling(train_X)
    
    elif sample_method == "oversampling":    
        train_X = oversampling(train_X)               
 
    pipe = make_pipeline(vector, svm.SVC(kernel = 'linear', probability = True, random_state = 123))
    pipe.fit(train_X['processed_text'], train_X['label'])     
    
    test_y_hat = pipe.predict(test_X['processed_text'])
        
    df_result = test_X.copy()
    df_result['prediction'] = test_y_hat.tolist()   
    
    df_prob = pd.DataFrame(pipe.predict_proba(test_X['processed_text']), columns = pipe.classes_)
    df_prob.index = df_result.index
    df_prob.columns = ['probability_negative', 'Probability_neutral', 'probability_positive']

    df_final = pd.concat([df_result, df_prob], axis = 1)
    
#     file_name = 'SVM_' + str(ngram) + '_' + sample_method 
#     df_final.to_csv(file_name + '.csv')       
    
    print("-----------------------------------------")
    print("SVM classification report -- ", "feature: %s/" %feature, "ngram: %d/" %ngram, "sample_method: %s/" %sample_method)
    print(pd.crosstab(test_y.ravel(), test_y_hat, rownames = ['True'], colnames = ['Predicted'], margins = True))  
    
    print("-----------------------------------------")
    print(classification_report(test_y, test_y_hat))
    
    print('Macro F1 Score: {:.2f}'.format(f1_score(test_y_hat, test_y, average = 'macro')))  
    print('Weighted F1 Score: {:.2f}'.format(f1_score(test_y_hat, test_y, average = 'weighted')))
    joblib.dump(pipe, "SVM_"+"train_model.m")

# Random Forest

In [21]:
def airline_random_forest(df, feature, ngram, sample_method):    
    random.seed(888)
        
    if feature == "TF":
        vector = CountVectorizer(analyzer = 'word', ngram_range=(1, ngram)) 
    elif feature == "TFIDF":        
        vector = TfidfVectorizer(analyzer = 'word', ngram_range=(1, ngram))
    
    train_X, test_X, train_y, test_y = train_test_split(df, df['label'], test_size = 0.2, random_state = 123)

    if sample_method == "undersampling":
        train_X = undersampling(train_X)
    
    elif sample_method == "oversampling":    
        train_X = oversampling(train_X)               
 
    pipe = make_pipeline(vector, RandomForestClassifier(n_estimators=100, random_state=0))
    pipe.fit(train_X['processed_text'], train_X['label'])     
    
    test_y_hat = pipe.predict(test_X['processed_text'])
        
    df_result = test_X.copy()
    df_result['prediction'] = test_y_hat.tolist()   
    
    df_prob = pd.DataFrame(pipe.predict_proba(test_X['processed_text']), columns = pipe.classes_)
    df_prob.index = df_result.index
    df_prob.columns = ['probability_negative', 'Probability_neutral', 'probability_positive']

    df_final = pd.concat([df_result, df_prob], axis = 1)
    
    print("-----------------------------------------")
    print("random forest -- ", "feature: %s/" %feature, "ngram: %d/" %ngram, "sample_method: %s/" %sample_method)
    print(pd.crosstab(test_y.ravel(), test_y_hat, rownames = ['True'], colnames = ['Predicted'], margins = True))  
    
    print("-----------------------------------------")
    print(classification_report(test_y, test_y_hat))
    
    print('Macro F1 Score: {:.2f}'.format(f1_score(test_y_hat, test_y, average = 'macro')))  

    print('Weighted F1 Score: {:.2f}'.format(f1_score(test_y_hat, test_y, average = 'weighted'))) 
    joblib.dump(pipe, "RF_"+"train_model.m")

# MLP

In [22]:
def MLP(df, feature, ngram, sample_method):    
    random.seed(888)
        
    if feature == "TF":
        vector = CountVectorizer(analyzer = 'word', ngram_range=(1, ngram)) 
    elif feature == "TFIDF":        
        vector = TfidfVectorizer(analyzer = 'word', ngram_range=(1, ngram))
    
    train_X, test_X, train_y, test_y = train_test_split(df, df['label'], test_size = 0.2, random_state = 123)

    if sample_method == "undersampling":
        train_X = undersampling(train_X)
    
    elif sample_method == "oversampling":    
        train_X = oversampling(train_X)               
 
    pipe = make_pipeline(vector, MLPClassifier( solver='lbfgs', random_state = 0,hidden_layer_sizes=[100]))
    pipe.fit(train_X['processed_text'], train_X['label'])     
    
    test_y_hat = pipe.predict(test_X['processed_text'])
        
    df_result = test_X.copy()
    df_result['prediction'] = test_y_hat.tolist()   
    
    df_prob = pd.DataFrame(pipe.predict_proba(test_X['processed_text']), columns = pipe.classes_)
    df_prob.index = df_result.index
    df_prob.columns = ['probability_negative', 'Probability_neutral', 'probability_positive']

    df_final = pd.concat([df_result, df_prob], axis = 1)
    
     
    
    print("-----------------------------------------")
    print("MLP classification report -- ", "feature: %s/" %feature, "ngram: %d/" %ngram, "sample_method: %s/" %sample_method)
    print(pd.crosstab(test_y.ravel(), test_y_hat, rownames = ['True'], colnames = ['Predicted'], margins = True))  
    
    print("-----------------------------------------")
    print(classification_report(test_y, test_y_hat))
    
    print('Macro F1 Score: {:.2f}'.format(f1_score(test_y_hat, test_y, average = 'macro')))  
    print('Weighted F1 Score: {:.2f}'.format(f1_score(test_y_hat, test_y, average = 'weighted'))) 
    joblib.dump(pipe, "mlp_"+"train_model.m")

In [23]:
def MLP(df, feature, ngram, sample_method):    
    random.seed(888)
        
    if feature == "TF":
        vector = CountVectorizer(analyzer = 'word', ngram_range=(1, ngram)) 
    elif feature == "TFIDF":        
        vector = TfidfVectorizer(analyzer = 'word', ngram_range=(1, ngram))
    
    train_X, test_X, train_y, test_y = train_test_split(df, df['label'], test_size = 0.2, random_state = 123)

    if sample_method == "undersampling":
        train_X = undersampling(train_X)
    
    elif sample_method == "oversampling":    
        train_X = oversampling(train_X)               
 
    pipe = make_pipeline(vector, MLPClassifier( solver='lbfgs',activation='tanh', random_state = 0,hidden_layer_sizes=[100]))
    pipe.fit(train_X['processed_text'], train_X['label'])    
    
    test_y_hat = pipe.predict(test_X['processed_text'])
        
    df_result = test_X.copy()
    df_result['prediction'] = test_y_hat.tolist()   
    
    df_prob = pd.DataFrame(pipe.predict_proba(test_X['processed_text']), columns = pipe.classes_)
    df_prob.index = df_result.index
    df_prob.columns = ['probability_negative', 'Probability_neutral', 'probability_positive']

    df_final = pd.concat([df_result, df_prob], axis = 1)
    
#     file_name = 'SVM_' + str(ngram) + '_' + sample_method 
#     df_final.to_csv(file_name + '.csv')       
    
    print("-----------------------------------------")
    print("MLP classification report -- ", "feature: %s/" %feature, "ngram: %d/" %ngram, "sample_method: %s/" %sample_method)
    print(pd.crosstab(test_y.ravel(), test_y_hat, rownames = ['True'], colnames = ['Predicted'], margins = True))  
    
    print("-----------------------------------------")
    print(classification_report(test_y, test_y_hat))
    
    print('Macro F1 Score: {:.2f}'.format(f1_score(test_y_hat, test_y, average = 'macro')))  
    print('Weighted F1 Score: {:.2f}'.format(f1_score(test_y_hat, test_y, average = 'weighted'))) 
    joblib.dump(pipe, "mlp_"+"train_model.m")

In [15]:
def main():    
    
    airline_model = airline_kaggle.loc[:, ['processed_text', 'label']]
    #airline_model.to_csv('airline_model.csv')  
            
    # Naive Bayes. Arguments: dataframe, TF/IFIDF, unigran or ngram, data-balancing method   
    
    for i in range(1,4):
        airline_NB(airline_model, "TFIDF", i, "none")  
        airline_NB(airline_model, "TFIDF", i, "oversampling")  
        airline_NB(airline_model, "TFIDF", i, "undersampling")    
    
    # SVM. Arguments: dataframe, TF/IFIDF, unigran or ngram, data-balancing method    
    for i in range(1,4):
        airline_SVM(airline_model, "TFIDF", i, "none")  
        airline_SVM(airline_model, "TFIDF", i, "oversampling")  
        airline_SVM(airline_model, "TFIDF", i, "undersampling")  
        
    # RandomForest : Arguments: dataframe, TF/IFIDF, unigran or ngram, data-balancing method
    for i in range(1,4):
        airline_random_forest(airline_model, "TFIDF", i, "none")  
        airline_random_forest(airline_model, "TFIDF", i, "oversampling")  
        airline_random_forest(airline_model, "TFIDF",i, "undersampling") 
    
    # MLP : : Arguments: dataframe, TF/IFIDF, unigran or ngram, data-balancing method
    for i in range(1,4):
        MLP(airline_model, "TFIDF", i, "none")  
        MLP(airline_model, "TFIDF", i, "oversampling")  
        MLP(airline_model, "TFIDF",i, "undersampling") 
    

In [None]:
#Appliquer le modèle obtenu, faire la prédiction sur le dataset de scrappe et stock le résultat en CSV
clf = joblib.load("train_model.m")
airline_scrappe["label"] = clf.predict(airline_scrappe["processed_text"])
airline_scrappe.to_csv("airline_scrappe_avec_label.csv")