In [1]:
import time
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [2]:
df_faux = pd.read_csv('Fake.csv')
df_vrai = pd.read_csv('True.csv')
df_faux['label'] = 1
df_vrai['label'] = 0

df_final = pd.concat([df_faux, df_vrai], axis=0)
df_final.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [3]:
df_final['label'].value_counts()

1    23481
0    21417
Name: label, dtype: int64

In [4]:
df_final['all_text'] = df_final['title'] + df_final['text']

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ibrahim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
def tokenize_text(c):
    tokens = nltk.word_tokenize(c)
    return [w for w in tokens if w.isalpha()] 
df_final['text_tokenized'] = df_final.apply(lambda x: tokenize_text(x['all_text']), axis=1)
df_final[['title', 'text_tokenized']].head()

Unnamed: 0,title,text_tokenized
0,Donald Trump Sends Out Embarrassing New Year’...,"[Donald, Trump, Sends, Out, Embarrassing, New,..."
1,Drunk Bragging Trump Staffer Started Russian ...,"[Drunk, Bragging, Trump, Staffer, Started, Rus..."
2,Sheriff David Clarke Becomes An Internet Joke...,"[Sheriff, David, Clarke, Becomes, An, Internet..."
3,Trump Is So Obsessed He Even Has Obama’s Name...,"[Trump, Is, So, Obsessed, He, Even, Has, Obama..."
4,Pope Francis Just Called Out Donald Trump Dur...,"[Pope, Francis, Just, Called, Out, Donald, Tru..."


In [7]:
def _to_features(df_final, col):
    df_final[col] = df_final[col].replace('!', ' exclamation ')
    df_final[col] = df_final[col].replace('?', ' question ')
    df_final[col] = df_final[col].replace('\'', ' quotation ')
    df_final[col] = df_final[col].replace('\"', ' quotation ')
    return df_final[col]
df_final['all_text'] = _to_features(df_final, 'all_text')

In [8]:
nltk.download('stopwords');


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ibrahim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def remove_stopwords(tokenized_col):
    stops_words = set(stopwords.words("english"))
    return [word for word in tokenized_col if not word in stops_words]

In [10]:
df_final['_removed'] = df_final.apply(lambda x: remove_stopwords(x['text_tokenized']), axis=1)
df_final[['title', '_removed']].head()

Unnamed: 0,title,_removed
0,Donald Trump Sends Out Embarrassing New Year’...,"[Donald, Trump, Sends, Out, Embarrassing, New,..."
1,Drunk Bragging Trump Staffer Started Russian ...,"[Drunk, Bragging, Trump, Staffer, Started, Rus..."
2,Sheriff David Clarke Becomes An Internet Joke...,"[Sheriff, David, Clarke, Becomes, An, Internet..."
3,Trump Is So Obsessed He Even Has Obama’s Name...,"[Trump, Is, So, Obsessed, He, Even, Has, Obama..."
4,Pope Francis Just Called Out Donald Trump Dur...,"[Pope, Francis, Just, Called, Out, Donald, Tru..."


In [11]:
def apply_stemming(tokenized_col):
    stemmer = PorterStemmer() 
    return [stemmer.stem(word).lower() for word in tokenized_col]
df_final['_stemmed'] = df_final.apply(lambda x: apply_stemming(x['_removed']), axis=1)
df_final[['title', '_stemmed']].head()

Unnamed: 0,title,_stemmed
0,Donald Trump Sends Out Embarrassing New Year’...,"[donald, trump, send, out, embarrass, new, yea..."
1,Drunk Bragging Trump Staffer Started Russian ...,"[drunk, brag, trump, staffer, start, russian, ..."
2,Sheriff David Clarke Becomes An Internet Joke...,"[sheriff, david, clark, becom, an, internet, j..."
3,Trump Is So Obsessed He Even Has Obama’s Name...,"[trump, is, so, obsess, he, even, ha, obama, n..."
4,Pope Francis Just Called Out Donald Trump Dur...,"[pope, franci, just, call, out, donald, trump,..."


In [12]:
def rejoin_words(tokenized_col):
    return ( " ".join(tokenized_col))

df_final['all_text'] = df_final.apply(lambda x: rejoin_words(x['_stemmed']), axis=1)
df_final[['title', 'all_text']].head()

Unnamed: 0,title,all_text
0,Donald Trump Sends Out Embarrassing New Year’...,donald trump send out embarrass new year eve m...
1,Drunk Bragging Trump Staffer Started Russian ...,drunk brag trump staffer start russian collus ...
2,Sheriff David Clarke Becomes An Internet Joke...,sheriff david clark becom an internet joke for...
3,Trump Is So Obsessed He Even Has Obama’s Name...,trump is so obsess he even ha obama name code ...
4,Pope Francis Just Called Out Donald Trump Dur...,pope franci just call out donald trump dure hi...


In [13]:
X = df_final['all_text']
y = df_final['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2, shuffle=True)

In [14]:
def go_inside_pipeline(X, model):
    """Retourne un pipeline pour prétraitement des données et les regrouper avec un modèle.
    
    Arguments:
        X : donnee apprentissaqge 
        model (object): scikit-learn model object
    
    Retour: 
    les  etapes du pipeline. 
    """
    
    steps_pipeline = imbpipeline(steps=[
        ("tfidf", TfidfVectorizer()),
        
        ('model', model)
    ])
    
    return steps_pipeline

In [15]:
model = XGBClassifier()
test_pipeline = go_inside_pipeline(X_train, model)
test_pipeline

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('model',
                 XGBClassifier(base_score=None, booster=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None, enable_categorical=False,
                               gamma=None, gpu_id=None, importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_delta_step=None, max_depth=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, n_estimators=100,
                               n_jobs=None, num_parallel_tree=None,
                               predictor=None, random_state=None,
                               reg_alpha=None, reg_lambda=None,
                               scale_pos_weight=None, subsample=None,
                               tree_method=None, validate_parameters=N

In [16]:
def selectionner_le_meilleur_model(X, y, pipeline=None):
    """Teste un ensemble de  classifieurs and retourne leur performance sur la donnee d'apprentissage.
    
    Arguments:
        X (type object): un dataframe contenant donnee apprentissage. 
        y (type object): un dataframe contenant label d'apprentissage. 
        pipeline (object): Pipeline sortant de go_inside_pipeline().

    Sortie:
        df (object):un dataframe contenant les performances du model et les resul. 
    """
    all_classifiers = {}
    all_classifiers.update({"DummyClassifier": DummyClassifier(strategy='most_frequent')})
    all_classifiers.update({"KNeighborsClassifier": KNeighborsClassifier()})
    all_classifiers.update({"RidgeClassifier": RidgeClassifier()})
    all_classifiers.update({"SGDClassifier": SGDClassifier()})
    all_classifiers.update({"BaggingClassifier": BaggingClassifier()})
    all_classifiers.update({"BernoulliNB": BernoulliNB()})
    all_classifiers.update({"SVC": SVC()})
    all_classifiers.update({"CatBoostClassifier":CatBoostClassifier(silent=True)})
    all_classifiers.update({"XGBClassifier": XGBClassifier(use_label_encoder=False, 
                                                       eval_metric='logloss',
                                                       objective='binary:logistic',
                                                      )})
    all_classifiers.update({"LGBMClassifier": LGBMClassifier()})
    all_classifiers.update({"RandomForestClassifier": RandomForestClassifier()})
    all_classifiers.update({"DecisionTreeClassifier": DecisionTreeClassifier()})
    all_classifiers.update({"ExtraTreeClassifier": ExtraTreeClassifier()})
    all_classifiers.update({"ExtraTreesClassifier": ExtraTreeClassifier()})    
    all_classifiers.update({"AdaBoostClassifier": AdaBoostClassifier()})
    #all_classifiers.update({"MLPClassifier1":MLPClassifier()})
    #all_classifiers.update({"MLPClassifier2":MLPClassifier(hidden_layer_sizes=(27, 50),
    #                                                          max_iter=300,
    #                                                          activation='relu',
    #                                                          solver='adam',
    #                                                          random_state=1)})  
    models = []
    models.append(('XGBClassifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', objective='binary:logistic')))
    models.append(('CatBoostClassifier', CatBoostClassifier(silent=True)))
    models.append(('BaggingClassifier', BaggingClassifier()))
    all_classifiers.update({"VotingClassifier (XGBClassifier, CatBoostClassifier, BaggingClassifier)": VotingClassifier(models)})
    
    
    models = []
    models.append(('XGBClassifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', objective='binary:logistic')))
    models.append(('CatBoostClassifier', CatBoostClassifier(silent=True)))
    all_classifiers.update({"VotingClassifier (XGBClassifier, CatBoostClassifier)": VotingClassifier(models)})
    
    models = []
    models.append(('XGBClassifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', objective='binary:logistic')))
    models.append(('LGBMClassifier', LGBMClassifier()))
    all_classifiers.update({"VotingClassifier (XGBClassifier, LGBMClassifier)": VotingClassifier(models)})
    
    models = []
    models.append(('LGBMClassifier', LGBMClassifier()))
    models.append(('CatBoostClassifier', CatBoostClassifier(silent=True)))
    all_classifiers.update({"VotingClassifier (LGBMClassifier, CatBoostClassifier)": VotingClassifier(models)})
    models=[]
    #voting1=VotingClassifier(estimators=[('LGBMClassifier', LGBMClassifier()), 
    #                                     ('CatBoostClassifier', CatBoostClassifier(silent=True))])
    #voting2=VotingClassifier(estimators=[('XGBClassifier', XGBClassifier(use_label_encoder=False,eval_metric='logloss',
    #                                                                     objective='binary:logistic'),('LGBMClassifier',LGBMClassifier()))
    #                                     ])
    #voting3=VotingClassifier(estimators=[
        #('XGBClassifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss',objective='binary:logistic')),
        #('LGBMClassifier', LGBMClassifier()),
        #('CatBoostClassifier', CatBoostClassifier(silent=True))])
    #models.append(('voting1',voting1))
    #models.append(('voting2',voting2))
    #models.append(('voting3',voting3))
    #all_classifiers.update({"ensemblemodel": VotingClassifier(models)})
    
    models = []
    models.append(('XGBClassifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', objective='binary:logistic')))
    models.append(('LGBMClassifier', LGBMClassifier()))
    models.append(('CatBoostClassifier', CatBoostClassifier(silent=True)))
    all_classifiers.update({"VotingClassifier (XGBClassifier, LGBMClassifier, CatBoostClassifier)": VotingClassifier(models)})
    
    models = []
    models.append(('XGBClassifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', objective='binary:logistic')))
    models.append(('RandomForestClassifier', RandomForestClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    all_classifiers.update({"VotingClassifier (XGBClassifier, RandomForestClassifier, DecisionTreeClassifier)": VotingClassifier(models)})
    
    models = []
    models.append(('XGBClassifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', objective='binary:logistic')))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('ExtraTreeClassifier', ExtraTreeClassifier()))
    all_classifiers.update({"VotingClassifier (XGBClassifier, AdaBoostClassifier, ExtraTreeClassifier)": VotingClassifier(models)})
    
    
    models = []
    models.append(('XGBClassifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', objective='binary:logistic')))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier()))
    all_classifiers.update({"VotingClassifier (XGBClassifier, ExtraTreesClassifier)": VotingClassifier(models)}) 
    i=0
    
    dataframe_models = pd.DataFrame(columns=['Model', 'Time', 'Accuracy'])
    for cle in all_classifiers:
        debut = time.time()
        print(i+1)

        pipeline = go_inside_pipeline(X_train, all_classifiers[cle])
        
        cv = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
        

        row = {'Model': cle,
               'Time': format(round((time.time() -debut)/60,2)),
               'Accuracy': cv.mean(),
        }

        dataframe_models  = dataframe_models.append(row, ignore_index=True)
        
    dataframe_models  = dataframe_models.sort_values(by='Accuracy', ascending=False)
    return dataframe_models
models = selectionner_le_meilleur_model(X_train, y_train)

In [19]:
models.head(23)

Unnamed: 0,Model,Time,Accuracy
20,"VotingClassifier (XGBClassifier, RandomForestC...",18.63,0.997298
8,XGBClassifier,12.68,0.997149
19,"VotingClassifier (XGBClassifier, LGBMClassifie...",330.6,0.99703
17,"VotingClassifier (XGBClassifier, LGBMClassifier)",27.27,0.997001
9,LGBMClassifier,12.31,0.997001
21,"VotingClassifier (XGBClassifier, AdaBoostClass...",14.36,0.996822
15,"VotingClassifier (XGBClassifier, CatBoostClass...",343.68,0.996763
16,"VotingClassifier (XGBClassifier, CatBoostClass...",367.46,0.996318
4,BaggingClassifier,15.4,0.996288
18,"VotingClassifier (LGBMClassifier, CatBoostClas...",370.05,0.996288


In [18]:
group_models= []
#group_models.append(('XGBClassifier',XGBClassifier(use_label_encoder=False, eval_metric='logloss', objective='binary:logistic')))
group_models.append(('LGBMClassifier', LGBMClassifier()))
group_models.append(('CatBoostClassifier', CatBoostClassifier(silent=True)))
#group_models.append(('BaggingClassifier', BaggingClassifier()))
stacked_model = VotingClassifier(group_models)
sortie_pipeline=features_names = go_inside_pipeline(X_train, stacked_model)
sortie_pipeline.fit(X_train, y_train)
#with open('model1.pkl', 'wb') as f:
#    pickle.dump(sortie_pipeline, f)
y_pred = sortie_pipeline.predict(X_test)
roc_auc = roc_auc_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
#f1_score = f1_score(y_test, y_pred)
print('ROC/AUC:', roc_auc)
print('Accuracy:', accuracy)
print('*****************************')
print(classification_report(y_test, y_pred))

ROC/AUC: 0.9972712792956785
Accuracy: 0.9972383073496659
*****************************
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5335
           1       1.00      1.00      1.00      5890

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225

