In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import feature_extraction, model_selection, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import glob 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import random
from string import punctuation
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))
stop.update(list(punctuation))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Some parts of progressbar from tqdm will not be seen on github, it will look like -> HBox(children=......)

In [0]:
from tqdm.auto import tqdm
tqdm.pandas()

Get Datasets

In [0]:
def get_datasets(shuffle=False):
    df = pd.DataFrame()
    for file in tqdm(glob.glob('Datasets/*/*_*.csv')):
        df = df.append(pd.read_csv(file), ignore_index=True)
    if shuffle:
    	#df = df.sample(frac=1).reset_index(drop=True)
    	df = df.reindex(np.random.permutation(df.index)).reset_index(drop=True)
    return df

As seen in Analysis.ipynb, there are more 0's than 1's So normalize keeping some difference

In [0]:
def normalize(df,difference=300):
    df_list = [df[df['fake']==0],df[df['fake']==1]] # [fact,fake]
    lst = [len(df_list[0]),len(df_list[1])]
    if not bool(lst.index(min(lst))):
        """ if fake is larger than swap """
        df_list[0], df_list[1] = df_list[1], df_list[0]
    size = len(df_list[0]) - len(df_list[1]) + difference
    to_delete = random.sample(range(0, len(df_list[0])), size)
    df_list[0] = df_list[0].drop(df_list[0].index[to_delete])
    #print("FACT: {} FAKE:{} Difference:{}".format(len(df_list[0]),len(df_list[1]),len(df_list[1])-len(df_list[0])))
    return df_list[0].append(df_list[1], ignore_index=True).sample(frac=1)

Cleaning Text, removing html tags, url's, digits stopwords, them lemmatizing them

In [0]:
def text_clean(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = " ".join(x for x in word_tokenize(str(text)) if x.strip().lower() not in stop)
    text = re.sub(r'\d+', '', text)
    text = " ".join(lemmatizer.lemmatize(x.lower()) for x in text.split())
    return(text)

In [0]:
def pre_process():
    df = normalize(get_datasets(shuffle=True))
    print("Normalized")
    df = df.replace(np.nan, '', regex=True)
    df['news'] = df['title'].str.cat(df['text'],sep=" ")
    print("Cleaning")
    df['news'] = df['news'].progress_apply(text_clean)
    df = df.drop(['title','text'], axis=1)
    return df

In [8]:
df = pre_process()

HBox(children=(FloatProgress(value=0.0, max=83.0), HTML(value='')))


Normalized
Cleaning


HBox(children=(FloatProgress(value=0.0, max=72660.0), HTML(value='')))




In [0]:
x_train,x_test,y_train,y_test = train_test_split(df['news'].values, df['fake'].values, test_size=0.2, random_state=2020)

In [0]:
def pipeline_model(pipe, x_train, y_train, x_test, y_test):
    model = pipe.fit(x_train, y_train)
    prediction = model.predict(x_test)
    print("accuracy: {}".format(round(accuracy_score(y_test, prediction)*100,2)))

The text is first converted to a count vectorizer then TFIDF (Term Frequency Inverse Document Frequency)

<h3>Logistic Regression</h3>

In [20]:
pipeline_model(Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', LogisticRegression())]),
                   x_train, y_train, x_test, y_test)

accuracy: 93.48


<h3>Linear Support Vector Classification</h3>

In [21]:
pipeline_model(Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', LinearSVC())]),
                  x_train, y_train, x_test, y_test)

accuracy: 95.08


Using Grid Search CV to get best parameters fot Linear SVC

In [22]:
from sklearn.model_selection import GridSearchCV

parameters = {'multi_class': ['ovr','crammer_singer'], 
              'tol': [1e-3, 1e-4],
              #'dual': [True,False], 
              'loss': ['hinge','squared_hinge'],
              'penalty':['l1','l2'], 
              'C': [1, 10]} #, 00, 1000]}

pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', GridSearchCV(LinearSVC(),parameters, refit = True, verbose=2))])

model = pipe.fit(x_train,y_train)
model.named_steps['model'].best_params_

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.001 .........
[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.001, total=   0.1s
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.001 .........
[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.001, total=   0.1s
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.001 .........
[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.001, total=   0.1s
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.001 .........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ValueError: Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=True

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.001, total=   0.1s
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.001 .........
[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.001, total=   0.1s
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.0001 ........
[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.0001, total=   0.1s
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.0001 ........
[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.0001, total=   0.1s
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.0001 ........
[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.0001, total=   0.1s
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.0001 ........
[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.0001, total=   0.1s
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.0001 ........
[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l1, tol=0.0001, total=   0.1s
[CV] C=1, loss=hinge, multi_cl



[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l2, tol=0.0001, total=   9.4s
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l2, tol=0.0001 ........
[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l2, tol=0.0001, total=   9.2s
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l2, tol=0.0001 ........
[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l2, tol=0.0001, total=   7.7s
[CV] C=1, loss=hinge, multi_class=ovr, penalty=l2, tol=0.0001 ........
[CV]  C=1, loss=hinge, multi_class=ovr, penalty=l2, tol=0.0001, total=   8.6s
[CV] C=1, loss=hinge, multi_class=crammer_singer, penalty=l1, tol=0.001 
[CV]  C=1, loss=hinge, multi_class=crammer_singer, penalty=l1, tol=0.001, total=  18.7s
[CV] C=1, loss=hinge, multi_class=crammer_singer, penalty=l1, tol=0.001 
[CV]  C=1, loss=hinge, multi_class=crammer_singer, penalty=l1, tol=0.001, total=  12.2s
[CV] C=1, loss=hinge, multi_class=crammer_singer, penalty=l1, tol=0.001 
[CV]  C=1, loss=hinge, multi_class=crammer_singer, penalty=l1, tol=0.001, total=

ValueError: Unsupported set of arguments: The combination of penalty='l1' and loss='squared_hinge' are not supported when dual=True, Parameters: penalty='l1', loss='squared_hinge', dual=True



[CV]  C=1, loss=squared_hinge, multi_class=ovr, penalty=l1, tol=0.001, total=   0.0s
[CV] C=1, loss=squared_hinge, multi_class=ovr, penalty=l1, tol=0.0001 
[CV]  C=1, loss=squared_hinge, multi_class=ovr, penalty=l1, tol=0.0001, total=   0.0s
[CV] C=1, loss=squared_hinge, multi_class=ovr, penalty=l1, tol=0.0001 
[CV]  C=1, loss=squared_hinge, multi_class=ovr, penalty=l1, tol=0.0001, total=   0.0s
[CV] C=1, loss=squared_hinge, multi_class=ovr, penalty=l1, tol=0.0001 
[CV]  C=1, loss=squared_hinge, multi_class=ovr, penalty=l1, tol=0.0001, total=   0.0s
[CV] C=1, loss=squared_hinge, multi_class=ovr, penalty=l1, tol=0.0001 
[CV]  C=1, loss=squared_hinge, multi_class=ovr, penalty=l1, tol=0.0001, total=   0.0s
[CV] C=1, loss=squared_hinge, multi_class=ovr, penalty=l1, tol=0.0001 
[CV]  C=1, loss=squared_hinge, multi_class=ovr, penalty=l1, tol=0.0001, total=   0.0s
[CV] C=1, loss=squared_hinge, multi_class=ovr, penalty=l2, tol=0.001 .
[CV]  C=1, loss=squared_hinge, multi_class=ovr, penalty=l2,

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed: 98.8min finished


AttributeError: ignored

In [32]:
model.named_steps['model'].best_params_

{'C': 1,
 'loss': 'squared_hinge',
 'multi_class': 'ovr',
 'penalty': 'l2',
 'tol': 0.001}

Manually finding Best parameters for XG Boost (Ideally should have used Gridsearch CV but it took lot of time)

In [0]:
from xgboost import XGBClassifier

for d in [1,5,10,20,50]:
    for n in [1,5,10,20,50]:
        for l in [0.001,0.01,0.05,0.1,0.5,1,2]:
            print("d:{},n:{},l:{}".format(d,n,l))
            pipeline_model(Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('model', XGBClassifier(loss = 'deviance',
                                              learning_rate = l,
                                                   n_estimators = n,
                                                   max_depth = d,
                                                   random_state=2020))]),
                        x_train, y_train, x_test, y_test)

d:1,n:1,l:0.001
accuracy: 64.22
d:1,n:1,l:0.01
accuracy: 64.22
d:1,n:1,l:0.05
accuracy: 64.22
d:1,n:1,l:0.1
accuracy: 64.22
d:1,n:1,l:0.5
accuracy: 64.22
d:1,n:1,l:1
accuracy: 64.22
d:1,n:1,l:2
accuracy: 64.22
d:1,n:5,l:0.001
accuracy: 64.22
d:1,n:5,l:0.01
accuracy: 64.4
d:1,n:5,l:0.05
accuracy: 70.92
d:1,n:5,l:0.1
accuracy: 71.04
d:1,n:5,l:0.5
accuracy: 75.87
d:1,n:5,l:1
accuracy: 77.45
d:1,n:5,l:2
accuracy: 61.65
d:1,n:10,l:0.001
accuracy: 64.4
d:1,n:10,l:0.01
accuracy: 64.4
d:1,n:10,l:0.05
accuracy: 71.04
d:1,n:10,l:0.1
accuracy: 76.16
d:1,n:10,l:0.5
accuracy: 79.0
d:1,n:10,l:1
accuracy: 80.37
d:1,n:10,l:2
accuracy: 50.06
d:1,n:20,l:0.001
accuracy: 64.4
d:1,n:20,l:0.01
accuracy: 70.92
d:1,n:20,l:0.05
accuracy: 76.2
d:1,n:20,l:0.1
accuracy: 76.63
d:1,n:20,l:0.5
accuracy: 84.37
d:1,n:20,l:1
accuracy: 85.47
d:1,n:20,l:2
accuracy: 50.09
d:1,n:50,l:0.001
accuracy: 64.4
d:1,n:50,l:0.01
accuracy: 71.04
d:1,n:50,l:0.05
accuracy: 76.79
d:1,n:50,l:0.1
accuracy: 79.59
d:1,n:50,l:0.5
accuracy: 

Best Accuracy Compared to other methods hence further tired using XGBoost with tokenizing texts in XGBoost.ipynb

In [26]:
from xgboost import XGBClassifier
d, n, l = 10, 50, 0.5
pipeline_model(Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('model', XGBClassifier(loss = 'deviance',
                                                learning_rate = l,
                                                n_estimators = n,
                                                max_depth = d,
                                                random_state=2020))]),
                        x_train, y_train, x_test, y_test)

accuracy: 95.27


Tried using LightGBM, Light Gradient Boosting Machine

In [33]:
import lightgbm as lgbm


vec = CountVectorizer()
Xtrain = vec.fit_transform(x_train)
Xtest = vec.fit_transform(x_test)
tfidf = TfidfTransformer()
Xtrain = tfidf.fit_transform(Xtrain)
Xtest = tfidf.fit_transform(Xtest)


train_d = lgbm.Dataset(Xtrain, y_train)
eval_d = lgbm.Dataset(Xtest, y_test)

parameters = {
    'objective': 'binary',
    'metric':['accuracy','auc'],
    'is_unbalance': 'true',
    'feature_fraction': 0.8,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'num_threads' : 2,
    'seed' : 76
}

m = lgbm.train(parameters,
                train_d,
                num_boost_round=40,
                valid_sets=eval_d,
                early_stopping_rounds=40)#20)

[1]	valid_0's auc: 0.500389
Training until validation scores don't improve for 40 rounds.
[2]	valid_0's auc: 0.502419
[3]	valid_0's auc: 0.502351
[4]	valid_0's auc: 0.50235
[5]	valid_0's auc: 0.501801
[6]	valid_0's auc: 0.501733
[7]	valid_0's auc: 0.501734
[8]	valid_0's auc: 0.50187
[9]	valid_0's auc: 0.501183
[10]	valid_0's auc: 0.501049
[11]	valid_0's auc: 0.501049
[12]	valid_0's auc: 0.500844
[13]	valid_0's auc: 0.500844
[14]	valid_0's auc: 0.500844
[15]	valid_0's auc: 0.500777
[16]	valid_0's auc: 0.500778
[17]	valid_0's auc: 0.50084
[18]	valid_0's auc: 0.500905
[19]	valid_0's auc: 0.500905
[20]	valid_0's auc: 0.500705
[21]	valid_0's auc: 0.500637
[22]	valid_0's auc: 0.500633
[23]	valid_0's auc: 0.50063
[24]	valid_0's auc: 0.500619
[25]	valid_0's auc: 0.503528
[26]	valid_0's auc: 0.503635
[27]	valid_0's auc: 0.504359
[28]	valid_0's auc: 0.504352
[29]	valid_0's auc: 0.504344
[30]	valid_0's auc: 0.508907
[31]	valid_0's auc: 0.513253
[32]	valid_0's auc: 0.51278
[33]	valid_0's auc: 0.51