In [34]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import feature_extraction, model_selection, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import glob 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import random
from string import punctuation
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))
stop.update(list(punctuation))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mohnishdevadiga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mohnishdevadiga/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mohnishdevadiga/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 250

In [2]:
from tqdm.auto import tqdm
tqdm.pandas()

  from pandas import Panel


In [13]:
def get_datasets(shuffle=False,processed=False):
    df = pd.DataFrame()
    path = 'Datasets/*/*_*.csv'
    if processed:
        path = 'processed_dataset/*.csv'
    for file in tqdm(glob.glob(path)):
        df = df.append(pd.read_csv(file), ignore_index=True)
    if shuffle:
    	df = df.reindex(np.random.permutation(df.index)).reset_index(drop=True)
    return df

In [4]:
def normalize(df,difference=300):
    df_list = [df[df['fake']==0],df[df['fake']==1]] # [fact,fake]
    lst = [len(df_list[0]),len(df_list[1])]
    if not bool(lst.index(min(lst))):
        """ if fake is larger then swap """
        df_list[0], df_list[1] = df_list[1], df_list[0]
    size = len(df_list[0]) - len(df_list[1]) + difference
    to_delete = random.sample(range(0, len(df_list[0])), size)
    df_list[0] = df_list[0].drop(df_list[0].index[to_delete])
    return df_list[0].append(df_list[1], ignore_index=True).sample(frac=1)

In [5]:
def text_clean(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = " ".join(x for x in word_tokenize(str(text)) if x.strip().lower() not in stop)
    text = re.sub(r'\d+', '', text)
    text = " ".join(lemmatizer.lemmatize(x.lower()) for x in text.split())
    return(text)

In [6]:
def pre_process(norm=True):
    if norm:
        df = normalize(get_datasets())
        print("Normalized")
    else:
        df = get_datasets(shuffle=True)
    df = df.replace(np.nan, '', regex=True)
    df['news'] = df['title'].str.cat(df['text'],sep=" ")
    print("Cleaning")
    df['news'] = df['news'].progress_apply(text_clean)
    df = df.drop(['title','text'], axis=1)
    return df

In [7]:
def format_data(df, train=True, tokenizer=None):
    x = df["news"].values
    if train:
        y = df['fake'].values
    if not tokenizer:
        tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\'', lower=True)
        tokenizer.fit_on_texts(x)
        print('Found %s unique tokens.' % len(tokenizer.word_index))
    x = tokenizer.texts_to_sequences(x)
    x = pad_sequences(x, maxlen=MAX_SEQUENCE_LENGTH)
    if not train:
        return x, tokenizer
    return x, y, tokenizer

In [8]:
df = pre_process(norm=False)
df.head()

HBox(children=(FloatProgress(value=0.0, max=83.0), HTML(value='')))


Cleaning


HBox(children=(FloatProgress(value=0.0, max=209367.0), HTML(value='')))




Unnamed: 0,fake,news
0,0,germ live decade distilled water kill human ho...
1,0,u.s. tax avoidance clampdown potential headach...
2,1,nunes admits secretly wh right magically found...
3,0,actor life -square-foot apartment six year ago...
4,0,geek think rotten tomato mean superhero movie ...


Saving the processed file into chunks

In [12]:
i=0
chunk_df = np.array_split(df, 30)
for chunk in chunk_df:
    chunk.to_csv('processed_dataset/{}.csv'.format(i),index=False)
    i+=1

In [29]:
X, y, tokenizer = format_data(df)

Found 410973 unique tokens.


In [30]:
x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

In [107]:
from xgboost import XGBClassifier
d, n, l = 10, 50, 0.5
xgb_clf = xgb.XGBClassifier(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xgb_clf.fit(x_train,y_train)

XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
              nthread=None, objective='reg:squarederror', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

Accuracy and Classifier Report

In [131]:
print(accuracy_score(y_test, xgb_clf.predict(x_test)))
print(classification_report(y_test, xgb_clf.predict(x_test)))

0.8402827530209677
              precision    recall  f1-score   support

           0       0.84      0.99      0.91     34764
           1       0.74      0.09      0.16      7110

    accuracy                           0.84     41874
   macro avg       0.79      0.54      0.54     41874
weighted avg       0.83      0.84      0.78     41874

