## Fake News Detection

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [2]:
path = "/home/liben/Liben/Desktop/5th year 1st semester/big data modelling/final_project/archive/"
true_df = pd.read_csv(path + 'True.csv')
fake_df = pd.read_csv(path + 'Fake.csv')

In [3]:
true_df['label'] = 0

In [4]:
fake_df['label'] = 1

In [5]:
true_df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [6]:
fake_df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [7]:
true_df = true_df[['text','label']]
fake_df = fake_df[['text','label']]

In [8]:
dataset = pd.concat([true_df , fake_df])

In [9]:
dataset.shape

(44898, 2)

### Null values

In [10]:
dataset.isnull().sum() # no null values

text     0
label    0
dtype: int64

### Balanced or Unbalanced dataset

In [11]:
dataset['label'].value_counts()

1    23481
0    21417
Name: label, dtype: int64

In [12]:
true_df.shape # true news

(21417, 2)

In [13]:
fake_df.shape # fake news

(23481, 2)

### Shuffle or Resample

In [14]:
dataset = dataset.sample(frac = 1)

In [15]:
dataset.head()

Unnamed: 0,text,label
8769,(Reuters) - A Tennessee state lawmaker was acc...,0
1224,Donald Trump has brought more corruption into ...,1
20524,"MIAMI/TAMPA, Fla. (Reuters) - Walter Hodgdon s...",0
19898,It s amazing these leftist hacks haven t been ...,1
8102,"WINSTON-SALEM, N.C. (Reuters) - The mayor of C...",0


In [16]:
dataset.reset_index(inplace=True)
dataset.drop(["index"],axis = 1,inplace = True)

In [17]:
dataset.head()

Unnamed: 0,text,label
0,(Reuters) - A Tennessee state lawmaker was acc...,0
1,Donald Trump has brought more corruption into ...,1
2,"MIAMI/TAMPA, Fla. (Reuters) - Walter Hodgdon s...",0
3,It s amazing these leftist hacks haven t been ...,1
4,"WINSTON-SALEM, N.C. (Reuters) - The mayor of C...",0


### Creating a function to process the texts

In [18]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [19]:
dataset["text"] = dataset["text"].apply(wordopt)

### Splitting Training and Testing

In [20]:
x_train, x_test, y_train, y_test = train_test_split(dataset["text"], dataset["label"], test_size=0.25)

### Convert text to vectors

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [23]:
pred_lr=LR.predict(xv_test)

In [24]:
LR.score(xv_test, y_test)

0.9861915367483296

In [25]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5330
           1       0.99      0.98      0.99      5895

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



### Decision Tree Classification

In [26]:
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [27]:
pred_dt = DT.predict(xv_test)

In [28]:
DT.score(xv_test, y_test)

0.995456570155902

In [29]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5330
           1       0.99      1.00      1.00      5895

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225



### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

In [None]:
pred_gbc = GBC.predict(xv_test)

In [None]:
GBC.score(xv_test, y_test)

In [None]:
print(classification_report(y_test, pred_gbc))

### Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

In [36]:
pred_rfc = RFC.predict(xv_test)

In [37]:
RFC.score(xv_test, y_test)

0.9893986636971047

In [38]:
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5330
           1       0.99      0.99      0.99      5895

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [39]:
import pickle
pickle.dump( GBC, open( "model.pkl", "wb" ) )
pickle.dump( GBC, open( "model.bin", "wb" ) )

pickle.dump( vectorization, open("vectorizer.pkl","wb"))



In [40]:
    # Save the vectorizer
    vec_file = 'vectorizer.pickle'
    pickle.dump(count_vect, open(vec_file, 'wb'))

    # Save the model
    mod_file = 'classification.model'
    pickle.dump(model, open(mod_file, 'wb'))


NameError: name 'count_vect' is not defined

In [16]:
import nltk

In [17]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [18]:
lemmatizer = WordNetLemmatizer()

In [27]:
# nltk.download('stopwords')
stopwords = stopwords.words('english')

In [28]:
# nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/liben/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [29]:
def clean_data(text):
    text = text.lower() 
    row = re.sub('[^a-zA-Z]' , ' ' , text)
    token = row.split() 
    news = [lemmatizer.lemmatize(word) for word in token if not word in stopwords]  
    clean_news = ' '.join(news) 
    
    return clean_news 

In [30]:
dataset['text'] = dataset['text'].apply(lambda x : clean_data(x))

In [31]:
dataset.isnull().sum()

text     0
label    0
dtype: int64

In [32]:
dataset.head(20)

Unnamed: 0,text,label
10074,bikers trump announced support pro trump atten...,1
15967,bangkok reuters authority thailand southeaster...,0
6711,washington reuters elon musk chairman chief ex...,0
21099,dangerous self centered woman america hillary ...,1
9507,washington reuters white house friday defended...,0
1152,golden state warrior clenched nba champion cle...,1
3492,donald trump may decided russia going america ...,1
12123,bucharest reuters thousand romanian lined stre...,0
19039,sydney reuters second group refugee held austr...,0
7648,washington reuters newly discovered email rela...,0


In [33]:
from sklearn.model_selection import train_test_split
train_X , test_X , train_y , test_y = train_test_split(dataset['text'] , dataset['label'] , test_size = 0.2 ,random_state = 0)

In [34]:
print(train_X.to_frame().text)

2853     washington reuters u president donald trump su...
2834     may shock may trump delegating duty writing ex...
9104     noted individual came toward took picture hear...
6121     appears republican quite sure board donald tru...
19342    berlin frankfurt reuters germany guard last mi...
                               ...                        
2760     expect kid homework nominee top education post...
16141    curious ap reporting story senior u official r...
8323     comedian alison leiby tweeted pretty cute comm...
3053     washington reuters white house spokesman rule ...
20098    berlin reuters ukraine thursday welcomed russi...
Name: text, Length: 35918, dtype: object


In [35]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

count_vect = CountVectorizer(analyzer = "word")

count_vectorizer = count_vect.fit(dataset.text)

train_X_vector = count_vectorizer.transform(train_X)
test_X_vector = count_vectorizer.transform(test_X)

In [36]:
count_vect.get_feature_names()[:10]

['aa',
 'aaa',
 'aaaaaaaand',
 'aaaaackkk',
 'aaaaapkfhk',
 'aaaahhhh',
 'aaaand',
 'aaaarrgh',
 'aaab',
 'aaarf']

In [37]:
from sklearn.linear_model import LogisticRegression

# model = LogisticRegression()
# model.fit(train_X_vector,train_y)

In [38]:
from sklearn.pipeline import Pipeline

trigram_vectorizer = CountVectorizer(analyzer = "word", ngram_range=(1,3))
tf_idf = TfidfTransformer(norm="l2")
# classifier = LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
#           intercept_scaling=1, max_iter=100, multi_class='warn',
#           n_jobs=None, penalty='l2', random_state=None, solver='warn',
#           tol=0.0001, verbose=0, warm_start=False)
classifier = LogisticRegression()

pipeline = Pipeline([
     ('vect', trigram_vectorizer),
     ('tfidf', tf_idf),
     ('clf', classifier),
 ])


In [41]:
pipeline.fit(train_X, train_y)

Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 3))),
                ('tfidf', TfidfTransformer()), ('clf', LogisticRegression())])

In [None]:
print(test)

In [42]:
test_predictions = test_predictions = pipeline.predict(test_X)

In [43]:
test_predictions

array([1, 1, 1, ..., 1, 1, 0])

In [44]:
import pickle
pickle.dump( pipeline, open( "pipeline.pkl", "wb" ) )
pickle.dump( pipeline, open( "pipeline.bin", "wb" ) )

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(stop_words='english',max_features = 50000 , lowercase=False , ngram_range=(1,2))

In [None]:
#DataFlair - Fit and transform train set, transform test set
tfidf_train=vectorizer.fit_transform(train_X) 
tfidf_test=vectorizer.transform(test_X)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
from sklearn.metrics import accuracy_score,classification_report
clf = MultinomialNB()
clf.fit(tfidf_train, train_y)
predictions  = clf.predict(tfidf_test)

print(classification_report(test_y , predictions))

In [None]:
predictions_train = clf.predict(tfidf_train)
print(classification_report(train_y , predictions_train))

In [None]:
accuracy_score(train_y , predictions_train)

In [None]:
accuracy_score(test_y , predictions)

In [None]:
# Saving the trained model
import pickle

##dump the model into a file
with open("model.bin", 'wb') as f_out:
    pickle.dump(clf, f_out) # write final_model in .bin file
    f_out.close()  # close the file 

In [None]:
X = dataset.iloc[:35000,0]
y = dataset.iloc[:35000,1]

In [None]:
X.head()

In [None]:
y.head()

In [None]:
vec_train = vectorizer.fit_transform(train_X)

In [None]:
vec_train = vec_train.toarray()

In [None]:
vec_test = vectorizer.transform(test_X).toarray()

In [None]:
train_data = pd.DataFrame(vec_train , columns=vectorizer.get_feature_names())
test_data = pd.DataFrame(vec_test , columns= vectorizer.get_feature_names())

## Multinomial NB

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
from sklearn.metrics import accuracy_score,classification_report

In [None]:
clf = MultinomialNB()

In [None]:
clf.fit(train_data, train_y)
predictions  = clf.predict(test_data)

In [None]:
print(classification_report(test_y , predictions))

Now predict on both train set

In [None]:
predictions_train = clf.predict(train_data)
print(classification_report(train_y , predictions_train))

In [None]:
accuracy_score(train_y , predictions_train)

In [None]:
accuracy_score(test_y , predictions)