# Fake news classification

This dataset was taken from Kaggle web page. 

In [1]:
#Import modules
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [2]:
#Set seed
np.random.seed(5)

#Loading true news
truedf = pd.read_csv('../DataSets/FakeNewsClassifier/True.csv')
#Loading fake news
fakedf = pd.read_csv('../DataSets/FakeNewsClassifier/Fake.csv')

print(truedf.head())
print('------')
print(fakedf.head())

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   
------
                                               title  \
0   Donald Trump Sends Out Embarras

In [3]:
#Checking for nulls values
print(truedf.isnull().sum())
print('---------------')
print(fakedf.isnull().sum())

title      0
text       0
subject    0
date       0
dtype: int64
---------------
title      0
text       0
subject    0
date       0
dtype: int64


In [4]:
#Creating target columns
truedf['is_fake'] = 0
fakedf['is_fake'] = 1

In [5]:
#concat dataframes
df = pd.concat([truedf,fakedf],ignore_index=True)


print(df.head())
print('---------')
print(df.is_fake.value_counts())
print(df.info())

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  is_fake  
0  December 31, 2017         0  
1  December 29, 2017         0  
2  December 31, 2017         0  
3  December 30, 2017         0  
4  December 29, 2017         0  
---------
1    23481
0    21417
Name: is_fak

## Decision Tree

In [6]:
#Decision tree parameter
params = {'max_depth':[5,10,15]}
tree = DecisionTreeClassifier(random_state=42)

#Pipe line to process text data
pipe = Pipeline([('countvector', CountVectorizer()),
                ('tfid', TfidfTransformer())])

In [8]:
df['full_text'] = df.title + df.text

#process the full text
X = pipe.fit_transform(df.full_text)

y = df['is_fake']



In [9]:

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3,  random_state=42)

grid = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), param_grid=params)

In [10]:
#model fit usnig grid search
grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [5, 10, 15]}, pre_dispatch='2*n_jobs',


In [11]:
#selecting best estimator
model = grid.best_estimator_

In [12]:
#Making predictions
y_pred = model.predict(X_test)

In [15]:
print(accuracy_score(y_test,y_pred))

0.9951002227171493


In [17]:
print(confusion_matrix(y_test,y_pred))
print()
print(classification_report(y_test,y_pred))

[[6446   28]
 [  38 6958]]

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6474
           1       1.00      0.99      1.00      6996

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [None]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

pipe = Pipeline([('countvector', CountVectorizer(analyzer=stemmed_words)),
                ('tfid', TfidfTransformer())])

In [None]:
X = pipe.fit_transform(text)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3,  random_state=42)

grid.fit(X_train, y_train)

In [None]:
model = grid.best_estimator_

y_pred = model.predict(X_test)

accuracy_score(y_test,y_pred)

In [None]:
confusion_matrix(y_test,y_pred)