**1. Define Problem: Detect fake news based on title and content**

FEATURES 
- Title: Title of the news
- Text: Brief content of the news
- Label: FAKE or REAL (the target class)

**2. Import Library**

In [None]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

**Load Data Set**

In [None]:
# Read the articles and discard the leftover "Unnamed: 0" column from the CSV.
data = pd.read_csv("news.csv").drop(columns=["Unnamed: 0"])
# Target column (FAKE / REAL), reused by every train/validation split below.
labels = data["label"]
print(f"data shape: {data.shape}")
# Per-column count of missing values.
print(data.isna().sum())
data.head(6)

data shape: (6335, 3)
title    0
text     0
label    0
dtype: int64


Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE


In [None]:
# Class balance: number of rows per label value.
data.label.value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

**3. Vectorize with TfidfVectorizer and Divide Dataset**

In [None]:
# TF-IDF vectorizer with default settings; fitted in the next cell.
tf = TfidfVectorizer()

In [None]:
# Build one TF-IDF block per field and join them column-wise into `news`.
title = data["title"].values
text = data["text"].values

# The title gets its own vectorizer so its vocabulary is not overwritten when
# `tf` is fitted on the article text; `tf` still ends up fitted on the text.
title_tf = TfidfVectorizer()
news_title = title_tf.fit_transform(title)
news_text = tf.fit_transform(text)

# Keep the features sparse. `.todense()` would return an np.matrix (rejected by
# current scikit-learn estimators) and allocate the full documents x vocabulary
# array in memory. CSR format keeps the result row-indexable for
# train_test_split.
news = sparse.hstack((news_title, news_text), format="csr")

# NOTE(review): both vectorizers are fitted on every row before the
# train/validation split, so the IDF statistics include validation documents.

In [None]:
# (rows, columns) of the combined title + text feature matrix.
news.shape

(6335, 77730)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
x_train, x_val, y_train, y_val = train_test_split(
    news,
    labels,
    test_size=0.2,
    random_state=7,
)

**4. Train and evaluate models on the TfidfVectorizer features**

In [None]:
# Passive-aggressive linear classifier on the TF-IDF features.
# The classifier shuffles the training data between epochs, so the seed is
# pinned to make the reported scores repeatable across runs.
pac = PassiveAggressiveClassifier(random_state=7)
pac.fit(x_train, y_train)

In [None]:
# Predict the validation rows and show the fraction classified correctly.
y_pred = pac.predict(x_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

0.9297553275453828

In [None]:
# Summary metrics for the TF-IDF passive-aggressive model on the validation set.
print('accuracy ', accuracy_score(y_val, y_pred))
# Precision, recall and F1 are all averaged over both classes, weighted by support.
for heading, metric in (
    ('precision ', precision_score),
    ('recall ', recall_score),
    ("f1", f1_score),
):
    print(heading, metric(y_val, y_pred, average='weighted'))
print(classification_report(y_val, y_pred, target_names=["FAKE", "REAL"]))
# Rows are true labels, columns are predicted labels, both ordered FAKE then REAL.
confusion_matrix(y_val, y_pred, labels=['FAKE', 'REAL'])

accuracy  0.9297553275453828
precision  0.9298350136208098
recall  0.9297553275453828
f1 0.9297482372742942
              precision    recall  f1-score   support

        FAKE       0.92      0.94      0.93       638
        REAL       0.94      0.92      0.93       629

    accuracy                           0.93      1267
   macro avg       0.93      0.93      0.93      1267
weighted avg       0.93      0.93      0.93      1267



array([[598,  40],
       [ 49, 580]])

In [None]:
# Decision tree on the TF-IDF features.
# Seeded so that tie-breaking between equally good splits, and therefore the
# reported accuracy, is repeatable across runs.
dtc = DecisionTreeClassifier(random_state=7)
dtc.fit(x_train, y_train)
y_predict = dtc.predict(x_val)

print('accuracy {}% '.format(round(accuracy_score(y_val,y_predict) * 100,2)))

In [None]:
# Random forest on the TF-IDF features.
# Train and score on the x_train / x_val split created above. The names
# previously used here (tfidf_train / tfidf_test) are not defined anywhere in
# this notebook, so the cell failed on a fresh kernel.
# Seeded so the bootstrap samples and feature subsets are repeatable.
rf = RandomForestClassifier(random_state=7)
rf.fit(x_train, y_train)
y_prediction = rf.predict(x_val)

print('accuracy {}% '.format(round(accuracy_score(y_val,y_prediction) * 100,2)))

accuracy 89.42% 


**5. Using CountVectorizer to deal with data**


In [None]:
# Bag-of-words counts of the article text, limited to the 5000 most frequent terms.
X = data["text"].values
cv = CountVectorizer(max_features = 5000)
# Keep the sparse matrix returned by the vectorizer. `.todense()` would produce
# an np.matrix, which current scikit-learn estimators reject, and would
# materialise every zero count in memory.
text_cv = cv.fit_transform(X)

In [None]:
# Display the count matrix produced by the CountVectorizer cell above.
text_cv

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 1, 0, ..., 1, 0, 0],
        [0, 1, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Same 80/20 split and seed as the TF-IDF section, applied to the count features.
X_train, X_val, Y_train, Y_val = train_test_split(
    text_cv,
    labels,
    test_size=0.2,
    random_state=7,
)

In [None]:
# Passive-aggressive classifier on the count features, capped at 50 epochs.
# The classifier shuffles the training data between epochs, so the seed is
# pinned to make the reported scores repeatable across runs.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pac.fit(X_train,Y_train)



PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=50, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [None]:
# Evaluate the count-based passive-aggressive model on the validation rows.
Y_pred = pac.predict(X_val)

# Scorers that are all reported as support-weighted averages over both classes.
weighted_scorers = {
    'precision ': precision_score,
    'recall ': recall_score,
    "f1": f1_score,
}
print('accuracy ', accuracy_score(Y_val, Y_pred))
for caption, scorer in weighted_scorers.items():
    print(caption, scorer(Y_val, Y_pred, average='weighted'))
print(classification_report(Y_val, Y_pred, target_names=["FAKE", "REAL"]))
# Rows are true labels, columns are predicted labels, both ordered FAKE then REAL.
confusion_matrix(Y_val, Y_pred, labels=['FAKE', 'REAL'])

accuracy  0.9068666140489345
precision  0.9068685809199039
recall  0.9068666140489345
f1 0.9068653375878469
              precision    recall  f1-score   support

        FAKE       0.91      0.91      0.91       638
        REAL       0.91      0.90      0.91       629

    accuracy                           0.91      1267
   macro avg       0.91      0.91      0.91      1267
weighted avg       0.91      0.91      0.91      1267



array([[580,  58],
       [ 60, 569]])

In [None]:
# Decision tree on the count features, with the full metric report.
# Seeded so the fitted tree, and therefore every score below, is repeatable.
dtc = DecisionTreeClassifier(random_state=7)
dtc.fit(X_train, Y_train)
Y_predict = dtc.predict(X_val)

# Score against Y_val, the labels from this section's own split, rather than
# y_val from the TF-IDF section, so the cell does not depend on that section
# having been run.
print('accuracy ',accuracy_score(Y_val,Y_predict))
print('precision ', precision_score(Y_val,Y_predict,average= 'weighted'))
print('recall ', recall_score(Y_val,Y_predict,average= 'weighted'))
print("f1", f1_score(Y_val,Y_predict, average= 'weighted'))
print(classification_report(Y_val, Y_predict, target_names = ["FAKE","REAL"]))
confusion_matrix(Y_val,Y_predict, labels=['FAKE','REAL'])

accuracy  0.8097868981846882
precision  0.8101723642368537
recall  0.8097868981846882
f1 0.8097570356478723
              precision    recall  f1-score   support

        FAKE       0.82      0.79      0.81       638
        REAL       0.80      0.83      0.81       629

    accuracy                           0.81      1267
   macro avg       0.81      0.81      0.81      1267
weighted avg       0.81      0.81      0.81      1267



array([[507, 131],
       [110, 519]])

In [None]:
# Decision tree on the count features, reported as a percentage.
# This repeats the fit from the preceding cell; the seed is pinned so the
# accuracy printed here is repeatable instead of varying from fit to fit.
dtc = DecisionTreeClassifier(random_state=7)
dtc.fit(X_train, Y_train)
Y_predict = dtc.predict(X_val)

print('accuracy {}% '.format(round(accuracy_score(Y_val,Y_predict) * 100,2)))

accuracy 80.98% 


In [None]:
# Random forest on the count features.
# Seeded so the bootstrap samples and feature subsets, and therefore the
# reported accuracy, are repeatable across runs.
rf = RandomForestClassifier(random_state=7)
rf.fit(X_train, Y_train)
Y_prediction = rf.predict(X_val)

print('accuracy {}% '.format(round(accuracy_score(Y_val,Y_prediction) * 100,2)))

accuracy 90.06% 
