In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string
import pickle

In [3]:
df_fake = pd.read_csv(r'Datasets/Fake.csv')
df_true = pd.read_csv(r'Datasets/True.csv')

In [5]:
df_fake["class"] = 0
df_true["class"] = 1

In [6]:
df_fake.shape, df_true.shape

((23481, 5), (21417, 5))

In [38]:
df_fake_manual_testing = df_fake.tail(10)
for i in range(23480,23470,-1):
    df_fake.drop([i], axis = 0, inplace = True)
df_true_manual_testing = df_true.tail(10)
for i in range(21416,21406,-1):
    df_true.drop([i], axis = 0, inplace = True)

In [39]:
df_fake.shape, df_true.shape

((23471, 5), (21407, 5))

In [42]:
df_fake_manual_testing["class"] = 0
df_true_manual_testing["class"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [43]:
df_manual_testing = pd.concat([df_fake_manual_testing,df_true_manual_testing], axis = 0)
df_manual_testing.to_csv("manual_testing.csv")

In [45]:
df_marge = pd.concat([df_fake, df_true], axis =0 )
df_marge.shape

(44878, 5)

In [46]:
df_marge.columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

In [47]:
df = df_marge.drop(["title", "subject","date"], axis = 1)
df.shape

(44878, 2)

In [48]:
df.isnull().sum()

text     0
class    0
dtype: int64

In [49]:
df = df.sample(frac = 1)
df.head()

Unnamed: 0,text,class
11828,"BELGRADE (Reuters) - In a joint sting, Serbian...",1
22727,21st Century Wire says This has been an ongoin...,0
7415,(Reuters) - Three California cities voted for ...,1
23381,Wall Street on ParadeAt the Democratic debate ...,0
19844,,0


In [50]:
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)

In [51]:
df.head()

Unnamed: 0,text,class
0,"BELGRADE (Reuters) - In a joint sting, Serbian...",1
1,21st Century Wire says This has been an ongoin...,0
2,(Reuters) - Three California cities voted for ...,1
3,Wall Street on ParadeAt the Democratic debate ...,0
4,,0


In [52]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [53]:
df["text"] = df["text"].apply(wordopt)

In [54]:
x = df["text"]
y = df["class"]

In [55]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

### 1. Logistic Regression

In [58]:
from sklearn.linear_model import LogisticRegression

In [59]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [60]:
pred_lr=LR.predict(xv_test)

In [61]:
round((LR.score(xv_test, y_test)*100),2)

98.61

In [62]:
inp = '''Sri Lanka will not allow other countries to achieve their geopolitical needs by introducing “separatism under the guise of power devolution” in the island nation, President Gotabaya Rajapaksa has said.

“The government does not wish to be associated with the power struggles in the Indian Ocean region by the global giants,” Mr. Rajapaksa said, adding that the sovereignty of Sri Lanka would not be betrayed, a front-page report in the state-run Daily News said on Monday.'''

In [63]:
inp_ser = pd.Series(data = inp)

In [64]:
tfidf_inp_ser=vectorization.transform(inp_ser)
ans=LR.predict(tfidf_inp_ser)

In [65]:
if ans == 1:
    print('REAL')
else:
    print('Fake')

REAL


In [66]:
pickle.dump(LR, open('LR_fake_news.pkl', 'wb'))
pickle.dump(vectorization, open('fake_news_tfidf_vectorizer.pkl', 'wb'))

### 2. Decision Tree Classification

In [71]:
from sklearn.tree import DecisionTreeClassifier

In [72]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [73]:
pred_dt = DT.predict(xv_test)

In [79]:
xv_train.shape, y_train.shape, xv_test.shape, y_test.shape

((33658, 95213), (33658,), (11220, 95213), (11220,))

In [80]:
round((DT.score(xv_test, y_test)*100),2)

99.64

In [81]:
pickle.dump(DT, open('DT_fake_news.pkl', 'wb'))

In [82]:
inp_ser = pd.Series(data = inp)
tfidf_inp_ser=vectorization.transform(inp_ser)
ans=LR.predict(tfidf_inp_ser)
if ans == 1:
    print('REAL')
else:
    print('Fake')

REAL


### 3. Gradient Boosting Classifier

In [83]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
#not fitted because of it takes too much time
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

In [None]:
pred_gbc = GBC.predict(xv_test)

### 4. Passive Aggressive Classifier

In [86]:
from sklearn.linear_model import PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
new_pac = pac.fit(xv_train,y_train)

In [88]:
pred_pac = pac.predict(xv_test)

In [95]:
round((pac.score(xv_test, y_test)*100),2)

99.48

In [96]:
pickle.dump(pac, open('pac_fake_news.pkl', 'wb'))

### 4. Random Forest Classifier

In [97]:
from sklearn.ensemble import RandomForestClassifier

In [98]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [102]:
pred_RFC = pac.predict(xv_test)

In [103]:
round((RFC.score(xv_test, y_test)*100),2)

98.96

In [104]:
pickle.dump(pac, open('RFC_fake_news.pkl', 'wb'))