In [16]:
import pandas as pd
import numpy as np
import re
import nltk
import swifter
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
# from sklearn.naive_bayes import MultinomialNB

# import textwrap
import joblib

In [17]:
news_df=pd.read_csv("./dataset/news_data.csv")

In [18]:
news_df.head()

Unnamed: 0,title,text,subject,date,label,content
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",1,BREAKING: GOP Chairman Grassley Has Had Enoug...
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",1,Failed GOP Candidates Remembered In Hilarious...
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",1,Mike Pence’s New DC Neighbors Are HILARIOUSLY...
3,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",0,California AG pledges to defend birth control ...
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",1,AZ RANCHERS Living On US-Mexico Border Destroy...


In [19]:
news_df.shape

(44898, 6)

In [20]:
news_df.isna().sum()

title      0
text       0
subject    0
date       0
label      0
content    0
dtype: int64

### filling null with empty string

In [21]:
news_df = news_df.fillna(' ')

In [22]:
news_df.isna().sum()

title      0
text       0
subject    0
date       0
label      0
content    0
dtype: int64

In [23]:
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))  # Load stopwords once
regex = re.compile('[^a-zA-Z]')               # Compile regex once

def stemming(content):
    stemmed_content = regex.sub(' ', content).lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [ps.stem(word) for word in stemmed_content if word not in stop_words]
    return ' '.join(stemmed_content)

In [24]:

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\MOHIT
[nltk_data]     JOSHI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
news_df['content'] = news_df['content'].swifter.apply(stemming)

Pandas Apply:   0%|          | 0/44898 [00:00<?, ?it/s]

In [26]:
X=news_df['content'].values
y=news_df['label'].values

In [27]:
X
# checking if X got expected values

array(['break gop chairman grassley enough demand trump jr testimoni donald trump white hous chao tri cover russia problem mount hour refus acknowledg problem surround fake news hoax howev fact bear thing differ seem crack congression public leadership chuck grassley r iowa head senat judiciari committe fed demand donald trump jr former trump campaign manag paul manafort testifi committe regard infam shadi meet donald trump shadi russian lawyer promis dirt democrat presidenti nomine hillari clinton fact inform due well demand send signal team trump notabl fire special counsel robert mueller circumst despit fact seem seem trump white hous lay groundwork speak speak tweet regard grassley warn also anyon think senat grassley rest senat seriou need look warn alreadi given trump jr manafort either follow order serv subpoena forc compli refus held contempt congress carri seriou jail time even cruel craven creatur within gop sick donald trump corrupt scandal ridden white hous angri stage host

In [28]:
vector= TfidfVectorizer()
vector.fit(X)
X=vector.transform(X)

In [29]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.4, random_state=42)

In [30]:
X_train.shape

(26938, 89868)

In [31]:
X_test.shape

(17960, 89868)

### SGDClassifier Algo

In [32]:
model = SGDClassifier(loss='log_loss', max_iter=1000)


In [33]:
model.partial_fit(X_train, y_train, classes=[0, 1]) 

In [34]:
train_y_pred = model.predict(X_train)
print("Train accuracy:", accuracy_score(train_y_pred, y_train))

Train accuracy: 0.9846684980325191


In [35]:
test_y_pred = model.predict(X_test)
print("Test accuracy:", accuracy_score(test_y_pred, y_test))

Test accuracy: 0.9791759465478842


### Logistic Regression

In [36]:
# model=LogisticRegression()
# model.fit(X_train,y_train)

In [37]:
# train_y_pred=model.predict(X_train)
# print("train accuracy : ",accuracy_score(train_y_pred,y_train))

In [38]:
# test_y_pred=model.predict(X_test)
# print("test accuracy : ",accuracy_score(test_y_pred,y_test))

In [39]:
# y_proba = model.predict_proba(X_test)[:]
# y_proba 


### Naive Bayes Classifier

In [40]:
# model2=MultinomialNB()
# model2.fit(X_train,y_train)

In [41]:
# y_pred=model2.predict(X_test)

In [42]:
# print("naive bayes accuracy : ",accuracy_score(y_test,y_pred))


In [43]:
# classification_report(y_test, y_pred)

In [44]:
input_data=X_test[10]
prediction=model.predict(input_data)
if prediction[0]==1:
    print("fake news")
else :
    print('real news')

real news


In [45]:
joblib.dump(model,'model/fake_news_model.pkl')

['model/fake_news_model.pkl']

In [46]:
joblib.dump(vector,'model/tfid_vectorizer.pkl')
print('model saved')

model saved
