# Predicting Fake News using NLP

## Importing Libraries

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Reading Datasets

In [7]:
# dataset 1: large news corpus stored as two CSV files
# Load both files first, then tag each frame with its class.
true = pd.read_csv('True.csv', sep=',')
false = pd.read_csv('Fake.csv', sep=',')

# label: 1 = row read from True.csv, 0 = row read from Fake.csv
true['label'] = 1
false['label'] = 0




In [8]:
# Stack the two labelled frames row-wise (axis=0 is concat's default).
# Each frame keeps its own row index; nothing is renumbered here.
data = pd.concat([true, false])

In [9]:
# Keep only the columns not listed here; the preprocessing below reads 'title'.
data = data.drop(columns=['text', 'subject', 'date'])

## Text Preprocessing

In [10]:
import re
from nltk.stem.porter import PorterStemmer
# Normalise every headline: replace anything that is not an ASCII letter with
# a space, lowercase the result, and collapse runs of whitespace so the words
# end up separated by single spaces.
non_letters = re.compile('[^a-zA-Z]')
X = [
    ' '.join(non_letters.sub(' ', title).lower().split())
    for title in data['title']
]

# Targets: the 0/1 'label' column assigned when the two CSV files were loaded.
y = data['label']

## Test/Train Split

In [11]:
from sklearn.model_selection import train_test_split
# Hold out a quarter of the samples for evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Text Representations

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
cv = CountVectorizer()
tfidf = TfidfTransformer(norm=None)
X = cv.fit_transform(X).toarray()
X = tfidf.fit_transform(X).toarray()



In [None]:
print(data.head())

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(fit_intercept=True, penalty="l2", C=1, max_iter=200 ,solver='saga')
classifier.fit(X_train, y_train)

## Predict Probabilities

In [None]:
y_pred = classifier.predict_proba(X_test)


## Performance Metrics

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
cm = confusion_matrix(y_test,classifier.predict(X_test))
print(cm)
accuracy_score(y_test, y_pred)


## Pipeline 

In [None]:
# Shell escape: install the skl2onnx package (source of convert_sklearn, imported below).
!pip install skl2onnx


In [None]:
# Shell escape: install onnxruntime (imported below as `rt` to run the exported models).
!pip install onnxruntime

In [None]:
from sklearn.pipeline import Pipeline
import seaborn as sns

from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import Int64TensorType
import onnxruntime as rt


In [None]:
# Chain count vectorisation, TF-IDF weighting and logistic regression into a
# single estimator so that title strings can be passed in directly.
pipeline = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer(norm=None)),
        (
            'classifier',
            LogisticRegression(fit_intercept=True, penalty="l2", C=1, max_iter=200, solver='saga'),
        ),
    ]
)


In [None]:
# Fit all pipeline stages on the cleaned training titles (plain strings);
# vectorisation happens inside the pipeline itself.
pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out titles; Pipeline.score delegates
# to the final estimator's score method after applying the transforms.
pipeline.score(X_test, y_test)


In [None]:
import os

import joblib

# Nothing earlier in this notebook creates the output folder, and joblib.dump
# does not create missing parent directories, so make sure it exists first.
os.makedirs('./model', exist_ok=True)

# Serialise the fitted pipeline to disk.
joblib.dump(pipeline, './model/pipeline.pkl')


## Pipeline in ONNX Format

In [None]:
from skl2onnx.common.data_types import StringTensorType


In [None]:
# Describe the pipeline's single input to the converter: a string tensor with
# one dimension of unspecified length.
initial_type = [
    ('StringTensorType', StringTensorType([None])),
]
onx = convert_sklearn(pipeline, initial_types=initial_type)

# Write the converted model's serialised bytes to disk.
with open("./model/pipeline.onnx", "wb") as onnx_file:
    onnx_file.write(onx.SerializeToString())

In [None]:
# Load the exported pipeline and look up the names of its first input and
# first output.
sess = rt.InferenceSession("./model/pipeline.onnx")
input_name, label_name = sess.get_inputs()[0].name, sess.get_outputs()[0].name

# Request only that one output; run() returns a list with one entry per
# requested output, so take the first element.
onnx_outputs = sess.run([label_name], {input_name: X_test})
pred_onx = onnx_outputs[0]

In [None]:
# Display the value returned for the ONNX session's first output (fetched via label_name).
pred_onx


## Model in ONNX Format

In [None]:
initial_type = [('int64_input', Int64TensorType([None,19576]))]
onx = convert_sklearn(classifier, initial_types=initial_type)
with open("./model/classifier.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [None]:
sess = rt.InferenceSession("./model/classifier.onnx")
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name

pred_onx = sess.run([label_name], {input_name: x_test_vector.toarray().astype(np.int64)})[0]

In [None]:
# Shell escape: recursively archive everything under ./model into model.zip.
!zip -r model.zip  ./model