In [2]:
## importarea tuturor librariilor necesare pentru analiza datasetului
## pandas e folosit pentru manipularea & analiza datelor
import pandas as pd
## numpy este folosit la operatii matematice cu arrays & matrice
import numpy as np
## sklearn este folosit in predictive data analysis
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
## serializarea/deserializarea datelor + transportul lor intr-o retea
import pickle

In [4]:
## Load the news dataset from CSV, then pick the model input (the 'text'
## column) and the prediction target (the 'label' column).
dataset = pd.read_csv('news.csv')
x, y = dataset['text'], dataset['label']

In [5]:
## Preview the first five rows of the dataset.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [6]:
## Show the dataset dimensions as (number of rows, number of columns).
dataset.shape

(6335, 4)

In [7]:
## Data-quality check: for every column, report whether it contains at
## least one null/NaN entry (True means the column has missing values).
dataset.isnull().any(axis=0)

Unnamed: 0    False
title         False
text          False
label         False
dtype: bool

In [8]:
## Split the samples into 80% training data / 20% test data.
## random_state pins the shuffle, so the split -- and every metric computed
## from it further down -- is identical on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [9]:
## Convert the raw article text into TF-IDF (Term Frequency - Inverse
## Document Frequency) features: each term gets a real-valued weight per
## document, not a boolean flag. English stop words are dropped and terms
## with a document frequency above 0.7 are ignored (max_df=0.7).
## The vectorizer is fitted on the training texts only; the test texts are
## transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [10]:
## Train a Passive-Aggressive linear classifier on the TF-IDF features and
## report its accuracy on the held-out test set.
## random_state fixes the shuffling of the training data between epochs, so
## the printed accuracy is reproducible from run to run.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(tfidf_train, y_train)
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score * 100, 2)}%')

Accuracy: 94.48%


In [12]:
## Chain TF-IDF vectorisation and a multinomial Naive Bayes classifier into
## a single estimator, so raw text can be passed straight to fit/predict.
## Step parameters stay addressable as '<step name>__<parameter>'.
## NOTE(review): unlike the standalone vectorizer built earlier in this
## notebook, this one does not set max_df=0.7, so the two models are not
## trained on identical features -- confirm this is intended.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nbmodel', MultinomialNB()),
])

In [13]:
## Fit every pipeline step on the training split (raw texts and labels).
pipeline.fit(X=x_train, y=y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('nbmodel', MultinomialNB())])

In [14]:
## Accuracy of the Naive Bayes pipeline on the held-out test split.
## Note: this rebinds `score`, which previously held the accuracy of the
## Passive-Aggressive classifier.
score = pipeline.score(X=x_test, y=y_test)
print('accuracy', score)

accuracy 0.8279400157853196


In [15]:
## Run the test texts through the fitted pipeline and keep the predicted
## labels for the evaluation cells below.
pred = pipeline.predict(X=x_test)

In [16]:
## Print the per-class evaluation table (precision, recall, f1-score and
## support) for the pipeline's predictions on the test split.
report = classification_report(y_true=y_test, y_pred=pred)
print(report)

              precision    recall  f1-score   support

        FAKE       0.97      0.67      0.79       621
        REAL       0.76      0.98      0.85       646

    accuracy                           0.83      1267
   macro avg       0.86      0.82      0.82      1267
weighted avg       0.86      0.83      0.82      1267



In [17]:
## Print the confusion matrix for the pipeline's predictions: rows are the
## true labels, columns are the predicted labels, so the diagonal holds the
## correctly classified counts and the off-diagonal cells the mistakes.
matrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(matrix)

[[417 204]
 [ 14 632]]


In [19]:
## Persist the fitted pipeline to 'model.pkl', opened in binary write mode,
## using the highest pickle protocol available to this interpreter.
## Only unpickle this file from a trusted source: loading a pickle can
## execute arbitrary code.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(pipeline, model_file, pickle.HIGHEST_PROTOCOL)