In [1]:
# Make all neede imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Read csv-file and put data into DataFrame
df = pd.read_csv(r'data/news.csv')

df.shape
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
# First, check data for nulls
df.isnull().values.any()

False

### There is no missed values. Good!

In [4]:
# Let's see info about DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [5]:
# Get labels from DataFrame
labels = df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [6]:
# Get texts from DataFrame
text = df.text
text.head()

0    Daniel Greenfield, a Shillman Journalism Fello...
1    Google Pinterest Digg Linkedin Reddit Stumbleu...
2    U.S. Secretary of State John F. Kerry said Mon...
3    — Kaydee King (@KaydeeKing) November 9, 2016 T...
4    It's primary day in New York and front-runners...
Name: text, dtype: object

In [7]:
# Ok! Let's split the dataset to training and test sets
x_train, x_test, y_train, y_test = train_test_split(text, labels, test_size=0.3, random_state=7)
print('x train\n', x_train.head())
print('y train\n', y_train.head())
print('x test\n', x_test.head())
print('y test\n', y_test.head())

x train
 4274    Home / Be The Change / Government Corruption /...
4310    At least a half-dozen attendees shoved and tac...
2050    As soon as Rep. Kevin McCarthy (R-Calif.) shoc...
4410    NOT ON THE SHORT LIST\n\nFormer Vice President...
3106    Eric Liu is the founder of Citizen University ...
Name: text, dtype: object
y train
 4274    FAKE
4310    REAL
2050    REAL
4410    REAL
3106    REAL
Name: label, dtype: object
x test
 3534    A day after the candidates squared off in a fi...
6265    VIDEO : FBI SOURCES SAY INDICTMENT LIKELY FOR ...
3123    It's debate season, where social media has bro...
3940    Mitch McConnell has decided to wager the Repub...
2856    Donald Trump, the actual Republican candidate ...
Name: text, dtype: object
y test
 3534    REAL
6265    FAKE
3123    REAL
3940    REAL
2856    REAL
Name: label, dtype: object


In [8]:
# Let’s initialize a TfidfVectorizer with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [9]:
# Now we'll initialize a PassiveAggressiveClassifier
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [10]:
# Predict on the test set and calculate accuracy
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)

print(f'Accuracy: {round(score * 100, 2)}%')

Accuracy: 92.69%


### We got 92.69 % of accuracy with this model, wich is pretty good

In [11]:
# And now, we'll look at confusion matrix to get an idea of the number of false and true negatives and positives

conf_matrix = confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])
print('Confusin matrix:\n', conf_matrix)
tn, fp, fn, tp = conf_matrix.ravel()
print('True negative: ', tn)
print('False positive: ', fp)
print('False negative: ', fn)
print('True positive: ', tp)

Confusin matrix:
 [[906  68]
 [ 71 856]]
True negative:  906
False positive:  68
False negative:  71
True positive:  856


### So with this model, we have 902 true negatives, 853 true positives, 74 false negatives, and 72 false positives