In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [2]:
data = pd.read_csv("fake_or_real_news.csv")

In [3]:
#make a column of 1s and 0s to distinguish easier between true or false
data['fake'] = data['label'].apply(lambda x: 0 if x == "REAL" else 1) 

In [4]:
#drop the label column and it dissapears from the dataset
data = data.drop("label", axis = 1)

In [5]:
#X stores the text and y stores the bools
X, y = data['text'], data['fake']

In [1]:
#train_test_split is used to split the dataset into training and testing sets, so 20% of the data will be used for testing and
# 80% will be used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

NameError: name 'train_test_split' is not defined

In [7]:
#vectorize the X/text data so that the text can be used as input for the machine learning model
vectorizer = TfidfVectorizer(stop_words = "english", max_df = 0.7)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [8]:
#create an instance of the classifier, classifiers are trained on seen data, and then the trained classifier is then used
#to predict the outcome of the unseen data
clf = LinearSVC()
#The classifier learns the relationships between the input features 
#(TF-IDF values in this case) and the target variable (labels).
#Term Frequency: TF of a term or word is the number of times the term appears in a document 
#compared to the total number of words in the document.
#Inverse Document Frequency: IDF of a term reflects the proportion of documents in the corpus that contain the term.
clf.fit(X_train_vectorized, y_train)



In [9]:
#95 % accuracy from 1267 articles
clf.score(X_test_vectorized, y_test)

0.9423835832675612

In [10]:
len(y_test)

1267

In [11]:
#1200 were true articles
len(y_test) * .9479

1200.9893

In [20]:
#how to test text from the fake_or_real_news.csv
with open("news.txt", "w", encoding = "utf-8") as f:
    f.write(X_test.iloc[10])

In [21]:
with open("news.txt", "r", encoding = "utf-8") as f:
    text = f.read()

In [22]:
vectorized_text = vectorizer.transform([text])

In [23]:
clf.predict(vectorized_text)

array([1])

In [24]:
#how to test your own text
with open("news.txt", "r", encoding="utf-8") as file:
    new_text = file.read()

In [25]:
# Vectorize the new text using the same vectorizer
new_text_vectorized = vectorizer.transform([new_text])

In [26]:
# Use the trained classifier to predict the label (0 for REAL, 1 for FAKE)
prediction = clf.predict(new_text_vectorized)

In [27]:
# Print the result
if prediction[0] == 0:
    print("The news in 'news.txt' is classified as REAL.")
else:
    print("The news in 'news.txt' is classified as FAKE.")

The news in 'news.txt' is classified as FAKE.
