In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
import re

# Fetch the English stop-word list used during text cleaning.
# quiet=True keeps the downloader's progress log (which includes the local
# nltk_data directory path) out of the saved notebook output; the call still
# returns True on success.
nltk.download('stopwords', quiet=True)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\YASEEN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the datasets
fake_df = pd.read_csv('Fake.csv')
true_df = pd.read_csv('True.csv')

# Add a column to indicate whether the news is fake or true
fake_df['label'] = 0  # Fake news
true_df['label'] = 1  # True news

# Combine the datasets and shuffle the rows.
# - ignore_index=True renumbers the rows while concatenating, so no separate
#   reset_index is needed before shuffling.
# - random_state pins the shuffle so the row order (and every output below)
#   is identical on each Restart & Run All.
df = (
    pd.concat([fake_df, true_df], axis=0, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Explore the datasets
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df['label'].value_counts())


                                               title  \
0  Russia builds bridge to move troops across Syr...   
1  North Korea pledges 'powerful counter measures...   
2  ANOTHER CLINTON CASUALTY? Sister Of Woman Who ...   
3  Trump calls Ford building plant in Mexico 'an ...   
4   President Obama Just Sent The Climate Change ...   

                                                text       subject  \
0  (Reuters) - The Russian military has built a b...     worldnews   
1  VLADIVOSTOK, Russia (Reuters) - North Korea on...     worldnews   
2  As more and more women line up to tell their s...      politics   
3  DETROIT (Reuters) - U.S. Republican presidenti...  politicsNews   
4  On Friday, President Obama gave climate change...          News   

                  date  label  
0  September 26, 2017       1  
1   September 7, 2017       1  
2         Nov 25, 2017      0  
3       April 5, 2016       1  
4     January 15, 2016      0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 

In [3]:
# Define a function to clean the text
def clean_text(text):
    """Normalise raw article text for vectorisation.

    Every non-word character (anything other than letters, digits and
    underscore) is replaced by a space, runs of whitespace are collapsed to a
    single space, and the result is lower-cased. Leading/trailing single
    spaces are left in place.

    Parameters
    ----------
    text : str or missing value
        The raw text. ``None`` and NaN (what ``pd.read_csv`` yields for an
        empty field) are treated as empty text; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned, lower-cased text (``''`` for missing input).
    """
    if not isinstance(text, str):
        # NaN is the only value that is not equal to itself.
        if text is None or text != text:
            return ''
        text = str(text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

# Normalise every article body with clean_text
cleaned = df['text'].apply(clean_text)

# English stop words, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))


def _drop_stop_words(doc):
    """Return `doc` with every whitespace-separated token found in stop_words removed."""
    kept = [token for token in doc.split() if token not in stop_words]
    return ' '.join(kept)


# Store the cleaned, stop-word-free text back in the 'text' column
df['text'] = cleaned.apply(_drop_stop_words)

# Explore the cleaned data
print(df.head())


                                               title  \
0  Russia builds bridge to move troops across Syr...   
1  North Korea pledges 'powerful counter measures...   
2  ANOTHER CLINTON CASUALTY? Sister Of Woman Who ...   
3  Trump calls Ford building plant in Mexico 'an ...   
4   President Obama Just Sent The Climate Change ...   

                                                text       subject  \
0  reuters russian military built bridge across e...     worldnews   
1  vladivostok russia reuters north korea thursda...     worldnews   
2  women line tell stories sex bill clinton conse...      politics   
3  detroit reuters u republican presidential fron...  politicsNews   
4  friday president obama gave climate change act...          News   

                  date  label  
0  September 26, 2017       1  
1   September 7, 2017       1  
2         Nov 25, 2017      0  
3       April 5, 2016       1  
4     January 15, 2016      0  


In [4]:
# Features (X) are the cleaned article text, labels (y) the 0/1 class column
X, y = df['text'], df['label']

# Hold out 20% of the rows for testing; random_state fixes the partition
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Show the shape of each split in the order train-X, test-X, train-y, test-y
print(*(part.shape for part in splits))


(35918,) (8980,) (35918,) (8980,)


In [5]:
# Upper bound on the TF-IDF vocabulary size (the number of feature columns).
TFIDF_MAX_FEATURES = 5000

# Use TfidfVectorizer to convert text data to numerical data.
# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely transformed with them, so no test information
# leaks into the features.
vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Explore the shape of the vectorized data
print(X_train_tfidf.shape, X_test_tfidf.shape)


(35918, 5000) (8980, 5000)


In [6]:
# The TF-IDF features were already built in the previous cell. Refitting an
# identical vectorizer on the same training data would only repeat that work,
# so this cell validates the existing matrices instead of recomputing them.
assert X_train_tfidf.shape[0] == X_train.shape[0], "TF-IDF train rows do not match X_train"
assert X_test_tfidf.shape[0] == X_test.shape[0], "TF-IDF test rows do not match X_test"
assert X_train_tfidf.shape[1] == X_test_tfidf.shape[1], "train/test feature counts differ"

# Explore the shape of the vectorized data
print(X_train_tfidf.shape, X_test_tfidf.shape)


(35918, 5000) (8980, 5000)


In [8]:
# Logistic regression classifier with scikit-learn's default settings
model = LogisticRegression()

# Fit on the TF-IDF training features and their 0/1 labels
model.fit(X=X_train_tfidf, y=y_train)


In [9]:
# Score the held-out test features with the trained classifier
y_pred = model.predict(X_test_tfidf)

# Overall fraction of test articles labelled correctly
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Per-class precision/recall/F1 first, then the confusion matrix
for report in (classification_report, confusion_matrix):
    print(report(y_test, y_pred))


Accuracy: 0.9888641425389755
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4672
           1       0.99      0.99      0.99      4308

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

[[4624   48]
 [  52 4256]]


In [10]:
import joblib

# Persist the trained classifier to disk
joblib.dump(value=model, filename='news_classifier_model.pkl')

# Persist the fitted TF-IDF vectorizer alongside it; inference needs both,
# because new text must be transformed with the same vocabulary and weights
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [14]:
# Restore the saved classifier and TF-IDF vectorizer, in that order.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that were written by this notebook or another trusted source.
model, vectorizer = (
    joblib.load(artifact)
    for artifact in ('news_classifier_model.pkl', 'tfidf_vectorizer.pkl')
)

# Define a function to clean and preprocess the input text
def clean_text(text):
    """Return `text` with punctuation stripped, whitespace collapsed, and lower-cased.

    Non-word characters become spaces, consecutive whitespace is squeezed to a
    single space, and the result is converted to lower case. This mirrors the
    cleaning applied to the training data, so inference text must go through
    the same steps.
    """
    without_punctuation = re.sub(r'\W', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.lower()

# English stop words, kept in a set for fast membership tests
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean `text` with clean_text, then drop every token found in stop_words.

    Returns the surviving tokens joined by single spaces.
    """
    tokens = clean_text(text).split()
    kept = [token for token in tokens if token not in stop_words]
    return ' '.join(kept)

# Function to predict whether news is fake or true
def predict_news(text):
    """Classify a raw news article as 'Fake' or 'True'.

    The text is preprocessed with preprocess_text, transformed by the loaded
    TF-IDF vectorizer, and scored by the loaded model. A predicted label of 0
    maps to 'Fake'; any other label maps to 'True'.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list
    features = vectorizer.transform([preprocess_text(text)])
    predicted_label = model.predict(features)[0]
    return 'Fake' if predicted_label == 0 else 'True'

# Prompt the user for input
user_input = input("Enter the news article text: ")

# A blank entry would be vectorised to an all-zero feature vector and still
# receive a label, which would be meaningless. Reject it explicitly instead.
if user_input.strip():
    prediction = predict_news(user_input)
    print(f'The news article is classified as: {prediction}')
else:
    prediction = None
    print('No article text was entered; nothing to classify.')


Enter the news article text:  he following statements were posted to the verified Twitter accounts of U.S. President Donald Trump, @realDonaldTrump and @POTUS.  The opinions expressed are his own. Reuters has not edited the statements or confirmed their accuracy.  @realDonaldTrump : - Vanity Fair, which looks like it is on its last legs, is bending over backwards in apologizing for the minor hit they took at Crooked H. Anna Wintour, who was all set to be Amb to Court of St James’s & a big fundraiser for CH, is beside herself in grief & begging for forgiveness! [1024 EST] -- Source link: (bit.ly/2jBh4LU) (bit.ly/2jpEXYR) ",politicsNews,"December 28, 2017 " "Trump on Twitter (Dec 27) - Trump, Iraq, Syria","The following statements were posted to the verified Twitter accounts of U.S. President Donald Trump, @realDonaldTrump and @POTUS.  The opinions expressed are his own. Reuters has not edited the statements or confirmed their accuracy.  @realDonaldTrump : - “On 1/20 - the day Trump was 

The news article is classified as: True
