<a href="https://colab.research.google.com/github/HasnainTariq1/Real-vs-Fake_News_Classifier/blob/main/Real_vs_Fake_News_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from joblib import Parallel, delayed
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# Ensure stopwords are downloaded
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
# Load the dataset
df = pd.read_csv('/content/train.csv')

In [35]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [36]:
df=df.dropna()
df.reset_index(inplace=True)

In [37]:
# Drop the 'label' column from the DataFrame to create the feature set (X)
X=df.drop('label',axis=1)

In [38]:
# Combine the 'title', 'text', and 'author' columns into a single column called 'content'.
X['content'] = df['title'] + ' ' + df['text'] + ' ' + df['author']

In [39]:
X.head()

Unnamed: 0,index,id,title,author,text,content
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You Fired Why the Trut...
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...


In [40]:


## Initialize stopwords and the Porter Stemmer
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Define a function to clean and preprocess text
def clean_text(text):
    # Remove special characters and digits, keeping only alphabets
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text into words
    text = text.split()
    # Remove stopwords and apply stemming
    text = [ps.stem(word) for word in text if word not in stop_words]
    # Join the words back into a single string
    return ' '.join(text)



# Apply the cleaning function in parallel across the 'content' column
# Parallel processing improves efficiency for large datasets
X['content'] = Parallel(n_jobs=-1)(delayed(clean_text)(text) for text in X['content'])


In [41]:
X = X['content']  # Features (preprocessed text)
y = df['label']  # Target (numerical labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [42]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features

# Fit the vectorizer to the training data (X_train) and transform it into a TF-IDF feature matrix.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data (X_test) into a TF-IDF feature matrix.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [43]:

# Train the model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.954334153677878
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96      2082
           1       0.96      0.94      0.95      1575

    accuracy                           0.95      3657
   macro avg       0.95      0.95      0.95      3657
weighted avg       0.95      0.95      0.95      3657



In [44]:

# Function to predict if the news is fake or real
def predict_news(text):
    # Preprocess the text (cleaning it)
    cleaned_text = clean_text(text)

    # Transform the cleaned text using the loaded vectorizer
    transformed_text = tfidf_vectorizer.transform([cleaned_text])

    # Predict using the loaded model
    prediction = model.predict(transformed_text)

    # Convert numerical prediction to label
    return "Real" if prediction[0] == 1 else "Fake"

# Input from the user
user_input = input("Enter the news text or article to classify: ")

# Make a prediction
result = predict_news(user_input)
print(f"The news is predicted to be: {result}")

Enter the news text or article to classify: Have you seen that pro-Hillary TV ad of disgraced Gen. John Allen?  Nauseating.  You should know that in 2011, Allen, then a 4-star general in the U.S. Marine Corps, was nominated to be NATO’s Supreme Allied Commander, Europe, pending confirmation by the Senate. On November 13, 2012, Defense Secretary Leon Panetta suspended Allen’s confirmation hearing, pending investigations into the general’s “inappropriate communication” with a woman named Jill Kelley. Gen. John Allen (l); Jill Kelley (r)  As part of the fallout of the Gen. David Petraeus -Paula Broadwell affair, the FBI uncovered 20,000 to 30,000 pages of correspondence — mostly email — between Allen and Kelley from 2010 to 2012. Reportedly, their correspondence was “flirtatious” and “inappropriate” as Allen and Kelley were both married at the time, but not to each other.  Seriously, how can a 4-star general even have so much free time as to write 20,000 to 30,000 emails in the space of t