# In [None]:
import numpy as np  # To create and manipulate numpy arrays
import pandas as pd  # To create and manage data using DataFrames
import re  # Regular expressions for text preprocessing
from nltk.corpus import stopwords  # List of common words (e.g., "the", "is") to be removed
from nltk.stem.porter import PorterStemmer  # Stemming: Converting words to their root form (e.g., "running" -> "run")
from sklearn.feature_extraction.text import TfidfVectorizer  # Converts text into numerical vectors for ML
from sklearn.model_selection import train_test_split  # Splits dataset into training and testing sets
from sklearn.linear_model import LogisticRegression  # Logistic Regression model for classification
from sklearn.metrics import accuracy_score  # To evaluate model accuracy

# Fetch the NLTK 'stopwords' corpus so that stopwords.words() can be used below
import nltk
nltk.download('stopwords')

# Show the English stopword list (the words filtered out during preprocessing)
english_stopword_list = stopwords.words('english')
print(english_stopword_list)

# Data Collection and Preprocessing

# Load dataset into a pandas DataFrame
dataset_path = '/content/train.csv'
news_dataset = pd.read_csv(dataset_path)

# Check the shape of the dataset (number of rows and columns)
print(news_dataset.shape)

# Display the first 5 rows of the dataset
print(news_dataset.head())

# Display the first 20 raw lines of the CSV file for reference.
# Done in plain Python instead of an IPython `!head` shell escape so that the
# code is valid Python outside a notebook and does not need a `head` binary.
with open(dataset_path, encoding='utf-8') as raw_csv:
    # zip() with range(20) stops after 20 lines (or earlier for a shorter file)
    for _, raw_line in zip(range(20), raw_csv):
        print(raw_line, end='')  # each line already ends with its own newline

# Check for missing values in each column
print(news_dataset.isnull().sum())  # Some articles may have missing author names or titles

# Swap every missing value for an empty string so the string concatenation
# below never meets a NaN
news_dataset = news_dataset.fillna('')

# Build the 'content' column (the text the model is trained on) as
# "<author> <title>"
news_dataset['content'] = news_dataset['author'] + (' ' + news_dataset['title'])

# Show the combined text
print(news_dataset['content'])

# Separate the label column from the rest of the table
# X: every column except 'label' (still a DataFrame at this point)
# Y: the 'label' column (0 for real news, 1 for fake news)
X = news_dataset.drop(columns=['label'])
Y = news_dataset['label']

# Show both pieces
print(X)
print(Y)

# Cache for the English stopword set; filled on the first stemming() call
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stopwords as a frozenset, building it on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Built once and reused: asking stopwords.words() again for every token
        # repeats the same lookup and then scans a list for membership.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


# Function for text preprocessing using stemming
def stemming(content):
    """Normalize a piece of text for vectorization.

    Steps: strip everything that is not a letter, lowercase, split on
    whitespace, drop English stopwords, and Porter-stem the remaining words.

    Args:
        content: The raw text (a str) to preprocess.

    Returns:
        A single string of stemmed, lowercase words separated by one space.
        Empty when the input contains no non-stopword words.
    """
    port_stem = PorterStemmer()  # Initialize PorterStemmer
    stop_words = _english_stopwords()

    # Replace every run of non-word characters, decimal digits and underscores
    # with a single space. Using a space (not '') keeps words that were only
    # separated by punctuation from being fused together.
    letters_only = re.sub(r'[\W\d_]+', ' ', content)

    # Lowercase, then tokenize on whitespace
    tokens = letters_only.lower().split()

    # Drop stopwords, stem what is left, and join back into one string
    return ' '.join(port_stem.stem(word) for word in tokens if word not in stop_words)

# Run every row of the 'content' column through stemming() and store the result
stemmed_column = news_dataset['content'].apply(stemming)
news_dataset['content'] = stemmed_column

# Show the preprocessed text
print(news_dataset['content'])

# Pull the model inputs out of the DataFrame as plain arrays:
# X = preprocessed text, Y = labels (0 = real, 1 = fake)
X, Y = news_dataset['content'].values, news_dataset['label'].values

print(X)
print(Y)

# Split the raw text into training (80%) and testing (20%) sets BEFORE
# vectorizing, so the TF-IDF vocabulary and IDF weights are learned from the
# training split only and nothing about the test documents leaks into them.
# - test_size=0.2: 20% of the data is used for testing
# - stratify=Y: Ensures class distribution remains the same in train and test sets
# - random_state=2: Ensures reproducibility of the split
X_train_text, X_test_text, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)

# Convert text data into numerical vectors using TF-IDF
# TF-IDF (Term Frequency-Inverse Document Frequency)
# - TF: Counts how often a word appears in a document
# - IDF: Reduces importance of words that appear too frequently across all documents
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)  # learn vocabulary + IDF on training text
X_test = vectorizer.transform(X_test_text)  # reuse the fitted vocabulary; no refit on test text

# Keep X as the vectorized form of the full corpus (same meaning as before),
# now produced by the vectorizer that was fitted on the training split.
X = vectorizer.transform(X)
print(X)  # Display numerical representation of text

# Initialize Logistic Regression model
model = LogisticRegression()

# Train the model using training data
model.fit(X_train, Y_train)

# Predict labels for training data
X_train_prediction = model.predict(X_train)

# Calculate and print accuracy on training data.
# accuracy_score expects (y_true, y_pred), so the true labels go first.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of training data:', training_data_accuracy)

# Predict labels for test data
X_test_prediction = model.predict(X_test)

# Calculate and print accuracy on test data (true labels first, as above)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of test data:', test_data_accuracy)

# Making a predictive system (Fake News Detection)

# Use the first row of the test set as the sample to classify
X_new = X_test[0]

# Ask the trained model for the label of that single sample
prediction = model.predict(X_new)
print(prediction)

# Label 0 is reported as real; any other label is reported as fake
verdict = 'The news is real' if prediction[0] == 0 else 'The news is fake'
print(verdict)
