In [None]:
import csv

# Open and read the fake news CSV files
fake_file_path = 'articles\\fake_news.csv'
real_file_path = 'articles\\real_news.csv'

fake_label = 0
real_label = 1

labeled_fake_articles = []
with open(fake_file_path, 'r', encoding='utf-8') as file:
    csv_file = csv.reader(file)
    headers = next(csv_file)
    for row in csv_file:

        title = row[0]
        text = row[1]
        subject = row[2]
        date = row[3]
        label = fake_label # Assign labels to each article (0 for fake)

        article = (title, text, subject, date, label)

        labeled_fake_articles.append(article)


labeled_real_articles = []
with open(real_file_path, 'r', encoding='utf-8') as file:
    csv_file = csv.reader(file)
    headers = next(csv_file)
    for row in csv_file:

        title = row[0]
        text = row[1]
        subject = row[2]
        date = row[3]
        label = real_label # Assign labels to each article (1 for real)

        article = (title, text, subject, date, label)

        labeled_real_articles.append(article)

        

combined_articles = labeled_real_articles + labeled_fake_articles



In [None]:

import random
random.shuffle(combined_articles)

# Display the first few rows of the combined dataset
for i in range(5):
    print(combined_articles[i])

# Check the distribution of labels
fake_count = sum(1 for article in combined_articles if article[4] == fake_label)
real_count = sum(1 for article in combined_articles if article[4] == real_label)
print("Fake Articles:", fake_count)
print("Real Articles:", real_count)

In [None]:
import re

def preprocess(text):

    # Define a list of known abbreviations
    abbreviations = ["U.S.", "Dr.", "etc.", "e.g.", "i.e."]

    
   # Separate words that are joined together (e.g., leftNews)
    tokens = []
    for word in text.split():
       # Check if the word is an abbreviation
        found_abbreviation = False
        for abbr in abbreviations:
            if abbr in word:
                # Remove punctuation from the abbreviation
                abbr_without_punctuation = ''.join(char for char in abbr if char.isalnum())

                # Tokenize the text
                tokens.append(abbr_without_punctuation)

                found_abbreviation = True
                break
        
        if not found_abbreviation:
            # Pattern to handle abbreviations and words with punctuation
            tokens.extend(re.findall(r'[A-Z]{2,}(?:\.[A-Z]\.)?(?:[,.!?]|$)|[A-Z]?[a-z]+|[A-Z]+|[a-z]+(?=[A-Z])', word))
      
    # Remove stopwords
    stopwords = ["the", "and", "is", "it", "in", "to", "of", "an", "a"]
    tokens_without_stopwords = [word for word in tokens if word not in stopwords]
   
    # Join tokens back into a string and convert text to lowercase
    preprocessed_text = ' '.join(tokens_without_stopwords).lower()

    text_without_punctuation = re.sub(r'[^\w\s]', '', preprocessed_text)

    return text_without_punctuation


# Preprocess the title and text data in combined_articles
preprocessed_articles = []
for article in combined_articles:
    title, text, subject, date, label = article
    preprocessed_title = preprocess(title)
    preprocessed_text = preprocess(text)
    preprocessed_subject = preprocess(subject)
    preprocessed_article = (preprocessed_title, preprocessed_text, preprocessed_subject, date, label)
    preprocessed_articles.append(preprocessed_article)

# Example of preprocessed article
print("Original Title:", combined_articles[0][0])
print("Original Text:", combined_articles[0][1])
print("Original Subject:", combined_articles[0][2])
print("Preprocessed Title:", preprocessed_articles[0][0])
print("Preprocessed Text:", preprocessed_articles[0][1])
print("Preprocessed Subject:", preprocessed_articles[0][2])


# # Example usage:
# text = r'''This is an example"...(sentence of left-news and leftNews, worldnews, U.S., and Dr., 3, 800, 2019, 21questions with punctuation, numbers and abbreviations like e.g. and i.e. with punctuation and stopwords!'''
# preprocessed_text = preprocess(text)
# print("Original Text:", text)
# print("Preprocessed Text:", preprocessed_text)

In [None]:
# CSV file path
preprocessed_csv_file = 'preprocessed_articles.csv'

# Open the CSV file in write mode
with open(preprocessed_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['preprocessed_title', 'preprocessed_text', 'preprocessed_subject', 'date', 'label'])
    
    # Write each preprocessed article to the CSV file
    for article in preprocessed_articles:
        csv_writer.writerow(article)

In [144]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from textblob import TextBlob

# Read the preprocessed CSV file
df = pd.read_csv(preprocessed_csv_file)

# Drop rows with missing values in any column
df.dropna(inplace=True)


# Example function to analyze sentiment
def analyze_sentiment(text):
    blob = TextBlob(text)
    # Get polarity score (-1 to +1) where < 0 indicates negative sentiment, > 0 indicates positive sentiment
    polarity = blob.sentiment.polarity
    # Classify sentiment based on polarity score
    if polarity > 0:
        return 'positive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'

# Assuming 'df' is your DataFrame containing the preprocessed articles
# Apply sentiment analysis to each article's text and add sentiment features
df['sentiment'] = df['preprocessed_text'].apply(analyze_sentiment)

# Now 'df' contains a new column 'sentiment' indicating the sentiment (positive, negative, or neutral) of each article


# Columns to include as features
text = df['preprocessed_text'] 
title = df['preprocessed_title']  
subject = df['preprocessed_subject'] 
label = df['label'] 
sentiment = df['sentiment']




In [148]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# # # Vectorize text data using TF-IDF
# vectorizer = TfidfVectorizer(max_features=40000)  # Adjust max_features as needed

# # Initialize the TfidfVectorizer with n-gram range
# #vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # This includes unigrams and bigrams

# # Fit and transform the data
# x = vectorizer.fit_transform(X)

# # Split data into training and testing sets
# x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize each text feature separately
vectorizer_text = TfidfVectorizer(max_features=40000)
x_text = vectorizer_text.fit_transform(text)

vectorizer_title = TfidfVectorizer(max_features=40000)
x_title = vectorizer_title.fit_transform(title)

vectorizer_subject = TfidfVectorizer(max_features=40000)
x_subject = vectorizer_subject.fit_transform(subject)

vectorizer_sentiment = TfidfVectorizer(max_features=40000)
x_sentiment = vectorizer_sentiment.fit_transform(sentiment)

# Concatenate the resulting matrices
#X = hstack([x_text, x_title, x_subject])
X = hstack([x_title, x_sentiment])
#X = x_title  # Include relevant features
y = label  # Target variable



# Split the data into training and testing sets (80% train, 20% test)
x_train_val, x_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.2, random_state=42)

# Now, x_train, x_test contain the preprocessed and vectorized text data,
# and y_train, y_test contain the corresponding labels

In [150]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Logistic Regression model
lr_model = LogisticRegression()

In [152]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of estimators as needed


In [153]:
model = rf_model

# Train the model
model.fit(x_train, y_train)

# Make predictions on the validation set
y_val_pred = model.predict(x_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("Validation Accuracy:", val_accuracy)
print("Validation Classification Report:\n", val_report)

# Make predictions on the test set
y_test_pred = model.predict(x_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print("Test Accuracy:", test_accuracy)
print("Test Classification Report:\n", test_report)

# # Make predictions on the testing set
# y_pred = model.predict(x_test)

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)

# print("Accuracy:", accuracy)
# print("Classification Report:\n", report)

Validation Accuracy: 0.9699279966116052
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97      3726
           1       0.96      0.98      0.97      3357

    accuracy                           0.97      7083
   macro avg       0.97      0.97      0.97      7083
weighted avg       0.97      0.97      0.97      7083

Test Accuracy: 0.9699570815450643
Test Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      4494
           1       0.97      0.97      0.97      4360

    accuracy                           0.97      8854
   macro avg       0.97      0.97      0.97      8854
weighted avg       0.97      0.97      0.97      8854

