In [2]:
import csv

# Open and read the fake news CSV files
fake_file_path = 'articles\\fake_news.csv'
real_file_path = 'articles\\real_news.csv'

fake_label = 0
real_label = 1

labeled_fake_articles = []
with open(fake_file_path, 'r', encoding='utf-8') as file:
    csv_file = csv.reader(file)
    headers = next(csv_file)
    for row in csv_file:

        title = row[0]
        text = row[1]
        subject = row[2]
        date = row[3]
        label = fake_label # Assign labels to each article (0 for fake)

        article = (title, text, subject, date, label)

        labeled_fake_articles.append(article)


labeled_real_articles = []
with open(real_file_path, 'r', encoding='utf-8') as file:
    csv_file = csv.reader(file)
    headers = next(csv_file)
    for row in csv_file:

        title = row[0]
        text = row[1]
        subject = row[2]
        date = row[3]
        label = real_label # Assign labels to each article (1 for real)

        article = (title, text, subject, date, label)

        labeled_real_articles.append(article)

        

combined_articles = labeled_real_articles + labeled_fake_articles



In [3]:

import random
random.shuffle(combined_articles)

# Display the first few rows of the combined dataset
for i in range(5):
    print(combined_articles[i])

# Check the distribution of labels
fake_count = sum(1 for article in combined_articles if article[4] == fake_label)
real_count = sum(1 for article in combined_articles if article[4] == real_label)
print("Fake Articles:", fake_count)
print("Real Articles:", real_count)

("Qatar calls Trump's Jerusalem move 'death sentence for peace'", 'DOHA (Reuters) - Qatar s foreign minister said on Wednesday U.S. President Donald Trump s decision to recognize Jerusalem as the capital of Israel was a death sentence for all who seek peace, Qatari-owned Al Jazeera television reported. Sheikh Mohammed bin Abdulrahman al-Thani called the move  a dangerous escalation . Qatar s foreign ministry said earlier on Twitter that Emir Sheikh Tamim bin Hamad al-Thani had warned of serious implications from the decision in a telephone conversation with Trump. ', 'worldnews', 'December 6, 2017 ', 1)
("Macron says Iran misunderstands France's 'balanced' position in region", 'GOTHENBURG, Sweden (Reuters) - President Emmanuel Macron said on Friday that Iran misunderstood France s  balanced  position in the region, which centered on not taking sides between Sunni and Shi ites, adding that Tehran should be less aggressive in the region. Speaking in Gothenburg, Macron said he wanted Iran

In [46]:
import re

def preprocess(text):

    # Define a list of known abbreviations
    abbreviations = ["U.S.", "Dr.", "etc.", "e.g.", "i.e."]

    
   # Separate words that are joined together (e.g., leftNews)
    tokens = []
    for word in text.split():
       # Check if the word is an abbreviation
        found_abbreviation = False
        for abbr in abbreviations:
            if abbr in word:
                # Remove punctuation from the abbreviation
                abbr_without_punctuation = ''.join(char for char in abbr if char.isalnum())

                # Tokenize the text
                tokens.append(abbr_without_punctuation)

                found_abbreviation = True
                break
        
        if not found_abbreviation:
            # Pattern to handle abbreviations and words with punctuation
            tokens.extend(re.findall(r'[A-Z]{2,}(?:\.[A-Z]\.)?(?:[,.!?]|$)|[A-Z]?[a-z]+|[A-Z]+|[a-z]+(?=[A-Z])', word))
      
    # Remove stopwords
    stopwords = ["the", "and", "is", "it", "in", "to", "of", "an", "a"]
    tokens_without_stopwords = [word for word in tokens if word not in stopwords]
   
    # Join tokens back into a string and convert text to lowercase
    preprocessed_text = ' '.join(tokens_without_stopwords).lower()

    text_without_punctuation = re.sub(r'[^\w\s]', '', preprocessed_text)

    return text_without_punctuation


# Preprocess the title and text data in combined_articles
preprocessed_articles = []
for article in combined_articles:
    title, text, subject, date, label = article
    preprocessed_title = preprocess(title)
    preprocessed_text = preprocess(text)
    preprocessed_subject = preprocess(subject)
    preprocessed_article = (preprocessed_title, preprocessed_text, preprocessed_subject, date, label)
    preprocessed_articles.append(preprocessed_article)

# Example of preprocessed article
print("Original Title:", combined_articles[0][0])
print("Original Text:", combined_articles[0][1])
print("Original Subject:", combined_articles[0][2])
print("Preprocessed Title:", preprocessed_articles[0][0])
print("Preprocessed Text:", preprocessed_articles[0][1])
print("Preprocessed Subject:", preprocessed_articles[0][2])


# # Example usage:
# text = r'''This is an example"...(sentence of left-news and leftNews, worldnews, U.S., and Dr., 3, 800, 2019, 21questions with punctuation, numbers and abbreviations like e.g. and i.e. with punctuation and stopwords!'''
# preprocessed_text = preprocess(text)
# print("Original Text:", text)
# print("Preprocessed Text:", preprocessed_text)

Original Title: Qatar calls Trump's Jerusalem move 'death sentence for peace'
Original Text: DOHA (Reuters) - Qatar s foreign minister said on Wednesday U.S. President Donald Trump s decision to recognize Jerusalem as the capital of Israel was a death sentence for all who seek peace, Qatari-owned Al Jazeera television reported. Sheikh Mohammed bin Abdulrahman al-Thani called the move  a dangerous escalation . Qatar s foreign ministry said earlier on Twitter that Emir Sheikh Tamim bin Hamad al-Thani had warned of serious implications from the decision in a telephone conversation with Trump. 
Original Subject: worldnews
Preprocessed Title: qatar calls trump s jerusalem move death sentence for peace
Preprocessed Text: doha reuters qatar s foreign minister said on wednesday us president donald trump s decision recognize jerusalem as capital israel was death sentence for all who seek peace qatari owned al jazeera television reported sheikh mohammed bin abdulrahman al thani called move dange

In [48]:
# CSV file path
preprocessed_csv_file = 'preprocessed_articles.csv'

# Open the CSV file in write mode
with open(preprocessed_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['preprocessed_title', 'preprocessed_text', 'preprocessed_subject', 'date', 'label'])
    
    # Write each preprocessed article to the CSV file
    for article in preprocessed_articles:
        csv_writer.writerow(article)

In [97]:
import pandas as pd

# Read the preprocessed CSV file
df = pd.read_csv(preprocessed_csv_file)

# Drop rows with missing values in any column
df.dropna(inplace=True)

# Columns to include as features
text = df['preprocessed_text'] 
title = df['preprocessed_title']  
subject = df['preprocessed_subject'] 
label = df['label'] 

# Combine the selected columns into a single DataFrame (x)
#X = pd.concat([text, title, subject], axis=1)
X = text

# Extract the target variable (y)
y = label

# print("Number of samples in text column:", len(text))
# print("Number of samples in title column:", len(title))
# print("Number of samples in subject column:", len(subject))
# print("Number of samples in label column:", len(label))

# print("Shape of x:", X.shape)
# print("Shape of y:", y.shape)

# print("Shape of text:", text.shape)
# print("Shape of title:", title.shape)
# print("Shape of subject:", subject.shape)


# print("Data type of x:", type(X))
# print("Data type of y:", type(y))




# Inspecting the first few rows of the DataFrame to verify the data
print(X.head())
print(y.head())
print(y.value_counts())

# print("Number of fake article:", len(labeled_fake_articles))
# print("Number of real article:", len(labeled_real_articles))

# # Check for missing values or NaNs in x
# missing_values_x = X.isnull().sum()
# print("Missing values in x:")
# print(missing_values_x)

# # Check for missing values or NaNs in y
# missing_values_y = y.isnull().sum()
# print("Missing values in y:")
# print(missing_values_y)




0    doha reuters qatar s foreign minister said on ...
1    gothenburg sweden reuters president emmanuel m...
2    last night nbc hosted commander chief forum ne...
3    washington reuters us house representatives sp...
4    new york reuters puerto rico s governor propos...
Name: preprocessed_text, dtype: object
0    1
1    1
2    0
3    1
4    1
Name: label, dtype: int64
label
0    22851
1    21416
Name: count, dtype: int64


In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Vectorize text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=40000)  # Adjust max_features as needed

# Fit and transform the data
x = vectorizer.fit_transform(X)

# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Now, x_train, x_test contain the preprocessed and vectorized text data,
# and y_train, y_test contain the corresponding labels

In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Logistic Regression model
model = LogisticRegression()

# Train the model
model.fit(x_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9846397108651457
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      4494
           1       0.98      0.99      0.98      4360

    accuracy                           0.98      8854
   macro avg       0.98      0.98      0.98      8854
weighted avg       0.98      0.98      0.98      8854

