In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Step 1: Load the dataset
fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')

# Inspect the first few rows
print("Dataset preview:")
print(fake.head())

Dataset preview:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  


In [3]:
print(fake['text'])

0        Donald Trump just couldn t wish all Americans ...
1        House Intelligence Committee Chairman Devin Nu...
2        On Friday, it was revealed that former Milwauk...
3        On Christmas day, Donald Trump announced that ...
4        Pope Francis used his annual Christmas Day mes...
                               ...                        
23476    21st Century Wire says As 21WIRE reported earl...
23477    21st Century Wire says It s a familiar theme. ...
23478    Patrick Henningsen  21st Century WireRemember ...
23479    21st Century Wire says Al Jazeera America will...
23480    21st Century Wire says As 21WIRE predicted in ...
Name: text, Length: 23481, dtype: object


In [4]:
print(true['text'])

0        WASHINGTON (Reuters) - The head of a conservat...
1        WASHINGTON (Reuters) - Transgender people will...
2        WASHINGTON (Reuters) - The special counsel inv...
3        WASHINGTON (Reuters) - Trump campaign adviser ...
4        SEATTLE/WASHINGTON (Reuters) - President Donal...
                               ...                        
21412    BRUSSELS (Reuters) - NATO allies on Tuesday we...
21413    LONDON (Reuters) - LexisNexis, a provider of l...
21414    MINSK (Reuters) - In the shadow of disused Sov...
21415    MOSCOW (Reuters) - Vatican Secretary of State ...
21416    JAKARTA (Reuters) - Indonesia will buy 11 Sukh...
Name: text, Length: 21417, dtype: object


In [5]:
merge = pd.concat([true['text'],fake['text']])
merge

0        WASHINGTON (Reuters) - The head of a conservat...
1        WASHINGTON (Reuters) - Transgender people will...
2        WASHINGTON (Reuters) - The special counsel inv...
3        WASHINGTON (Reuters) - Trump campaign adviser ...
4        SEATTLE/WASHINGTON (Reuters) - President Donal...
                               ...                        
23476    21st Century Wire says As 21WIRE reported earl...
23477    21st Century Wire says It s a familiar theme. ...
23478    Patrick Henningsen  21st Century WireRemember ...
23479    21st Century Wire says Al Jazeera America will...
23480    21st Century Wire says As 21WIRE predicted in ...
Name: text, Length: 44898, dtype: object

In [6]:
len(true)

21417

In [7]:
len(fake)

23481

In [8]:
data = ["true"] * 21417 + ["fake"] * 23481

In [9]:
label = pd.DataFrame({"label": data})
label['label']

0        true
1        true
2        true
3        true
4        true
         ... 
44893    fake
44894    fake
44895    fake
44896    fake
44897    fake
Name: label, Length: 44898, dtype: object

In [10]:
# Step 2: Prepare features and labels
# Assuming 'text' contains the news articles and 'label' contains REAL/FAKE
X = merge
y = label['label']

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Step 3: Vectorize the text data
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) 
#stop_words='english'：這表示使用內建的英語停用詞（stop words）列表，會自動忽略常見但沒有重要意義的單詞，例如 "the"、"is"、"and" 等，以減少噪音，提高文本表示的質量。 
# max_df 控制的是忽略出現在過多文檔中的詞彙。可以避免一些常見但無法提供有效信息的詞。

# Fit and transform the training data, transform the test data
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [12]:
# Step 4: Train the PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier(max_iter=50)
clf.fit(X_train_vectorized, y_train)

In [13]:
# Step 5: Make predictions and evaluate the model
y_pred = clf.predict(X_test_vectorized)

In [14]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy * 100:.2f}%")


Accuracy: 99.44%


In [15]:
# Display confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[4623   27]
 [  23 4307]]


In [16]:
# Display detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

        fake       1.00      0.99      0.99      4650
        true       0.99      0.99      0.99      4330

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [17]:
# Step 6: Function to predict new articles
def predict_news(text):
    # Vectorize the input text
    text_vectorized = vectorizer.transform([text])
    # Predict using the trained classifier
    prediction = clf.predict(text_vectorized)
    return prediction[0]


In [18]:
# Example usage
sample_text = "U.S. appeals court rejects challenge to Trump voter fraud panel"
result = predict_news(sample_text)
print(f"\nPrediction for sample text: {result}")


Prediction for sample text: true
