In [4]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_error,
    accuracy_score,
    classification_report,
    confusion_matrix
)

import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Read the news corpus (JSON Lines: one JSON record per line) into a DataFrame.
corpus_path = "data/news_corpus.jsonl"
df = pd.read_json(corpus_path, lines=True)

# Quick sanity check: overall dimensions, then the first few records.
print("Dataset shape:", df.shape)
df.head()

In [None]:
# Target column and raw document text, both taken from the loaded corpus.
y = df["label"]
X_text = df["text"]

# Bag-of-words TF-IDF features: English stop words are dropped and the
# vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(X_text)

tfidf_shape = X.shape  # (number of documents, vocabulary size)
print("TF-IDF shape:", tfidf_shape)

In [None]:
# Split the RAW text (not the pre-computed TF-IDF matrix) so that the held-out
# documents never influence the vocabulary or the IDF weights. Fitting the
# vectorizer on the full corpus before splitting leaks test-set statistics into
# the training features and makes the reported scores optimistic.
# Same test_size / random_state as before, so the same rows land in each split.
X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42
)

# Refit the already-configured vectorizer on the training text only, then apply
# that fitted vocabulary to the test text.
# NOTE: after this refit `vectorizer` no longer matches the column layout of
# the full-corpus matrix `X`; use X_train / X_test from here on.
X_train = vectorizer.fit_transform(X_text_train)
X_test = vectorizer.transform(X_text_test)

print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])

In [None]:
# Baseline: fit a linear regression on the TF-IDF features, treating the label
# as a numeric target (presumably a 0/1 class indicator -- confirm against data).
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Regression metrics on the held-out split; RMSE is derived from the MSE.
mae = mean_absolute_error(y_test, y_pred_lr)
mse = mean_squared_error(y_test, y_pred_lr)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_lr)

print("===== LINEAR REGRESSION RESULTS =====")
for metric_label, metric_value in (("R2:", r2), ("RMSE:", rmse), ("MAE:", mae)):
    print(metric_label, round(metric_value, 4))

In [None]:
# Non-linear baseline: a 100-tree random forest regressor with a fixed seed so
# the run is repeatable.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Same metric set as the linear model, stored under *_rf names.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("===== RANDOM FOREST RESULTS =====")
for metric_label, metric_value in (("R2:", r2_rf), ("RMSE:", rmse_rf), ("MAE:", mae_rf)):
    print(metric_label, round(metric_value, 4))

In [None]:
# Classification model: logistic regression with a raised iteration cap
# (max_iter=2000).
log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(X_train, y_train)

# Hard class predictions for the held-out documents.
y_pred_class = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_class)

# Headline accuracy followed by the per-class precision/recall/F1 table.
report_text = classification_report(y_test, y_pred_class)
print("===== LOGISTIC REGRESSION (CLASSIFICATION) =====")
print("Accuracy:", round(accuracy, 4))
print("\nClassification Report:\n")
print(report_text)

In [None]:
# Confusion matrix of the logistic-regression predictions (rows = actual,
# columns = predicted).
cm = confusion_matrix(y_test, y_pred_class)

# NOTE(review): the tick labels assume the sorted label values map to
# "Fake" first and "Real" second -- confirm against the dataset's encoding.
class_names = ["Fake", "Real"]

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,   # write the count inside every cell
    fmt="d",      # integer formatting for the counts
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)

ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
def _toon_quote(value):
    """Return ``value`` wrapped in double quotes with backslash escapes.

    Quoting keeps commas inside the text from being read as column
    delimiters in the tabular rows written below.
    """
    escaped = (
        value.replace("\\", "\\\\")  # escape backslashes first so later escapes survive
        .replace('"', '\\"')
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )
    return f'"{escaped}"'


# Export a small preview of the corpus as a TOON-style tabular block:
# a header line `news[<row count>]{label,text}:` followed by one
# `label,text` row per article.
TOON_MAX_ROWS = 300       # upper bound on exported rows
TOON_PREVIEW_CHARS = 100  # characters of article text kept per row

news_sample = df.head(TOON_MAX_ROWS)

toon_rows = []
for label, text in zip(news_sample["label"], news_sample["text"]):
    # Keep a short single-line preview; "..." is always appended, as before.
    text = text[:TOON_PREVIEW_CHARS].replace("\n", " ") + "..."
    # NOTE(review): label is written unquoted -- assumes it never contains a comma.
    toon_rows.append(f"{label},{_toon_quote(text)}\n")

# Declare the number of rows actually written: the corpus may hold fewer than
# TOON_MAX_ROWS records, in which case a hard-coded 300 would be wrong.
toon_header = f"news[{len(news_sample)}]{{label,text}}:\n"

# Join once instead of growing a string inside the loop.
toon_content = toon_header + "".join(toon_rows)

with open("data/news_300.toon", "w", encoding="utf-8") as f:
    f.write(toon_content)

print("TOON file created successfully!")

In [None]:
# Show only the first 500 characters of the generated TOON text as a spot check.
toon_preview = toon_content[:500]
print(toon_preview)

In [None]:
# Side-by-side recap of the headline score of each model, rounded to 4 places.
summary_rows = (
    ("Linear Regression R2:", r2),
    ("Random Forest R2:", r2_rf),
    ("Logistic Regression Accuracy:", accuracy),
)

print("========== MODEL SUMMARY ==========")
for caption, score in summary_rows:
    print(caption, round(score, 4))