In [6]:
import pandas as pd

df = pd.read_csv("xy_train.csv")  # Replace with your actual path if needed
print(df.head())
print(df['label'].value_counts())


   ID                                               text  label
0   0  Americans Aren't Sure If Flight 370 Vanished T...      1
1   1  Pope Leo X offering indulgences to sinners sho...      0
2   2  News: 5 Uplifting Hypotheticals Of What Could ...      0
3   3  George W. Bush and Jeb Bush at a campaign stop...      0
4   4  The WADA requesting their fair share from Lanc...      0
label
0    25798
1    22015
2      187
Name: count, dtype: int64


In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)  # Remove special characters and digits
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

df['clean_text'] = df['text'].apply(preprocess)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Title length
df['title_length'] = df['text'].apply(len)

# Number of exclamation marks
df['exclamations'] = df['text'].apply(lambda x: x.count('!'))

# Number of all-caps words
df['all_caps'] = df['text'].apply(lambda x: sum(1 for word in x.split() if word.isupper()))


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_tfidf = tfidf.fit_transform(df['clean_text'])

# Combine with custom features
import numpy as np
X_additional = df[['title_length', 'exclamations', 'all_caps']].values
X = hstack([X_tfidf, X_additional])

y = df['label']


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Scale features (recommended for convergence)
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrix
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build model with more iterations
model = LogisticRegression(max_iter=3000, solver='lbfgs')
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)

# Print classification report
print(classification_report(y_test, y_pred, zero_division=0))

# ROC AUC for multiclass (One-vs-Rest)
print("ROC AUC Score (OvR):", roc_auc_score(y_test, y_proba, multi_class='ovr'))


              precision    recall  f1-score   support

           0       0.78      0.78      0.78      5102
           1       0.75      0.75      0.75      4466
           2       0.20      0.09      0.13        32

    accuracy                           0.76      9600
   macro avg       0.58      0.54      0.55      9600
weighted avg       0.76      0.76      0.76      9600

ROC AUC Score (OvR): 0.7497442803274911


In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10]
}
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best ROC AUC:", grid.best_score_)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_scorer.py", line 388,

Best parameters: {'C': 0.1}
Best ROC AUC: nan


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
def predict_fake_news(input_text):
    clean = preprocess(input_text)
    vec = tfidf.transform([clean])
    extra = np.array([[len(input_text), input_text.count('!'), sum(1 for word in input_text.split() if word.isupper())]])
    combined = hstack([vec, extra])
    pred = model.predict(combined)[0]
    return "Fake" if pred == 1 else "Real"

# Example
print(predict_fake_news("Breaking: President resigns over secret scandal!"))


Real
