In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Quick look at the labels table: one row per article `id`, with
# `real_text_id` giving which of the two files (1 or 2) is the real text.
train_data = pd.read_csv("/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv")
train_data.head()

Unnamed: 0,id,real_text_id
0,0,1
1,1,2
2,2,1
3,3,2
4,4,2


In [3]:

import os
import re
import glob
import gc
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.svm import LinearSVC
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin


In [4]:

# Filesystem layout of the competition dataset and where the submission goes.
DATA_ROOT = Path("/kaggle/input/fake-or-real-the-impostor-hunt")

# Each split directory holds one sub-directory per article.
TRAIN_DIR = DATA_ROOT.joinpath("data", "train")
TEST_DIR = DATA_ROOT.joinpath("data", "test")

# Destination of the CSV written at the end of the notebook.
OUTPUT_PATH = Path("/kaggle/working/submission.csv")


In [5]:

def find_labels_csv(root: Path):
    """
    Recursively search ``root`` for a CSV file that contains the columns
    'id' and 'real_text_id' (column names are matched case-insensitively).

    Candidates are visited in sorted path order, so the same file is selected
    on every run regardless of the order in which the filesystem lists them.
    Files that cannot be read or parsed as CSV are skipped.

    Returns:
        The first matching file as a DataFrame, with every column name
        lower-cased.

    Raises:
        FileNotFoundError: if no CSV under ``root`` has both columns.
    """
    required = {'id', 'real_text_id'}
    for csv_path in sorted(root.rglob("*.csv")):
        try:
            df = pd.read_csv(csv_path)
        except Exception:
            # Best-effort scan: an unreadable or malformed CSV just means
            # "not the labels file", so move on to the next candidate.
            continue

        # str() guards against non-string column labels.
        lowered = {col: str(col).lower() for col in df.columns}
        if required <= set(lowered.values()):
            print(f"[Info] Using labels file: {csv_path}")
            # Normalize column names just in case
            return df.rename(columns=lowered)
    raise FileNotFoundError("No CSV with columns ['id', 'real_text_id'] found under dataset root.")


In [6]:

def extract_article_id(article_dir_name: str) -> int:
    """
    Parse the integer formed by the digits at the end of a directory name,
    e.g. 'article_0000' -> 0.

    Raises ValueError when the name does not end in digits.
    """
    trailing_digits = re.search(r'(\d+)$', article_dir_name)
    if trailing_digits is not None:
        return int(trailing_digits.group(1))
    raise ValueError(f"Could not parse integer id from directory name: {article_dir_name}")


In [7]:

def read_text_file(path: Path) -> str:
    """
    Read a text file as UTF-8 (undecodable bytes are dropped) and return its
    content with every line ending normalized to '\\n'.
    """
    content = Path(path).read_text(encoding='utf-8', errors='ignore')
    # Minimal normalization: Windows (CRLF) first, then any remaining bare CR.
    return content.replace("\r\n", "\n").replace("\r", "\n")


In [8]:

def load_articles_as_rows(split_dir: Path) -> pd.DataFrame:
    """
    Load both text files of every article directory under ``split_dir``.

    Returns a DataFrame with one row per file and the columns
    id (int), file_id (1 or 2), text (str), ordered by (id, file_id).

    Raises FileNotFoundError if ``split_dir`` contains no sub-directories or
    if an article directory lacks file_1.txt or file_2.txt.
    """
    article_dirs = sorted(entry for entry in split_dir.iterdir() if entry.is_dir())
    if len(article_dirs) == 0:
        raise FileNotFoundError(f"No article directories found under: {split_dir}")

    records = []
    for directory in article_dirs:
        article_id = extract_article_id(directory.name)
        for variant in (1, 2):
            candidate = directory / f"file_{variant}.txt"
            if candidate.exists():
                records.append(
                    {"id": article_id, "file_id": variant, "text": read_text_file(candidate)}
                )
            else:
                raise FileNotFoundError(f"Missing file: {candidate}")

    frame = pd.DataFrame(records)
    frame = frame.sort_values(by=["id", "file_id"])
    return frame.reset_index(drop=True)


In [9]:

def build_training_df(train_dir: Path, labels_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the row-level training table: one row per text file with a binary
    label telling whether that file is the real text of its article.

    Args:
        train_dir: directory containing one sub-directory per training article.
        labels_df: DataFrame with columns 'id' and 'real_text_id'.

    Returns:
        DataFrame with columns ['id', 'file_id', 'text', 'label'], where
        label is 1 when file_id equals the article's real_text_id, else 0.

    Raises:
        ValueError: if some article has no label, or if ``labels_df`` lists
            the same id more than once (pandas raises MergeError, a
            ValueError subclass, from the validated merge).
    """
    texts_df = load_articles_as_rows(train_dir)
    # Merge labels. `validate` makes duplicated ids in the labels table fail
    # loudly instead of silently duplicating training rows.
    df = texts_df.merge(
        labels_df[['id', 'real_text_id']],
        on='id',
        how='left',
        validate='many_to_one',
    )
    unlabeled = df['real_text_id'].isna()
    if unlabeled.any():
        # Count distinct articles, not rows: every article contributes two rows.
        n_missing = df.loc[unlabeled, 'id'].nunique()
        raise ValueError(f"Labels missing for {n_missing} articles. Check labels CSV.")
    df['label'] = (df['file_id'] == df['real_text_id']).astype(int)
    return df[['id', 'file_id', 'text', 'label']]


In [10]:

def pairwise_accuracy_from_scores(df_scores: pd.DataFrame) -> float:
    """
    Fraction of articles for which the highest-scoring file is the real one.

    df_scores must have the columns ['id', 'file_id', 'score', 'real_text_id'].
    Within each id the file with the higher score is taken as the predicted
    real_text_id and compared with the true value.
    """
    # Within each article, put the best-scoring file first ...
    ranked = df_scores.sort_values(by=['id', 'score'], ascending=[True, False])
    # ... and keep only that first row per article.
    winners = ranked.groupby('id', as_index=False).first()
    hits = winners['file_id'] == winners['real_text_id']
    return hits.mean()


In [11]:

# Word-level TF-IDF features over unigrams and bigrams.
word_tfidf = TfidfVectorizer(
    # what counts as a token
    analyzer='word',
    ngram_range=(1, 2),
    # text normalization
    lowercase=True,
    strip_accents='unicode',
    # vocabulary pruning
    min_df=2,
    max_df=0.95,
    max_features=150_000,
    # term-frequency scaling
    sublinear_tf=True,
)


In [12]:

# Character-level TF-IDF features over 3- to 5-character n-grams.
char_tfidf = TfidfVectorizer(
    # what counts as a token
    analyzer='char',
    ngram_range=(3, 5),
    # text normalization
    lowercase=True,
    strip_accents='unicode',
    # vocabulary pruning
    min_df=2,
    max_df=0.95,
    max_features=100_000,
    # term-frequency scaling
    sublinear_tf=True,
)


In [13]:

# Combined feature extractor: word-level and character-level TF-IDF blocks
# are computed on the same text and concatenated.
vectorizer = FeatureUnion(
    transformer_list=[
        ('word', word_tfidf),
        ('char', char_tfidf),
    ]
)


In [14]:
# Linear SVM on the TF-IDF features. random_state is fixed so that repeated
# runs (Restart & Run All) give the same CV scores and the same submission;
# without it the solver's data shuffling is unseeded.
model = LinearSVC(C=1.0, random_state=42)

In [15]:

# Locate the labels, then build the row-level train and test tables
# (one row per text file, two rows per article).
labels_df = find_labels_csv(DATA_ROOT)
train_df = build_training_df(TRAIN_DIR, labels_df)
test_df = load_articles_as_rows(TEST_DIR)

# Sanity report: row counts followed by a preview of each table.
print(
    f"[Info] Train rows: {len(train_df)} (should be ~ 2 * number of articles)",
    f"[Info] Test rows:  {len(test_df)} (should be ~ 2 * number of test articles)",
    train_df.head(),
    test_df.head(),
    sep="\n",
)


[Info] Using labels file: /kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv
[Info] Train rows: 190 (should be ~ 2 * number of articles)
[Info] Test rows:  2136 (should be ~ 2 * number of test articles)
   id  file_id                                               text  label
0   0        1  The VIRSA (Visible Infrared Survey Telescope A...      1
1   0        2  The China relay network has released a signifi...      0
2   1        1  China\nThe goal of this project involves achie...      0
3   1        2  The project aims to achieve an accuracy level ...      1
4   2        1  Scientists can learn about how galaxies form a...      1
   id  file_id                                               text
0   0        1  "Music" Music music music Music music Music mu...
1   0        2  Since its launch on Paranal observatory's Very...
2   1        1  underground exploration on SN's birth has prov...
3   1        2  SN 1987A provides valuable insights as newer o...
4   2        1  This

In [16]:

# Raw arrays for modelling: texts, binary labels, and the article id of each
# row (used as the grouping key during cross-validation).
X_text, y, groups = (train_df[column].values for column in ('text', 'label', 'id'))


In [17]:

# 5-fold splitter that keeps rows sharing a group together; the CV loop
# passes article ids as groups, so both files of an article land in the
# same fold.
gkf = GroupKFold(n_splits=5)
# Per-fold pairwise accuracies, filled in by the CV loop.
fold_accs = []


In [18]:

# Start from an empty list every time this cell runs; otherwise re-running
# the cell keeps appending and the reported mean/std mix several runs.
fold_accs = []

print("\n[CV] Starting 5-fold GroupKFold (grouped by article id)...")
for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_text, y, groups), 1):
    X_tr, y_tr = X_text[tr_idx], y[tr_idx]
    X_va = X_text[va_idx]

    # Fit vectorizer on train fold only, so no validation text leaks into
    # the vocabulary or the IDF weights.
    X_tr_vec = vectorizer.fit_transform(X_tr)
    clf = model.fit(X_tr_vec, y_tr)

    # Scores on validation (decision_function is fine to compare): only the
    # ordering of the two files of an article matters, not a calibrated value.
    X_va_vec = vectorizer.transform(X_va)
    scores = clf.decision_function(X_va_vec)

    # Build a scoring DF with real labels for pairwise evaluation
    fold_df = train_df.iloc[va_idx].copy()
    fold_df['score'] = scores
    # attach real_text_id for pairwise accuracy (train_df itself only keeps
    # the derived binary label)
    fold_df = fold_df.merge(labels_df[['id','real_text_id']], on='id', how='left')

    acc = pairwise_accuracy_from_scores(fold_df[['id','file_id','score','real_text_id']])
    fold_accs.append(acc)
    print(f"[CV] Fold {fold}: Pairwise accuracy = {acc:.4f}")

print(f"[CV] Mean Pairwise Accuracy: {np.mean(fold_accs):.4f} ± {np.std(fold_accs):.4f}")



[CV] Starting 5-fold GroupKFold (grouped by article id)...
[CV] Fold 1: Pairwise accuracy = 0.8947
[CV] Fold 2: Pairwise accuracy = 0.9474
[CV] Fold 3: Pairwise accuracy = 1.0000
[CV] Fold 4: Pairwise accuracy = 0.8947
[CV] Fold 5: Pairwise accuracy = 0.9474
[CV] Mean Pairwise Accuracy: 0.9368 ± 0.0394


In [19]:

# Final model: refit the vectorizer and the classifier on every training row.
print("\n[Train] Fitting on full training data...")
X_vec_full = vectorizer.fit_transform(train_df['text'].values)
# NOTE: by scikit-learn convention fit() returns the estimator itself, so
# clf_full refers to the same object as `model` (and as `clf` from the CV loop).
clf_full = model.fit(X_vec_full, train_df['label'].values)



[Train] Fitting on full training data...


In [20]:

print("[Predict] Scoring test texts...")
# Vectorize with the vocabulary fitted on the full training set, then score.
X_test_vec = vectorizer.transform(test_df['text'].values)
test_scores = clf_full.decision_function(X_test_vec)

# Copy of the test table with the decision score attached to every file.
test_df_scored = test_df.assign(score=test_scores)


[Predict] Scoring test texts...


In [21]:

# For every test article keep the file with the higher decision score.
chosen = (
    test_df_scored
    .sort_values(by=['id', 'score'], ascending=[True, False])
    .groupby('id', as_index=False)
    .first()
)


In [22]:

# The selected file of each article becomes its predicted real_text_id.
submission = (
    chosen.loc[:, ['id', 'file_id']]
    .rename(columns={'file_id': 'real_text_id'})
    .sort_values('id')
)
submission.to_csv(OUTPUT_PATH, index=False)
print(f"[Done] Submission saved to: {OUTPUT_PATH}")
print(submission.head(10))


[Done] Submission saved to: /kaggle/working/submission.csv
   id  real_text_id
0   0             2
1   1             2
2   2             1
3   3             1
4   4             2
5   5             1
6   6             2
7   7             1
8   8             2
9   9             1


In [23]:
# Summary statistics of the submission; the mean of real_text_id shows how
# the predictions split between file 1 and file 2.
submission.describe()

Unnamed: 0,id,real_text_id
count,1068.0,1068.0
mean,533.5,1.48221
std,308.449348,0.499918
min,0.0,1.0
25%,266.75,1.0
50%,533.5,1.0
75%,800.25,2.0
max,1067.0,2.0
