In [1]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.8-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.8-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.1/239.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading cmudict-1.1.1-py3-none-any.whl (939 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.7/939.7 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: pyphen, cmudict, textstat
Successfully installed cmudict-1.1.1 pyphen-0.17.2 textstat-0.7.8


In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout
from transformers import AutoTokenizer, AutoModel
import torch
import fasttext
from textstat import flesch_reading_ease
import unicodedata
import string
from sklearn.model_selection import KFold
import warnings

2025-07-31 12:30:31.121673: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753965031.383137      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753965031.452335      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
warnings.filterwarnings('ignore')

In [4]:
# Set random seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7af8f5288cd0>

In [5]:
# --- Data Loading ---
def read_texts_from_dir(dir_path):
    """
    Reads text pairs from a directory and returns a DataFrame with columns ['id', 'file_1', 'file_2'].
    """
    dir_count = sum(os.path.isdir(os.path.join(root, d)) for root, dirs, _ in os.walk(dir_path) for d in dirs)
    data = [0] * dir_count
    i = 0
    for folder_name in sorted(os.listdir(dir_path)):
        folder_path = os.path.join(dir_path, folder_name)
        if os.path.isdir(folder_path):
            try:
                with open(os.path.join(folder_path, 'file_1.txt'), 'r', encoding='utf-8') as f1:
                    text1 = f1.read().strip()
                with open(os.path.join(folder_path, 'file_2.txt'), 'r', encoding='utf-8') as f2:
                    text2 = f2.read().strip()
                index = int(folder_name[-4:])
                data[i] = (index, text1, text2)
                i += 1
            except Exception as e:
                print(f"Error reading directory {folder_name}: {e}")
    df = pd.DataFrame(data, columns=['id', 'file_1', 'file_2']).set_index('id')
    return df

In [6]:
# Load train and test data
train_path = "/kaggle/input/fake-or-real-the-impostor-hunt/data/train"
test_path = "/kaggle/input/fake-or-real-the-impostor-hunt/data/test"
df_train = read_texts_from_dir(train_path)
df_test = read_texts_from_dir(test_path)
df_train_gt = pd.read_csv("/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv")

In [7]:
# --- Feature Engineering ---
def clean_text(text):
    """Clean text by removing punctuation and normalizing."""
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    delete = str.maketrans('', '', string.punctuation + '\n')
    return text.translate(delete)

def get_text_stats(text):
    """Compute text statistics (length and readability)."""
    return {
        'length': len(text.split()),
        'readability': flesch_reading_ease(text)
    }

In [8]:
# BERT Embeddings
def get_bert_embeddings(texts, model_name='bert-base-uncased', max_length=512):
    """Generate BERT embeddings for a list of texts."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors='pt', max_length=max_length, truncation=True, padding=True)
            outputs = model(**inputs)
            emb = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            embeddings.append(emb)
    return np.array(embeddings)

In [9]:
import urllib.request

def download_fasttext_model(model_path='/kaggle/working/cc.en.300.bin'):
    """Download FastText English model if not already present."""
    if not os.path.exists(model_path):
        print("Downloading FastText model (compressed)...")
        url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz'
        gz_path = model_path + '.gz'
        urllib.request.urlretrieve(url, gz_path)

        # Extract .gz
        import gzip
        import shutil
        with gzip.open(gz_path, 'rb') as f_in:
            with open(model_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.remove(gz_path)
        print("Download complete.")
    return model_path

def get_fasttext_embeddings(texts, model_path='/kaggle/working/cc.en.300.bin'):
    """Generate FastText embeddings for a list of texts."""
    model_path = download_fasttext_model(model_path)  # this ensures the model is downloaded to the correct path
    model = fasttext.load_model(model_path)
    embeddings = []
    for text in texts:
        words = clean_text(text).split()
        word_vectors = [model.get_word_vector(word) for word in words if word]
        emb = np.mean(word_vectors, axis=0) if word_vectors else np.zeros(300)
        embeddings.append(emb)
    return np.array(embeddings)

In [10]:
# Prepare features
def prepare_features(df, use_bert=True, use_fasttext=True):
    """Extract features (BERT/FastText embeddings + text stats) for text pairs."""
    features = []
    for idx in df.index:
        text1, text2 = df.loc[idx, 'file_1'], df.loc[idx, 'file_2']
        text1_clean, text2_clean = clean_text(text1), clean_text(text2)
        
        # Text statistics
        stats1 = get_text_stats(text1_clean)
        stats2 = get_text_stats(text2_clean)
        stats_diff = np.array([stats1['length'] - stats2['length'], stats1['readability'] - stats2['readability']])
        
        # Embeddings
        if use_bert:
            bert_emb = get_bert_embeddings([text1, text2])
            bert_diff = bert_emb[0] - bert_emb[1]
        else:
            bert_diff = np.zeros(768)
        
        if use_fasttext:
            fasttext_emb = get_fasttext_embeddings([text1, text2])
            fasttext_diff = fasttext_emb[0] - fasttext_emb[1]
        else:
            fasttext_diff = np.zeros(300)
        
        # Combine features
        feature_vector = np.concatenate([bert_diff, fasttext_diff, stats_diff])
        features.append(feature_vector)
    return np.array(features)

In [11]:
# --- CNN Model ---
def build_cnn_model(input_dim):
    """Build a 1D CNN model for text classification."""
    model = Sequential([
        Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(input_dim, 1)),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [12]:
# --- Model Training and Comparison ---
def train_and_evaluate(X, y, model_type, input_dim=None):
    """Train a model and evaluate using cross-validation."""
    if model_type == 'xgboost':
        model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
        scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
        model.fit(X, y)
        return model, scores.mean()
    
    elif model_type == 'mlp':
        model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42)
        scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
        model.fit(X, y)
        return model, scores.mean()
    
    elif model_type == 'cnn':
        X_reshaped = X.reshape(X.shape[0], X.shape[1], 1)
        model = build_cnn_model(input_dim)
        scores = []
        for train_idx, val_idx in KFold(n_splits=5, shuffle=True, random_state=42).split(X):
            X_train, X_val = X_reshaped[train_idx], X_reshaped[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
            _, acc = model.evaluate(X_val, y_val, verbose=0)
            scores.append(acc)
        model.fit(X_reshaped, y, epochs=10, batch_size=32, verbose=0)
        return model, np.mean(scores)

In [13]:
# Prepare training data
X_train_bert = prepare_features(df_train, use_bert=True, use_fasttext=False)
X_train_fasttext = prepare_features(df_train, use_bert=False, use_fasttext=True)
X_train_combined = prepare_features(df_train, use_bert=True, use_fasttext=True)
y_train = (df_train_gt['real_text_id'] == 1).astype(int)  # 1 if file_1 is real, 0 if file_2 is real

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading FastText model (compressed)...
Download complete.


In [14]:
# Train and compare models
print("Evaluating XGBoost with BERT features...")
xgb_bert_model, xgb_bert_score = train_and_evaluate(X_train_bert, y_train, 'xgboost')
print("Evaluating XGBoost with FastText features...")
xgb_fasttext_model, xgb_fasttext_score = train_and_evaluate(X_train_fasttext, y_train, 'xgboost')
print("Evaluating XGBoost with combined features...")
xgb_combined_model, xgb_combined_score = train_and_evaluate(X_train_combined, y_train, 'xgboost')

print("Evaluating MLP with BERT features...")
mlp_bert_model, mlp_bert_score = train_and_evaluate(X_train_bert, y_train, 'mlp')
print("Evaluating MLP with FastText features...")
mlp_fasttext_model, mlp_fasttext_score = train_and_evaluate(X_train_fasttext, y_train, 'mlp')
print("Evaluating MLP with combined features...")
mlp_combined_model, mlp_combined_score = train_and_evaluate(X_train_combined, y_train, 'mlp')

print("Evaluating CNN with BERT features...")
cnn_bert_model, cnn_bert_score = train_and_evaluate(X_train_bert, y_train, 'cnn', input_dim=X_train_bert.shape[1])
print("Evaluating CNN with FastText features...")
cnn_fasttext_model, cnn_fasttext_score = train_and_evaluate(X_train_fasttext, y_train, 'cnn', input_dim=X_train_fasttext.shape[1])
print("Evaluating CNN with combined features...")
cnn_combined_model, cnn_combined_score = train_and_evaluate(X_train_combined, y_train, 'cnn', input_dim=X_train_combined.shape[1])

Evaluating XGBoost with BERT features...
Evaluating XGBoost with FastText features...
Evaluating XGBoost with combined features...
Evaluating MLP with BERT features...
Evaluating MLP with FastText features...
Evaluating MLP with combined features...
Evaluating CNN with BERT features...


2025-07-31 13:09:27.327409: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Evaluating CNN with FastText features...
Evaluating CNN with combined features...


In [15]:
# Select best model
scores = {
    'xgb_bert': xgb_bert_score, 'xgb_fasttext': xgb_fasttext_score, 'xgb_combined': xgb_combined_score,
    'mlp_bert': mlp_bert_score, 'mlp_fasttext': mlp_fasttext_score, 'mlp_combined': mlp_combined_score,
    'cnn_bert': cnn_bert_score, 'cnn_fasttext': cnn_fasttext_score, 'cnn_combined': cnn_combined_score
}
best_model_name = max(scores, key=scores.get)
best_score = scores[best_model_name]
best_model = locals()[best_model_name + '_model']
best_features = X_train_bert if 'bert' in best_model_name else X_train_fasttext if 'fasttext' in best_model_name else X_train_combined
print(f"Selected model: {best_model_name} with Cross-Validation Accuracy: {best_score:.4f}")

Selected model: xgb_combined with Cross-Validation Accuracy: 0.9368


In [16]:
# --- Predictions on Test Set ---
X_test = prepare_features(df_test, use_bert=('bert' in best_model_name or 'combined' in best_model_name), 
                         use_fasttext=('fasttext' in best_model_name or 'combined' in best_model_name))
if 'cnn' in best_model_name:
    X_test_reshaped = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
    predictions = (best_model.predict(X_test_reshaped) > 0.5).astype(int)
else:
    predictions = best_model.predict(X_test)
predictions = [1 if pred == 1 else 2 for pred in predictions]  # Convert to 1 (file_1) or 2 (file_2)

In [17]:
# --- Submission ---
output_df = pd.DataFrame({'id': df_test.index, 'real_text_id': predictions})
output_df.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created successfully!")

Submission file 'submission.csv' created successfully!
