In [1]:
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

In [2]:
# Column names shared by the train / test / sample-submission CSV files.
TARGET_COL_NAME = 'fake'   # label column
INPUT_COL_NAME = 'text'    # raw text column
INDEX_COL_NAME = 'id'      # column used as the DataFrame index when loading

# Competition data directory.
PATH_TO_DATA = Path('../input/detecting-generated-scientific-papers/')

In [3]:
def _read_indexed_csv(csv_path):
    """Load a CSV file into a DataFrame indexed by the INDEX_COL_NAME column."""
    return pd.read_csv(csv_path, index_col=INDEX_COL_NAME)


# NOTE(review): these paths are relative to the working directory and do not use
# PATH_TO_DATA from the config cell — confirm which location is intended.
train_df = _read_indexed_csv("data/train/fake_papers_train_part_public.csv")
test_df = _read_indexed_csv("data/test/fake_papers_test_public.csv")
sample_sumbission_df = _read_indexed_csv("data/sample/sample_submission.csv")

In [4]:
# Plain-text dump of the training frame; pandas truncates it to the first and
# last rows and reports the overall shape at the end.
# NOTE(review): a bare `train_df` expression would use the notebook's rich
# display instead of the text repr — consider whether print() is needed here.
print(train_df)

                                                    text  fake
id                                                            
1      Modern two-dimensional imaging is of such qual...     0
2      Background: The optimal sequence of systemic p...     1
5      This chapter opens with a discussion of the ef...     1
10     The time scale of the ultra-short-term can str...     1
23     Electronic nose or machine olfaction are syste...     1
...                                                  ...   ...
26727  In this paper, Shmoop uses statistical methods...     1
26728  In this paper, the authors examine the effects...     1
26734  The possibility of using PCI Mezzanine Cards (...     0
26743  Soil biodiversity is a keystone of the natural...     1
26747  In this chapter, Dr. Barlow and his colleagues...     1

[5350 rows x 2 columns]


In [5]:
# Peek at the first two training rows (positional slice).
train_df.iloc[:2]

Unnamed: 0_level_0,text,fake
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Modern two-dimensional imaging is of such qual...,0
2,Background: The optimal sequence of systemic p...,1


In [6]:
# Class balance of the training labels: number of rows per target value.
target_column = train_df[TARGET_COL_NAME]
target_column.value_counts()

1    3664
0    1686
Name: fake, dtype: int64

In [7]:
def _word_count(text):
    """Return the number of whitespace-separated tokens in `text`."""
    return len(text.split())


# Summary statistics of the input text length, measured in words.
train_df[INPUT_COL_NAME].map(_word_count).describe()

count    5350.000000
mean      139.928037
std        70.357633
min        50.000000
25%        99.000000
50%       116.000000
75%       168.000000
max      1649.000000
Name: text, dtype: float64

In [8]:
# Text featurizer: lowercased word uni- and bi-grams with English stop words
# removed and the vocabulary capped at 50000 features.
tfidf_transformer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    stop_words='english',
    lowercase=True,
    max_features=50000,
)

# Classifier: logistic regression with the L-BFGS solver and a fixed seed.
logreg = LogisticRegression(
    solver='lbfgs',
    C=1,
    max_iter=500,
    n_jobs=4,
    random_state=17,
)

# Chaining both steps means the TF-IDF vocabulary is re-fit on whatever data
# the pipeline is fit on.
model = Pipeline(steps=[
    ('tfidf', tfidf_transformer),
    ('logit', logreg),
])

In [9]:
# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds
# reproducible across runs.
skf = StratifiedKFold(
    random_state=17,
    shuffle=True,
    n_splits=5,
)

In [10]:
# Cross-validate the pipeline and, as a by-product, build an averaged test-set
# prediction from the models fit on each fold.
cv_f1_scores = []
# Running sum of per-fold predict_proba outputs for the test set
# (two columns, matching the binary target).
test_pred_scores = np.zeros([len(test_df), 2])
skf_split_generator = skf.split(X=train_df[INPUT_COL_NAME], y=train_df[TARGET_COL_NAME])

# tqdm wraps the split generator itself (inside enumerate) and is given the
# number of folds explicitly: a generator has no length, so without `total`
# the bar cannot show how far along the loop is.
for fold_id, (train_idx, val_idx) in enumerate(tqdm(skf_split_generator, total=skf.n_splits)):
    curr_train_df = train_df.iloc[train_idx]
    curr_val_df = train_df.iloc[val_idx]

    # Re-fit the whole pipeline (TF-IDF + classifier) on this fold's training part.
    model.fit(X=curr_train_df[INPUT_COL_NAME], y=curr_train_df[TARGET_COL_NAME])

    # making predictions for the current validation set
    curr_preds = model.predict(X=curr_val_df[INPUT_COL_NAME])
    curr_f1 = f1_score(y_true=curr_val_df[TARGET_COL_NAME], y_pred=curr_preds)
    cv_f1_scores.append(curr_f1)
    print(f"F1-score for fold {fold_id} is {curr_f1:.3}.")

    # making predictions for the test set
    curr_test_pred_scores = model.predict_proba(X=test_df[INPUT_COL_NAME])
    test_pred_scores += curr_test_pred_scores

print(f'Average cross-validation F1-score is {np.mean(cv_f1_scores):.3} +/- {np.std(cv_f1_scores):.3}.')
# Turn the accumulated sum into the mean probability over folds.
test_pred_scores /= skf.n_splits

0it [00:00, ?it/s]

F1-score for fold 0 is 0.877.
F1-score for fold 1 is 0.878.
F1-score for fold 2 is 0.874.
F1-score for fold 3 is 0.872.
F1-score for fold 4 is 0.869.
Average cross-validation F1-score is 0.874 +/- 0.00322.


In [11]:
# Decision threshold on the averaged probability; this can be tuned via cross-validation.
THRESHOLD = 0.5

# Column 1 of the averaged scores is compared against the threshold to obtain 0/1 labels.
positive_class_scores = test_pred_scores[:, 1]
test_preds = (positive_class_scores >= THRESHOLD).astype('uint8')

In [12]:
# Build the submission from a copy of the sample file with the target column
# replaced by the thresholded predictions, then write it to disk.
# NOTE(review): predictions are assigned by position — this assumes the sample
# submission rows are in the same order as test_df; confirm.
subm_df = sample_sumbission_df.assign(**{TARGET_COL_NAME: test_preds})
subm_df.to_csv('submission.csv')