<a href="https://colab.research.google.com/github/IMOKURI/signate-471/blob/main/%E5%8C%BB%E5%AD%A6%E8%AB%96%E6%96%87%E3%81%AE%E8%87%AA%E5%8B%95%E4%BB%95%E5%88%86%E3%81%91%E3%83%81%E3%83%A3%E3%83%AC%E3%83%B3%E3%82%B8_%E3%83%99%E3%83%BC%E3%82%B9%E3%83%A9%E3%82%A4%E3%83%B32.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 医学論文の自動仕分けチャレンジ ベースライン



## 前提

- Google Colab で動かすことを想定します。

## 事前準備

- Google Drive の `マイドライブ/Datasets/signate-471` フォルダに、本コンペのデータ (`train.csv`, `test.csv`, `sample_submit.csv`) を配置してください。

## アウトプット

- `oof_df.csv`: 交差検証でのモデルの出力などが入っています。
- `submission.csv`: 提出用のファイルです。

## スコア

- CV (交差検証): 
- LB (リーダーボード): 

## 謝辞

この notebook は Kaggle で [@ruchi798](https://www.kaggle.com/ruchi798) さんが別コンペで公開されていた notebook をベースにしています。

## データをドライブからコピー

In [None]:
import os
import sys

# Mount Google Drive at /gdrive so the competition files stored there are reachable.
from google.colab import drive
drive.mount('/gdrive')

# Copy the three competition CSVs from Drive into the current working directory.
!cp /gdrive/MyDrive/Datasets/signate-471/train.csv .
!cp /gdrive/MyDrive/Datasets/signate-471/test.csv .
!cp /gdrive/MyDrive/Datasets/signate-471/sample_submit.csv .

Mounted at /gdrive


## ライブラリ インストール、インポート

In [None]:
# Install textstat (imported in the next cell); -q runs pip in quiet mode.
!pip install -q textstat

[K     |████████████████████████████████| 99 kB 3.5 MB/s 
[K     |████████████████████████████████| 2.0 MB 18.5 MB/s 
[?25h

In [None]:
import math
import random
import re
import time
import warnings

import nltk
import numpy as np
import pandas as pd
import textstat
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import fbeta_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [None]:
# DATA_DIR: directory the input CSVs are read from.
# OUTPUT_DIR: directory the training log and the submission file are written to.
DATA_DIR = "./"
OUTPUT_DIR = "./"

In [None]:
# Suppress all Python warnings from this point on.
warnings.filterwarnings("ignore")

In [None]:
# Select the CUDA device when PyTorch reports one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# NLTK resources needed by preprocess(): the English stop-word list, the
# tokenizer models used by nltk.word_tokenize, and WordNet for lemmatization.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt'; fetching both keeps the notebook runnable on old and new versions.
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
def init_logger(log_file=OUTPUT_DIR + "train.log"):
    """Return the notebook logger, writing bare messages to the console and a file.

    The logger is fetched with ``getLogger(__name__)``, so every call returns
    the same object. Handlers left over from an earlier call (for example when
    this cell is re-run) are removed first; otherwise each message would be
    emitted once per accumulated handler.

    Args:
        log_file: Path of the log file. Defaults to ``train.log`` inside
            ``OUTPUT_DIR`` (evaluated when the function is defined).

    Returns:
        The configured ``logging.Logger`` at level INFO.
    """
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Drop and close handlers from a previous invocation to avoid duplicate
    # output lines and leaked file handles.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    # Console handler first, then file handler; both print the message only.
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

LOGGER = init_logger()

In [None]:
def seed_torch(seed=42):
    """Seed the random number generators used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable, seeds Python's
    ``random``, NumPy, and PyTorch (CPU, the current CUDA device and all CUDA
    devices), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True


# Notebook-wide seed, also used for the cross-validation split.
seed = 471
seed_torch(seed)

## データ読み込み

In [None]:
# Load the training set, the test set and the sample submission from DATA_DIR.
train = pd.read_csv(DATA_DIR + "train.csv")
test = pd.read_csv(DATA_DIR + "test.csv")
# The sample submission is read without a header row, so its two columns are named here.
sub = pd.read_csv(DATA_DIR + "sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]

In [None]:
# Decision threshold: model outputs below this value are mapped to 0, all others to 1.
# It is the fraction of training rows whose judgement equals 1.
border = len(train[train["judgement"] == 1]) / len(train["judgement"])
print(border)

0.023282372444280715


## 前処理

In [None]:
def preprocess(data):
    """Normalize raw English text into space-joined, lemmatized tokens.

    For each document: every character that is not an ASCII letter is replaced
    by a space, the text is lower-cased, tokenized with ``nltk.word_tokenize``,
    filtered against NLTK's English stop-word list, lemmatized with
    ``nltk.WordNetLemmatizer`` and joined back with single spaces.

    Args:
        data: Iterable of strings (e.g. a pandas Series of documents).

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    # Build these once. Rebuilding the stop-word set for every token and the
    # lemmatizer for every document is what made this step take so long.
    stop_words = set(nltk.corpus.stopwords.words("english"))
    lemmatizer = nltk.WordNetLemmatizer()
    non_letters = re.compile("[^a-zA-Z]")

    title_abstract = []
    for e in data:

        # Replace everything except ASCII letters with a space, then lower-case.
        e = non_letters.sub(" ", e).lower()

        # Split into tokens.
        tokens = nltk.word_tokenize(e)

        # Drop stop words and lemmatize what remains.
        lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

        title_abstract.append(" ".join(lemmas))

    return title_abstract

In [None]:
def get_train_data(train):
    """Add fold numbers and cleaned text columns to the training frame.

    The frame is modified in place and also returned. Added columns:
    ``fold`` (uint8, 0-4, stratified on ``judgement``), ``title_abstract``
    (title and abstract joined by a space) and ``preprocessed_title_abstract``
    (output of ``preprocess``).

    Args:
        train: DataFrame with ``title``, ``abstract`` and ``judgement`` columns.

    Returns:
        The same DataFrame object with the extra columns.
    """
    # Assign a cross-validation fold number to every row.
    # split() yields positional indices, so they are collected in an array and
    # assigned as a whole column; writing them through label-based .loc is only
    # correct when the frame happens to have a default RangeIndex.
    Fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    folds = np.zeros(len(train), dtype=np.uint8)
    for n, (_, val_index) in enumerate(Fold.split(train, train["judgement"])):
        folds[val_index] = n
    train["fold"] = folds

    # Fill NaN with empty strings so the text columns can be joined.
    train.fillna("", inplace=True)

    # Concatenate title and abstract into one text field.
    train["title_abstract"] = train[["title", "abstract"]].agg(" ".join, axis=1)

    train["preprocessed_title_abstract"] = preprocess(train["title_abstract"])

    return train

In [None]:
def get_test_data(test):
    """Add the joined and the preprocessed text columns to the test frame.

    The frame is modified in place and also returned. ``title_abstract`` holds
    title and abstract joined by a space; ``preprocessed_title_abstract`` holds
    the output of ``preprocess`` for that text.

    Args:
        test: DataFrame with ``title`` and ``abstract`` columns.

    Returns:
        The same DataFrame object with the two extra columns.
    """
    # Missing values become empty strings so the join below works on every row.
    test.fillna("", inplace=True)

    # Title and abstract are merged into a single text field per row.
    text_columns = ["title", "abstract"]
    joined_text = test[text_columns].agg(" ".join, axis=1)
    test["title_abstract"] = joined_text

    cleaned_text = preprocess(test["title_abstract"])
    test["preprocessed_title_abstract"] = cleaned_text

    return test

In [None]:
if os.path.exists("/gdrive/MyDrive/Datasets/signate-471/preprocessed_train.csv"):
    !cp -f /gdrive/MyDrive/Datasets/signate-471/preprocessed_train.csv .
    train = pd.read_csv("preprocessed_train.csv")

else:
    # 時間がかかる(30分くらい？)ので、一度、前処理したものは保存しておきます。
    train = get_train_data(train)
    train.to_csv("preprocessed_train.csv")
    !cp -f preprocessed_train.csv /gdrive/MyDrive/Datasets/signate-471/

In [None]:
if os.path.exists("/gdrive/MyDrive/Datasets/signate-471/preprocessed_test.csv"):
    !cp -f /gdrive/MyDrive/Datasets/signate-471/preprocessed_test.csv .
    test = pd.read_csv("preprocessed_test.csv")

else:
    # 時間がかかる(30分くらい？)ので、一度、前処理したものは保存しておきます。
    test = get_test_data(test)
    test.to_csv("preprocessed_test.csv")
    !cp -f preprocessed_test.csv /gdrive/MyDrive/Datasets/signate-471/

## 学習

In [None]:
def training(model, X_train, y_train, X_test, y_test, model_name):
    """Fit a TF-IDF + ``model`` pipeline and log its hold-out MSE and elapsed time.

    The logged time covers fitting, predicting on ``X_test`` and computing the
    error. Nothing is returned; results go to ``LOGGER`` only.

    Args:
        model: Estimator placed after the TF-IDF vectorizer in the pipeline.
        X_train: Training documents.
        y_train: Training targets.
        X_test: Hold-out documents.
        y_test: Hold-out targets.
        model_name: Label used in the log output.
    """
    started_at = time.time()

    vectorizer = TfidfVectorizer(binary=True, ngram_range=(1,1))
    pipeline = make_pipeline(vectorizer, model)
    pipeline.fit(X_train, y_train)
    holdout_mse = mean_squared_error(y_test, pipeline.predict(X_test))

    elapsed = time.time() - started_at

    report = (
        f"--- Model: {model_name} ---",
        f"MSE: {holdout_mse:.5f}",
        f"Training time: {elapsed:.5f}",
    )
    for line in report:
        LOGGER.info(line)

In [None]:
# Candidate regressors compared on a hold-out split.
# The `normalize` argument has been removed from scikit-learn's linear models
# and passing it raises TypeError; False was its default, so dropping it keeps
# the same model.
ridge = Ridge(fit_intercept=True)
lr = LinearRegression()
m = [ridge, lr]
mn = ["Ridge Regression", "Linear Regression"]

X = train["preprocessed_title_abstract"]
y = train['judgement']

# Hold out a third of the data; reuse the notebook-wide seed (471) for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

for i in range(len(m)):
    training(model=m[i], X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, model_name=mn[i])

--- Model: Ridge Regression ---
MSE: 0.01500
Training time: 3.52279
--- Model: Linear Regression ---
MSE: 0.16198
Training time: 47.18153


## 推論

In [None]:
def training_all(model, X, y, X_test=None):
    """Fit a TF-IDF + ``model`` pipeline on all of ``X`` and predict the test documents.

    Args:
        model: Estimator placed after the TF-IDF vectorizer in the pipeline.
        X: Training documents.
        y: Training targets.
        X_test: Documents to predict. When omitted, the
            ``preprocessed_title_abstract`` column of the notebook-level
            ``test`` frame is used, as before.

    Returns:
        The pipeline's predictions for the test documents.
    """
    if X_test is None:
        # Backward-compatible default: fall back to the global test frame.
        X_test = test["preprocessed_title_abstract"]

    pipeline = make_pipeline(
        TfidfVectorizer(binary=True, ngram_range=(1,1)),
        model,
    )
    pipeline.fit(X, y)

    return pipeline.predict(X_test)

In [None]:
# Train Ridge on the full training set and predict the test documents.
predictions = training_all(ridge, X, y)
# Binarize the regression output: values below `border` become 0, the rest 1.
predictions = np.where(predictions < border, 0, 1)
sub["judgement"] = predictions
# Write the submission without index or header row.
sub.to_csv(OUTPUT_DIR + "submission.csv", index=False, header=False)

Unnamed: 0,id,judgement
0,27145,1
1,27146,1
2,27147,0
3,27148,0
4,27149,0
...,...,...
40829,67974,0
40830,67975,0
40831,67976,1
40832,67977,0


In [None]:
# Show how many test rows were predicted as each class.
sub["judgement"].value_counts()

0    25401
1    15433
Name: judgement, dtype: int64