In [None]:
# Install Optuna, which this notebook uses for the QWK threshold search.
! pip install optuna

In [None]:
# Files: mount Google Drive and change into the project directory so that
# the relative paths used later in the notebook resolve against it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/grad_comp

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/grad_comp


In [None]:
# Import libraries

import numbers
import numpy as np
import pandas as pd

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from functools import partial
import scipy as sp
def adapt_labels(labels):
  """Shift labels between the two scales used in this notebook.

  The direction is inferred from the largest value present:
    * max == 2 -> add 2 to every label
    * max == 4 -> subtract 2 from every label

  The input is never modified; a new array is returned. Any array-like
  (including a plain list) is accepted.

  Args:
    labels: array-like of numeric labels.

  Returns:
    np.ndarray with the shifted labels. If the maximum is neither 2 nor 4 a
    message is printed and the labels are returned unshifted.

  NOTE(review): the scale is detected from the maximum only, so a batch whose
  largest value happens to be 2 is always shifted up — confirm callers pass
  batches that contain the top label.
  """
  labels = np.asarray(labels)
  max_label = np.max(labels) # 2 or 4

  if max_label == 2:
    # `labels + 2` allocates a new array instead of mutating the caller's data.
    return labels + 2
  if max_label == 4:
    return labels - 2

  print('Exception error! please check with np.unique(labels).')
  return labels

In [None]:
# data
# preprocess path
prepro = 'sudachi/A'
# n_features = 10000


def _read_lines(path, n_lines):
  """Read `path`, split it on newlines and drop the entry at index `n_lines`.

  The file is opened in a `with` block so the handle is always closed.
  Deleting index `n_lines` removes the element that follows the first
  `n_lines` lines (presumably the empty string left by a trailing newline —
  TODO confirm). An IndexError is raised if the file has fewer entries.
  """
  with open(path, 'r') as f:
    lines = f.read().split('\n')
  del lines[n_lines]
  return lines


train_data = _read_lines('preprocess/' + prepro + '/text.prep_train.txt', 30000)
#x_train = np.zeros((30000, len(res)))

test_data = _read_lines('preprocess/' + prepro + '/text.prep_test.txt', 2500)
#x_test = np.zeros((2500, len(res)))

dev_data = _read_lines('preprocess/' + prepro + '/text.prep_dev.txt', 2500)
#x_dev = np.zeros((2500, len(res)))

# label
y_train = np.loadtxt('/content/drive/MyDrive/grad_comp/data/label.train.txt')
y_dev = np.loadtxt('/content/drive/MyDrive/grad_comp/data/label.dev.txt')
# pseudo_data

# Labels loaded from the pseudo-label file are appended to the training
# labels, and the test texts are appended to the training texts in the same
# order, so the two concatenated arrays stay aligned.
pseudo_data = np.loadtxt('/content/drive/MyDrive/grad_comp/pseudo/C5--bipseudo-3e-4-491.txt')
train_data = np.concatenate([train_data,test_data])
y_train = np.concatenate([y_train, pseudo_data])

# Shift the labels onto the other scale (direction chosen by adapt_labels).
y_train = adapt_labels(y_train)
y_dev = adapt_labels(y_dev)

print(train_data.shape,y_train.shape)

(32500,) (32500,)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the TF-IDF vocabulary and weights from train_data, then project the
# dev and test texts into that same feature space.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(train_data)
x_dev, x_test = (vectorizer.transform(docs) for docs in (dev_data, test_data))

In [None]:
import optuna
import pandas as pd

def cal_qwk(y_true, y_pred):
    """Return Cohen's kappa with quadratic weights for the two label sequences."""
    qwk = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return qwk

class OptunaRounder:
    """Optuna objective that tunes the cut points used to discretise predictions.

    An instance is called with an Optuna trial; it proposes
    ``len(labels) - 1`` non-decreasing thresholds, bins ``y_pred`` with them
    and returns the quadratic-weighted kappa against ``y_true``.
    """

    def __init__(self, y_true, y_pred):
        # Ground-truth labels
        self.y_true = y_true
        # Predictions that will be binned into labels
        self.y_pred = y_pred
        # Distinct label values (np.unique returns them sorted ascending)
        self.labels = np.unique(y_true)

    def __call__(self, trial):
        """Sample one set of thresholds from `trial` and return its QWK."""
        # Thresholds are added define-by-run, one parameter per cut point.
        thresholds = []
        # Upper bound of every threshold: the largest label.
        high = max(self.labels)
        # The number of thresholds needed is (number of labels - 1).
        for i in range(len(self.labels) - 1):
            # Lower bound: the previous threshold (the largest so far, since
            # each one is sampled at or above its predecessor) or, for the
            # first threshold, the smallest label.
            low = thresholds[-1] if thresholds else min(self.labels)
            # suggest_float replaces the deprecated suggest_uniform.
            thresholds.append(trial.suggest_float(f't{i}', low, high))

        # Score the candidate thresholds with QWK.
        opt_y_pred = self.adjust(self.y_pred, thresholds)
        return cal_qwk(self.y_true, opt_y_pred)

    def adjust(self, y_pred, thresholds):
        """Bin `y_pred` into `self.labels` using ascending `thresholds`.

        Args:
            y_pred: predictions to discretise.
            thresholds: ascending cut points, one fewer than the number of
                labels. Any sequence is accepted (list, tuple, ndarray).

        Returns:
            The result of ``pd.cut`` with ``self.labels`` as bin labels.
            pd.cut's default right-closed bins apply, so a value equal to a
            threshold falls in the lower bin.
        """
        # list(...) keeps the concatenation valid for tuples and arrays.
        edges = [-np.inf] + list(thresholds) + [np.inf]
        return pd.cut(y_pred, edges, labels=self.labels)

In [None]:
from sklearn.metrics import cohen_kappa_score
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
import numpy as np

# Grid-search the regularisation strength C, scoring each candidate by
# quadratic-weighted kappa on the dev set.
best_qwk, best_c = 0, 1
for c in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1):
    model = LinearSVR(C=c, random_state=0).fit(x_train, y_train)
    # Clamp the regression output to 0..4 and round it to whole numbers.
    y_pred = np.round(np.clip(model.predict(x_dev), 0, 4))
    _qwk = cohen_kappa_score(y_pred, y_dev, weights='quadratic')
    # Strict comparison: on a tie the earlier (smaller) C is kept.
    if _qwk > best_qwk:
        best_qwk, best_c = _qwk, c
    print("QWK = %.3f  C = %s" % (_qwk, str(c)))
print("最適なハイパーパラメタは C = %s" % str(best_c))

QWK = 0.309  C = 0.1
QWK = 0.359  C = 0.2
QWK = 0.373  C = 0.3
QWK = 0.378  C = 0.4
QWK = 0.382  C = 0.5
QWK = 0.387  C = 0.6
QWK = 0.391  C = 0.7
QWK = 0.392  C = 0.8
QWK = 0.392  C = 0.9
QWK = 0.391  C = 1
最適なハイパーパラメタは C = 0.9


In [None]:
# Refit with the selected C and produce clipped, rounded dev predictions.
model = LinearSVR(C=best_c, random_state=0).fit(x_train, y_train)
y_pred = np.round(np.clip(model.predict(x_dev), 0, 4))

In [None]:
# Optimise the QWK thresholds with Optuna.
# The objective receives the raw (continuous) regression outputs, not the
# clipped-and-rounded `y_pred`. The tuned thresholds are later applied to raw
# predictions, so they must be searched on the same kind of values. On
# already-rounded integers every cut point between two integers scores the
# same, which makes the result arbitrary.
objective = OptunaRounder(y_dev, model.predict(x_dev))

study = optuna.create_study(direction='maximize')
# Run trials until the 200-second time budget is exhausted.
study.optimize(objective, timeout=200)

In [None]:
# Pull the best trial out of the study: its score and its thresholds in
# ascending order.
best_score = study.best_value
best_thresholds = sorted(study.best_params.values())

print(best_thresholds, best_score, sep='\n')

[1.4313562776985096, 1.5658335744920155, 2.123602544546043, 2.264890179183845]
0.4178975070694815


In [None]:
# Fit with the selected C and keep the raw (unrounded) dev predictions.
model = LinearSVR(C=best_c, random_state=0).fit(x_train, y_train)
dev_pred = model.predict(x_dev)

In [None]:
# Discretise the raw dev predictions with the tuned thresholds.
dev_pred_qwk = objective.adjust(dev_pred, best_thresholds)
print(dev_pred_qwk)

In [None]:
# Evaluate on the dev set: QWK from the C search vs. QWK after thresholding.
dev_qwk = cal_qwk(y_dev, dev_pred_qwk)
print(best_qwk, dev_qwk)

# Inference

In [None]:
# Build the file-name tag by dropping the second-to-last character of
# `prepro` (for 'sudachi/A' that is the '/', giving 'sudachiA').
prepro_ = prepro[:-2] + prepro[-1]

# Fit with the selected C, predict the test set and discretise the raw
# predictions with the tuned thresholds.
model = LinearSVR(C=best_c, random_state=0)
model.fit(x_train,y_train)
test_pred = model.predict(x_test)
test_pred_qwk = objective.adjust(test_pred, best_thresholds)
print(test_pred_qwk)

# Write one label per line, subtracting 2 from each discretised label.
# A generator variable is used so the notebook-level `y_pred` array is not
# overwritten by a string.
with open('optuna_pse-svr_eval_' + prepro_ + '.txt','w') as f:
  f.writelines(str(int(label) - 2) + '\n' for label in test_pred_qwk)