

1.   train・dev・testのコーパスを目視で整える
2.   neologdnで正規化し、sudachiで単語分割する
3.   単語分割したものをTfidfでベクトル化する
4.   LightGBMとCatBoostとSVRで学習し、学習結果をファイルに出力
5.  出力したファイルを、重みづけしながらアンサンブルを行う




In [None]:
import numpy
numpy.set_printoptions(threshold=numpy.inf)

In [None]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.feature_extraction.text import TfidfVectorizer
import random
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [None]:
def load_data(openfile):
    with open(openfile, 'r') as f:
        text = f.read().split("\n")
        text = text[:-1] # train_textの最後に''があるため削除
    return text

In [None]:
path = "/content/drive/MyDrive/研究室コンペ/self_data2/"

# データの読み込み
train_text = load_data( path + "text.train.txt") # 訓練用 30000
dev_text = load_data( path + "text.dev.txt") # 検証用 2500
test_text = load_data( path + "text.test.txt") # 提出用 2500

# ラベルの読み込み
train_label = load_data( path + "label.train.txt")
dev_label = load_data( path + "label.dev.txt")

In [None]:
# 顔文字除去X
train_text_demoji = train_text
test_text_demoji = test_text
dev_text_demoji = dev_text

In [None]:
#sudachiをインストール
! pip install sudachipy
! pip install sudachidict_full
! pip install sudachidict_small

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sudachipy
  Downloading SudachiPy-0.6.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sudachipy
Successfully installed sudachipy-0.6.6
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sudachidict_full
  Downloading SudachiDict-full-20221021.tar.gz (9.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sudachidict_full
  Building wheel for sudachidict_full (setup.py) ... [?25l[?25hdone
  Created wheel for sudachidict_full: filename=SudachiDict_full-20221021-py3-none-any.whl size=126781795 sha256=006893775940f1a69f4b19540b8e980276d43342d37be9ed5c6609ed32a056ce


In [None]:
#sudachiによる正規化を行う関数を定義
from sudachipy import Dictionary
from sudachipy import SplitMode
tokenizer = Dictionary(dict="small").create()

def sudachi(text):
    after = list()
    for token in tokenizer.tokenize(text, SplitMode.C):
        word = token.normalized_form() # 正規化あり
        pos = " ".join(token.part_of_speech())
        
        if word.isnumeric():
            word = '0'

        after.append(word)

    return after

In [None]:
! pip install neologdn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting neologdn
  Downloading neologdn-0.5.1.tar.gz (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 KB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: neologdn
  Building wheel for neologdn (setup.py) ... [?25l[?25hdone
  Created wheel for neologdn: filename=neologdn-0.5.1-cp38-cp38-linux_x86_64.whl size=178326 sha256=a556e8daf6c6759af30b1ae60a7108dd57e27c9db386c57bc4790953f013f7a0
  Stored in directory: /root/.cache/pip/wheels/3e/db/10/b3b26caa63c5da86ea3a25043cc4379a66bb3dd30d6f060a37
Successfully built neologdn
Installing collected packages: neologdn
Successfully installed neologdn-0.5.1


In [None]:
# 単語分割
import neologdn

train_tokenize = [] 
dev_tokenize = []
test_tokenize = []

def tokenize(infile, outfile):
    for i in range(len(infile)):
        outfile.append(sudachi(neologdn.normalize(infile[i])))    #正規化あり

tokenize(train_text_demoji, train_tokenize)
tokenize(dev_text_demoji, dev_tokenize)
tokenize(test_text_demoji, test_tokenize)

In [None]:
def writefile(infile, outfile):
    with open(outfile, 'w') as f:
        for i, wordlist in enumerate(infile):
            f.write(" ".join([str(word) for word in wordlist]) + '\n')

writefile(train_tokenize, "train.txt")
writefile(dev_tokenize, "dev.txt")
writefile(test_tokenize, "test.txt")

In [None]:
# Tokenizeしたデータを読み込み
with open("train.txt", 'r') as f:
    traintext = f.read().split("\n")
    traintext = traintext[:-1]
with open("test.txt", 'r') as f:
    testtext = f.read().split("\n")
    testtext = testtext[:-1]
with open("dev.txt", 'r') as f:
    devtext = f.read().split("\n")
    devtext = devtext[:-1]

vectorizer = TfidfVectorizer(smooth_idf=True, analyzer='char', norm='l1') 

# tfidfでベクトル化
x_train = vectorizer.fit_transform(traintext)
x_test = vectorizer.transform(testtext)
x_dev = vectorizer.transform(devtext)

In [None]:
print("x_trainの形状：", x_train.shape)
print("x_devの形状：", x_dev.shape)
print("x_testの形状：", x_test.shape)

y_train =  np.array(list(map (int, train_label)))
y_dev =  np.array(list(map (int, dev_label)))
print("y_trainの形状：", y_train.shape)    
print("y_devの形状：", y_dev.shape)

x_trainの形状： (30000, 3231)
x_devの形状： (2500, 3231)
x_testの形状： (2500, 3231)
y_trainの形状： (30000,)
y_devの形状： (2500,)


#LightGBM

In [None]:
def lgb_custom_metric_qwk_regression(preds, data):
    # 正解ラベル
    y_true = data.get_label()
    # 予測ラベル
    y_pred = preds
    return 'qwk', cohen_kappa_score(y_true, reval(y_pred), weights='quadratic'), True

# https://blog.amedama.jp/entry/optuna-qwk-optimization

In [None]:
def reval(somearray):
    l = []
    for x in somearray:
        if x > 0.54:
            l.append(2)
        elif x > 0.3:
            l.append(1)
        elif x > -0.1:
            l.append(0)
        elif x > -0.55:
            l.append(-1)
        else:
            l.append(-2)
    return np.array(l)

In [None]:
# LightGBM
dtrain = lgb.Dataset(x_train, label=y_train)
ddev = lgb.Dataset(x_dev,label= y_dev)

# 使用するパラメータ
params = {'objective': 'regression',  # loss
        'metric': 'lgb_custom_metric_qwk_regression',  # 評価指標
        'random_state': 42, 
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'verbose': -1
        }
verbose_eval = 0

# 学習
gbm = lgb.train(params, dtrain,
                valid_sets=[ddev],  # 評価用データ
                feval=lgb_custom_metric_qwk_regression,
                num_boost_round=10000,
                callbacks=[lgb.early_stopping(stopping_rounds=32, verbose=True)] # early_stopping用コールバック関数
                )
# https://qiita.com/c60evaporator/items/2b7a2820d575e212bcf4

[1]	valid_0's qwk: 0
Training until validation scores don't improve for 32 rounds.
[2]	valid_0's qwk: 0
[3]	valid_0's qwk: 0
[4]	valid_0's qwk: 0.00386781
[5]	valid_0's qwk: 0.00899339
[6]	valid_0's qwk: 0.0373382
[7]	valid_0's qwk: 0.0751661
[8]	valid_0's qwk: 0.111016
[9]	valid_0's qwk: 0.134515
[10]	valid_0's qwk: 0.1418
[11]	valid_0's qwk: 0.162063
[12]	valid_0's qwk: 0.170557
[13]	valid_0's qwk: 0.178438
[14]	valid_0's qwk: 0.195752
[15]	valid_0's qwk: 0.212951
[16]	valid_0's qwk: 0.215474
[17]	valid_0's qwk: 0.235187
[18]	valid_0's qwk: 0.24651
[19]	valid_0's qwk: 0.252412
[20]	valid_0's qwk: 0.26201
[21]	valid_0's qwk: 0.272935
[22]	valid_0's qwk: 0.282903
[23]	valid_0's qwk: 0.283002
[24]	valid_0's qwk: 0.290406
[25]	valid_0's qwk: 0.299732
[26]	valid_0's qwk: 0.313316
[27]	valid_0's qwk: 0.318093
[28]	valid_0's qwk: 0.324893
[29]	valid_0's qwk: 0.32972
[30]	valid_0's qwk: 0.336592
[31]	valid_0's qwk: 0.345287
[32]	valid_0's qwk: 0.348425
[33]	valid_0's qwk: 0.356799
[34]	valid

In [None]:
# devに対するスコア算出
y_pred = gbm.predict(x_dev)
qwk = cohen_kappa_score(y_dev, reval(y_pred), weights='quadratic')
rmse = mean_squared_error(y_true=y_dev, y_pred=y_pred, squared=False)
print('QWK=', qwk)
print('RMSE=', rmse)

QWK= 0.46863744726737844
RMSE= 1.055380223555826


In [None]:
# testに対するスコア算出
y_pred = gbm.predict(x_test)
y_pred = reval(y_pred)

# 書き込み
f = open("LightGBM_顔文字あり_num_lr0.05_c_small_0.499.txt", "w")
for labeldata in y_pred:
    f.write(str(labeldata))
    f.write("\n")
f.close()

#CatBoost

In [None]:
#sudachiによる正規化を行う関数を定義
from sudachipy import Dictionary
from sudachipy import SplitMode
tokenizer = Dictionary(dict="full").create()

def sudachi(text):
    after = list()
    for token in tokenizer.tokenize(text, SplitMode.C):
        word = token.normalized_form() # 正規化あり
        pos = " ".join(token.part_of_speech())
        
        if word.isnumeric():
            word = '0'

        after.append(word)
        
    return after

In [None]:
train_tokenize_cat = [] 
dev_tokenize_cat = []
test_tokenize_cat = []

In [None]:
def tokenize(infile, outfile):
    for i in range(len(infile)):
        outfile.append(sudachi(neologdn.normalize(infile[i])))    #正規化あり

tokenize(train_text_demoji, train_tokenize_cat)
tokenize(dev_text_demoji, dev_tokenize_cat)
tokenize(test_text_demoji, test_tokenize_cat)

In [None]:
writefile(train_tokenize_cat, "train.txt")
writefile(dev_tokenize_cat, "dev.txt")
writefile(test_tokenize_cat, "test.txt")

In [None]:
with open("train.txt", 'r') as f:
    traintext = f.read().split("\n")
    traintext = traintext[:-1]
with open("test.txt", 'r') as f:
    testtext = f.read().split("\n")
    testtext = testtext[:-1]
with open("dev.txt", 'r') as f:
    devtext = f.read().split("\n")
    devtext = devtext[:-1]

vectorizer = TfidfVectorizer(smooth_idf=True, analyzer='char')

x_train_cat = vectorizer.fit_transform(traintext)
x_test_cat = vectorizer.transform(testtext)
x_dev_cat = vectorizer.transform(devtext)

In [None]:
print("x_trainの形状：", x_train_cat.shape)
print("x_devの形状：", x_dev_cat.shape)
print("x_testの形状：", x_test_cat.shape)

# y_train =  np.array(list(map (int, train_label)))
# y_dev =  np.array(list(map (int, dev_label)))
print("y_trainの形状：", y_train.shape)    
print("y_devの形状：", y_dev.shape)

x_trainの形状： (30000, 3246)
x_devの形状： (2500, 3246)
x_testの形状： (2500, 3246)
y_trainの形状： (30000,)
y_devの形状： (2500,)


In [None]:
def reval(somearray):
    l = []
    for x in somearray:
        if x > 0.6:
            l.append(2)
        elif x > 0.35:
            l.append(1)
        elif x > 0:
            l.append(0)
        elif x > -0.50:
            l.append(-1)
        else:
            l.append(-2)
    return np.array(l)

In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
import catboost as  cb
from catboost import CatBoost, Pool

# CatBoostを利用するのに必要なフォーマットに変換
cb_train = Pool(x_train, y_train)
cb_eval = Pool(x_dev, y_dev)
cb_test = Pool(x_test)

In [None]:
# パラメータ設定
params = {
    'num_boost_round': 1000,
    'early_stopping_rounds': 32,
}

# 学習
model = CatBoost(params)
model.fit(cb_train, eval_set=[cb_eval], verbose=True)

Learning rate set to 0.086857
0:	learn: 1.1875561	test: 1.1945366	best: 1.1945366 (0)	total: 214ms	remaining: 3m 34s
1:	learn: 1.1821393	test: 1.1891702	best: 1.1891702 (1)	total: 348ms	remaining: 2m 53s
2:	learn: 1.1767019	test: 1.1843562	best: 1.1843562 (2)	total: 493ms	remaining: 2m 43s
3:	learn: 1.1722048	test: 1.1808996	best: 1.1808996 (3)	total: 646ms	remaining: 2m 40s
4:	learn: 1.1675495	test: 1.1777665	best: 1.1777665 (4)	total: 779ms	remaining: 2m 35s
5:	learn: 1.1635396	test: 1.1737401	best: 1.1737401 (5)	total: 910ms	remaining: 2m 30s
6:	learn: 1.1598027	test: 1.1708826	best: 1.1708826 (6)	total: 1.04s	remaining: 2m 28s
7:	learn: 1.1562295	test: 1.1679287	best: 1.1679287 (7)	total: 1.18s	remaining: 2m 26s
8:	learn: 1.1534412	test: 1.1649651	best: 1.1649651 (8)	total: 1.32s	remaining: 2m 25s
9:	learn: 1.1502121	test: 1.1625431	best: 1.1625431 (9)	total: 1.46s	remaining: 2m 24s
10:	learn: 1.1471875	test: 1.1606263	best: 1.1606263 (10)	total: 1.6s	remaining: 2m 23s
11:	learn: 1

<catboost.core.CatBoost at 0x7f7a2488e820>

In [None]:
print(model.get_all_params())

{'nan_mode': 'Min', 'eval_metric': 'RMSE', 'iterations': 1000, 'sampling_frequency': 'PerTree', 'leaf_estimation_method': 'Newton', 'od_pval': 0, 'grow_policy': 'SymmetricTree', 'penalties_coefficient': 1, 'boosting_type': 'Plain', 'model_shrink_mode': 'Constant', 'feature_border_type': 'GreedyLogSum', 'bayesian_matrix_reg': 0.10000000149011612, 'eval_fraction': 0, 'force_unit_auto_pair_weights': False, 'l2_leaf_reg': 3, 'random_strength': 1, 'od_type': 'Iter', 'rsm': 1, 'boost_from_average': True, 'model_size_reg': 0.5, 'pool_metainfo_options': {'tags': {}}, 'subsample': 0.800000011920929, 'use_best_model': True, 'od_wait': 32, 'random_seed': 0, 'depth': 6, 'posterior_sampling': False, 'border_count': 254, 'classes_count': 0, 'auto_class_weights': 'None', 'sparse_features_conflict_fraction': 0, 'leaf_estimation_backtracking': 'AnyImprovement', 'best_model_min_trees': 1, 'model_shrink_rate': 0, 'min_data_in_leaf': 1, 'loss_function': 'RMSE', 'learning_rate': 0.08685699850320816, 'score

In [None]:
# CatBoost推論
y_pred = model.predict(cb_test)
y_pred = reval(y_pred)

In [None]:
# 書き込み
f = open("CatBoost_顔文字あり_0.487.txt", "w")
for labeldata in y_pred:
    f.write(str(labeldata))
    f.write("\n")
f.close()

#SVR

In [None]:
#sudachiによる正規化を行う関数を定義
from sudachipy import Dictionary
from sudachipy import SplitMode
tokenizer = Dictionary(dict="full").create()

def sudachi(text):
    after = list()
    for token in tokenizer.tokenize(text, SplitMode.C):
        word = token.normalized_form() # 正規化あり
        pos = " ".join(token.part_of_speech())
        
        if word.isnumeric():
            word = '0'
            
        after.append(word)
    return after

In [None]:
# 単語分割
train_tokenize_svr = [] 
dev_tokenize_svr = []
test_tokenize_svr = []

tokenize(train_text_demoji, train_tokenize_svr)
tokenize(dev_text_demoji, dev_tokenize_svr)
tokenize(test_text_demoji, test_tokenize_svr)

In [None]:
writefile(train_tokenize_svr, "train.txt")
writefile(dev_tokenize_svr, "dev.txt")
writefile(test_tokenize_svr, "test.txt")

In [None]:
with open("train.txt", 'r') as f:
    traintext = f.read().split("\n")
    traintext = traintext[:-1]
with open("test.txt", 'r') as f:
    testtext = f.read().split("\n")
    testtext = testtext[:-1]
with open("dev.txt", 'r') as f:
    devtext = f.read().split("\n")
    devtext = devtext[:-1]

vectorizer = TfidfVectorizer(smooth_idf=True, analyzer='char', norm='l1') 

x_train = vectorizer.fit_transform(traintext)
x_test = vectorizer.transform(testtext)
x_dev = vectorizer.transform(devtext)

In [None]:
def reval(somearray):
    l = []
    for x in somearray:
        if x > 0.5:
            l.append(2)
        elif x > 0.2:
            l.append(1)
        elif x > -0.1:
            l.append(0)
        elif x > -0.5:
            l.append(-1)
        else:
            l.append(-2)
    return np.array(l)

In [None]:
from sklearn.metrics import cohen_kappa_score
best_qwk = 0
best_c = 1
for c in [1]:
    model = SVR(C=c, kernel='rbf')
    model.fit(x_train, y_train)
    y_pred = model.predict(x_dev)
    y_pred = reval(y_pred)

    qwk = cohen_kappa_score(y_dev, reval(y_pred), weights='quadratic')
    if qwk > best_qwk:
        best_qwk = qwk
        best_c = c
    print("QWK = %f  C = %s" % (qwk, str(c)))
print("最適なハイパーパラメタは C = %s" % str(best_c))

QWK = 0.423606  C = 1
最適なハイパーパラメタは C = 1




```
vectorizer = TfidfVectorizer()かつsudachi full (C)
正解率 = 0.396689  C = 0.1
正解率 = 0.424051  C = 0.3
正解率 = 0.427765  C = 0.35
正解率 = 0.429773  C = 0.39
正解率 = 0.432389  C = 0.4
正解率 = 0.431393  C = 0.41
正解率 = 0.428794  C = 0.45
正解率 = 0.427562  C = 0.5
正解率 = 0.426206  C = 0.55
正解率 = 0.421077  C = 0.8
正解率 = 0.423606  C = 1
正解率 = 0.411558  C = 3
正解率 = 0.406475  C = 5
正解率 = 0.404217  C = 8
正解率 = 0.399755  C = 10
```



In [None]:
model = SVR(C=best_c, kernel='rbf')
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
def reval2(somearray):
    l = []
    for x in somearray:
        if x > 0.8:
            l.append(2)
        elif x > 0.4:
            l.append(1)
        elif x > -0.15:
            l.append(0)
        elif x > -0.52:  #or 0.56
            l.append(-1)
        else:
            l.append(-2)
    return np.array(l)

In [None]:
evalpath = "SVR_c0.4_0.490.txt"
#回帰
y_pred_ = reval2(y_pred)
# 書き込み
f = open(evalpath, "w")
for labeldata in y_pred_:
    f.write(str(labeldata))
    f.write("\n")
f.close()

#アンサンブル

In [None]:
y1 = load_data("SVR_c0.4_0.490.txt")
y2 = load_data("CatBoost_顔文字あり_0.487.txt")
y3 = load_data("LightGBM_顔文字あり_num_lr0.05_c_small_0.499.txt")

In [None]:
def ensreval(x):
    if x >= 1.78:
        num = 2
    elif x >= 0.6: 
        num = 1
    elif x >= -0.44:
        num = 0
    elif x >= -1:
        num = -1
    else:
        num = -2
    return num

In [None]:
y1 = [int(x) for x in y1]
y2 = [int(x) for x in y2]
y3 = [int(x) for x in y3]

ens = []
for i in range(len(y1)):
    label = y1[i]*0.22 + y2[i]*0.27 + y3[i]*0.51
    print(label)
    ens.append(ensreval(label))

# アンサンブル結果 書き込み
f = open("ensemble_eval.txt", "w")
for labeldata in ens:
    f.write(str(int(labeldata)))
    f.write("\n")
f.close()

1.24
0.98
-0.27
0.98
2.0
0.71
-0.22
0.0
0.51
1.49
0.24
1.49
0.78
0.98
-0.22
2.0
2.0
2.0
1.78
0.24
0.24
0.53
1.24
-0.22
1.02
-0.47
-0.78
1.49
0.030000000000000027
-1.0
-0.49
0.73
1.02
-0.27
-0.27
2.0
-0.51
-0.24
0.0
0.24
0.78
0.020000000000000018
0.47
2.0
1.78
0.51
1.0
0.78
-1.49
-0.98
1.73
0.0
-1.27
-0.98
1.27
-1.49
1.56
-0.49
-0.27
1.51
0.020000000000000018
0.0
0.51
0.0
0.98
0.98
-1.0
1.24
0.27
-0.49
0.98
0.24
0.76
-0.27
-0.27
-0.49
1.49
2.0
0.0
1.51
-1.49
1.49
-1.73
-0.05000000000000002
1.49
-0.49
-1.0
-1.0
1.51
0.0
0.73
-2.0
0.0
0.0
0.49
-0.27
0.0
0.49
-0.71
0.020000000000000018
2.0
-0.020000000000000018
0.27
0.49
-0.27
-1.0
1.56
-0.78
-0.25
0.0
-0.49
-0.49
2.0
-1.0
1.78
0.98
1.05
0.78
1.78
-0.51
2.0
-1.49
0.49
-1.49
1.78
1.24
-0.49
2.0
-0.22
0.51
-0.27
-0.73
1.27
0.030000000000000027
-1.27
-0.71
0.51
1.02
-1.49
1.49
1.49
0.78
0.24
-0.98
0.24
2.0
0.98
1.49
-0.49
-1.0
1.49
0.24
-0.47
1.29
0.49
-0.98
0.29000000000000004
0.51
0.24
0.020000000000000018
-0.51
-0.22
-0.25
1.27
-0.98
0.51
