In [1]:
import pandas as pd 
import os 
import json
import pickle
from sklearn.utils import shuffle 
from sklearn.model_selection import train_test_split

In [2]:
# Every working directory is resolved relative to the directory the kernel was started in.
default_path = os.getcwd()
data_path, base_model, config_path, log_path = (
    os.path.join(default_path, folder)
    for folder in ('data', 'base-model', 'config', 'log')
)
# File name only; it is not joined with config_path in this cell.
config_file = "bert-base.json"

#### BWS score data

In [7]:
# Load the BWS similarity scores from the data directory and preview the first three rows.
bws_score_file = os.path.join(data_path, 'bws_sim_score.csv')
bws_score = pd.read_csv(bws_score_file)
bws_score.head(3)

Unnamed: 0,text,translated,cnt,weakest_cnt,strongest_cnt,score,minmax_score
0,i am sad for both of them and i hope something...,나는 그들 모두에게 슬프고 나는 무언가가 더 나은 방향으로 바뀌기를 바란다,8,0,3,0.375,0.6875
1,it feels like all i ever achieve through what ...,불행함을 느끼지 않기 위해 내가 힘든 노력으로 느끼는 것을 통해 내가 성취하는 모든...,8,0,0,0.0,0.5
2,so yet again i lose the person that i tell eve...,그래서 다시 나는 내가 모든 것을 말할 수 있는 사람을 잃고 우울할 때 기분이 나아...,8,0,0,0.0,0.5


In [8]:
# Keep only the English text, its Korean translation and the min-max score,
# and give the last two the names used from here on (text_kor, label).
bws_score = bws_score[['text', 'translated', 'minmax_score']].rename(
    columns={'translated': 'text_kor', 'minmax_score': 'label'}
)
bws_score.head(1)

Unnamed: 0,text,text_kor,label
0,i am sad for both of them and i hope something...,나는 그들 모두에게 슬프고 나는 무언가가 더 나은 방향으로 바뀌기를 바란다,0.6875


In [10]:
# Rescale the score by 16 and truncate to an integer (int() drops the fractional part).
# Note: `label` is overwritten in place, so running this cell twice scales it twice.
bws_score['label'] = (bws_score['label'] * 16).map(int)

In [11]:
# Hold out 20% of the rows for test, then take 10% of what remains as the dev set.
# The same seed is used for both calls so the partition is repeatable.
score_seed = 42
score_fit, X_test = train_test_split(bws_score, random_state=score_seed, test_size=0.2)
X_train, X_dev = train_test_split(score_fit, random_state=score_seed, test_size=0.1)
tuple(len(part) for part in (X_train, X_dev, X_test))

(1152, 128, 320)

In [12]:
# Write each partition into the data directory; index=False keeps the pandas index out of the CSV.
bws_score_outputs = {
    'bws_score_train.csv': X_train,
    'bws_score_val.csv': X_dev,
    'bws_score_test.csv': X_test,
}
for csv_name, split_frame in bws_score_outputs.items():
    split_frame.to_csv(os.path.join(data_path, csv_name), index=False)

#### BWS binary data

In [28]:
# Rows from bws_sim_dep.csv all get label 0.
# NOTE(review): presumably these are the depression-related sentences — confirm the label convention.
bws_dep = pd.read_csv(os.path.join(data_path, 'bws_sim_dep.csv')).assign(label=0)

In [29]:
# Rows from bws_sim_daily.csv all get label 1.
# NOTE(review): presumably these are the everyday (non-depression) sentences — confirm the label convention.
bws_ndep = pd.read_csv(os.path.join(data_path, 'bws_sim_daily.csv')).assign(label=1)

In [31]:
# Stack the two labelled frames into one binary dataset; ignore_index renumbers
# the rows 0..n-1 instead of keeping each source frame's own index.
bws_bin = pd.concat([bws_dep, bws_ndep], ignore_index=True)
bws_bin.head(3)

Unnamed: 0,text,label
0,i am sad for both of them and i hope something...,0
1,it feels like all i ever achieve through what ...,0
2,so yet again i lose the person that i tell eve...,0


In [32]:
# Pin the column names of the binary dataset. The assignment raises if the
# frame does not have exactly two columns.
binary_columns = ['text', 'label']
bws_bin.columns = binary_columns
bws_bin.head(1)

Unnamed: 0,text,label
0,i am sad for both of them and i hope something...,0


In [33]:
# Display the combined binary dataset (the recorded output shows only the first and last rows).
bws_bin

Unnamed: 0,text,label
0,i am sad for both of them and i hope something...,0
1,it feels like all i ever achieve through what ...,0
2,so yet again i lose the person that i tell eve...,0
3,i need to realize i am unhappy for no reason,0
4,i don't feel like i've done anything to make a...,0
...,...,...
1595,the thing is that i don't feel sad or guilty a...,1
1596,i can almost pinpoint when it started but late...,1
1597,if you're really unsure then maybe you could t...,1
1598,i don't feel sad or happy sometimes angry at t...,1


In [34]:
# Hold out 20% of the binary dataset for test, then take 10% of the remainder as the dev set.
# NOTE(review): neither call stratifies on `label`, so class ratios may differ slightly per split.
bin_seed = 42
bin_fit, X_test = train_test_split(bws_bin, random_state=bin_seed, test_size=0.2)
X_train, X_dev = train_test_split(bin_fit, random_state=bin_seed, test_size=0.1)
tuple(map(len, (X_train, X_dev, X_test)))

(1152, 128, 320)

In [35]:
# Write the three binary-dataset partitions into the data directory, without the index column.
bws_bin_outputs = {
    'bws_bin_train.csv': X_train,
    'bws_bin_val.csv': X_dev,
    'bws_bin_test.csv': X_test,
}
for csv_name, split_frame in bws_bin_outputs.items():
    split_frame.to_csv(os.path.join(data_path, csv_name), index=False)

#### DSM-5 data

In [20]:
# Load the full DSM-5 dataset and show how many non-null values each label has per column.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The recorded run of this cell emitted a warning
# that looks like pandas' mixed-type DtypeWarning — TODO confirm which column
# triggers it and pin it with an explicit dtype= if needed.
dsm_data = pd.read_csv(os.path.join(data_path, 'dsm_data.csv'), low_memory=False)
dsm_data.groupby('label').count()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0_level_0,id,text
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,87891,87891
1,2078,2078
2,5129,5129
3,7228,7228
4,716,716
5,1741,1741
6,41234,41234
7,2452,2452
8,52860,52860
9,98746,98746


In [21]:
# Build a more balanced subsample of the DSM-5 data: the four largest classes
# are randomly down-sampled to a fixed number of rows, and every other class
# (labels 0-9 only) is kept in full.
SAMPLE_SEED = 42  # fixed seed so the subsample, and every file derived from it, is reproducible
# label -> number of rows to draw; labels not listed here keep all of their rows
dsm_caps = {0: 25000, 6: 15000, 8: 15000, 9: 25000}

dsm_sample = dsm_data.copy()
dsm = []  # index labels of the selected rows, grouped by class in ascending label order

for label in range(10):
    rows = dsm_sample[dsm_sample.label == label]
    if label in dsm_caps:
        # sample() raises if the class has fewer rows than the cap (no replacement).
        rows = rows.sample(dsm_caps[label], random_state=SAMPLE_SEED)
    dsm.extend(rows.index.tolist())

len(dsm)

99344

In [22]:
# Materialise the selected rows, renumber them 0..n-1 and save the subsample.
# Note: `dsm` holds index labels of the frame before the reset, so this cell
# must not be re-run without re-running the sampling cell first.
dsm_sample = dsm_sample.loc[dsm].reset_index(drop=True)
dsm_sample_file = os.path.join(data_path, 'dsm_sample.csv')
dsm_sample.to_csv(dsm_sample_file, index=False)

In [23]:
# Stratified split on `label`: 20% held out for test, then 20% of the remainder for dev.
dsm_seed = 42
dsm_fit, X_test = train_test_split(
    dsm_sample, test_size=0.2, random_state=dsm_seed, stratify=dsm_sample['label']
)
X_train, X_dev = train_test_split(
    dsm_fit, test_size=0.2, random_state=dsm_seed, stratify=dsm_fit['label']
)
tuple(map(len, (X_train, X_dev, X_test)))

(63580, 15895, 19869)

In [26]:
# Write the three DSM-5 partitions into the data directory, without the index column.
dsm_outputs = {
    'dsm_samp_train.csv': X_train,
    'dsm_samp_val.csv': X_dev,
    'dsm_samp_test.csv': X_test,
}
for csv_name, split_frame in dsm_outputs.items():
    split_frame.to_csv(os.path.join(data_path, csv_name), index=False)