In [19]:
import os

#### Directory layout for the BERT model
```
├── BERT_method
│   ├── bert                      <- Everything BERT-related is stored in this folder.
|       |                             Download and save the pre-trained model from official BERT Github page:
|       |                             https://github.com/google-research/bert
|       ├── data                  <- Make sure all the .tsv files are in a folder named “data”
│       ├── bert_output           <- create the folder “bert_output” where the fine-tuned model will
|       |                            be saved and test results are generated under the name “test_results.tsv”
│       └── cased_L-12_H-768_A-12 <- unzip the downloaded pre-trained BERT model into this directory
│
├── notebooks      
|...
```

In [20]:
# Check the current working directory.
# The recorded output of this cell shows the `BERT_method` project folder; the
# relative paths used by later cells ("../data/interim/...", "./bert/data/...")
# are resolved against whatever directory is reported here.
os.getcwd() 

'C:\\Users\\fanfan\\Documents\\Capstone\\DSCI_591_capstone-BCStats\\BERT_method'

In [22]:
# Change the working directory to the `bert` sub-folder of the directory the
# kernel is currently in (the `BERT_method` folder, per the check above).
# The folder is located relative to the current directory rather than through a
# user-specific absolute path, so the cell works on any machine and OS.
# NOTE: cells that read "../data/interim/..." resolve that path against the
# parent folder, so run them before this cell.
current_dir = os.getcwd()
if os.path.basename(current_dir) != 'bert':  # already inside `bert`: safe to re-run
    os.chdir(os.path.join(current_dir, 'bert'))
os.getcwd()

'C:\\Users\\fanfan\\Documents\\Capstone\\DSCI_591_capstone-BCStats\\BERT_method\\bert'

In [4]:
# load packages
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [5]:
# Load the 2018 qualitative training data and preview the first five rows
train_csv_path = "../data/interim/train_2018-qualitative-data.csv"
df_2018 = pd.read_csv(train_csv_path)
df_2018.head(5)

Unnamed: 0.1,Unnamed: 0,_telkey,2018 Comment,Code 1,Code 2,Code 3,Code 4,Code 5,CPD,CB,...,VMG_Improve_collaboration,VMG_Improve_program_implementation,VMG_Public_interest_and_service_delivery,VMG_Review_funding_or_budget,VMG_Keep_politics_out_of_work,VMG_other,OTH_Other_related,OTH_Positive_comments,OTH_Survey_feedback,Unrelated
0,0,192723-544650,I would suggest having a developmental growth ...,62,13.0,,,,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,188281-540434,Base decisions regarding fish and wildlife on ...,116,,,,,0,0,...,0,0,0,0,1,0,0,0,0,0
2,4,174789-230694,Get rid of Leading Workplace Strategies and gi...,51,,,,,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7,185914-180608,We are the lowest paid in Canada with a worklo...,24,62.0,,,,0,1,...,0,0,0,0,0,0,0,0,0,0
4,8,189099-732978,Official acknowledgement of the limited divers...,35,62.0,,,,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Names of the columns from 'CPD' through 'OTH' (label-based slice, both ends included)
theme_columns = df_2018.loc[:, 'CPD':'OTH'].columns
categories = theme_columns.tolist()
categories

['CPD',
 'CB',
 'EWC',
 'Exec',
 'FWE',
 'SP',
 'RE',
 'Sup',
 'SW',
 'TEPE',
 'VMG',
 'OTH']

In [7]:
# Build the train/dev dataframe in the column layout used for BERT:
#   user_id - the respondent key (`_telkey`)
#   label   - the value of the 'CPD' column
#   alpha   - a constant 'a' in every row
#   text    - the comment text
# The frame is later written as a tab-separated file with one row per line, so
# every tab, carriage return and newline inside a comment is replaced by a space
# (stripping only "\n" would still let "\t" or "\r" break a row apart).
comment_text = df_2018['2018 Comment'].replace(r'[\t\r\n]', ' ', regex=True)
df_bert = pd.DataFrame({'user_id': df_2018['_telkey'],
                        'label': df_2018['CPD'],
                        'alpha': ['a'] * len(df_2018),
                        'text': comment_text})
df_bert.head()

Unnamed: 0,user_id,label,alpha,text
0,192723-544650,1,a,I would suggest having a developmental growth ...
1,188281-540434,0,a,Base decisions regarding fish and wildlife on ...
2,174789-230694,0,a,Get rid of Leading Workplace Strategies and gi...
3,185914-180608,0,a,We are the lowest paid in Canada with a worklo...
4,189099-732978,0,a,Official acknowledgement of the limited divers...


In [8]:
# Hold out 25% of the rows as the dev set; the fixed seed keeps the split reproducible
df_bert_train, df_bert_dev = train_test_split(
    df_bert,
    test_size=0.25,
    random_state=2019,
)

In [9]:
# Build the test dataframe in the column layout used for BERT: respondent key
# plus comment text (no label column).
df_test = pd.read_csv("../data/interim/test_2018-qualitative-data.csv")
# The frame is later written as a tab-separated file with one row per line, so
# every tab, carriage return and newline inside a comment is replaced by a space
# (stripping only "\n" would still let "\t" or "\r" break a row apart).
test_text = df_test['2018 Comment'].replace(r'[\t\r\n]', ' ', regex=True)
df_bert_test = pd.DataFrame({'User_ID': df_test['_telkey'],
                             'text': test_text})
df_bert_test.head()

Unnamed: 0,User_ID,text
0,194791-949508,The compensation.
1,174648-027372,compare type of work; expertise required; and ...
2,176038-900440,Greater support for mobile work options and in...
3,173698-669014,Consistent direction by all Supervisors.
4,175136-609856,"Sound - working in an open area, it can be; ve..."


In [12]:
# Save the dataframes in .tsv format as required by BERT, inside `bert/data`.
# The target folder is resolved for both places the kernel can be in: the
# `BERT_method` folder, or its `bert` sub-folder once the chdir cell has run.
# This keeps the files out of a stray `bert/bert/data` folder.
inside_bert_dir = os.path.basename(os.getcwd()) == 'bert'
bert_data_dir = 'data' if inside_bert_dir else os.path.join('bert', 'data')
os.makedirs(bert_data_dir, exist_ok=True)  # to_csv does not create missing folders

# train/dev are written without a header row; the test file keeps its header row
df_bert_train.to_csv(os.path.join(bert_data_dir, 'train.tsv'), sep='\t', index=False, header=False)
df_bert_dev.to_csv(os.path.join(bert_data_dir, 'dev.tsv'), sep='\t', index=False, header=False)
df_bert_test.to_csv(os.path.join(bert_data_dir, 'test.tsv'), sep='\t', index=False, header=True)

In [23]:
#import bert

In [25]:
#from bert import run_classifier

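### Run the command below in a terminal from the `bert` directory (enter it as a single line; it is split across lines here only for readability):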
```
python run_classifier.py 
--task_name=cola 
--do_train=true 
--do_eval=true 
--do_predict=true 
--data_dir=./data/ 
--vocab_file=./cased_L-12_H-768_A-12/vocab.txt 
--bert_config_file=./cased_L-12_H-768_A-12/bert_config.json 
--init_checkpoint=./cased_L-12_H-768_A-12/bert_model.ckpt 
--max_seq_length=400 
--train_batch_size=8 
--learning_rate=2e-5 
--num_train_epochs=3.0 
--output_dir=./bert_output/ 
--do_lower_case=False
```