#新調整 (相較0528)
* 前處理不做停用詞，只做正則表達式
* 下一步想法: 字型還原、傳統轉句向量方法(因BERT-as-*Service是有考慮語境的，可能要用傳統的word2vec*)

#結果![alt text](./content/0530結果.png)

# 流程
1. NLTK:
    * 正則表達式
    * 略_Word Segmentation(斷詞) → 一般都是用來抓關鍵字
    * 略_Pos Tagging(詞類自動標記)
    * 略_Parsing (句法剖析)
    * 略_Lemmatization (字型還原)
    * 新: 略_Stopword (停用詞)
    * 略_NER (命名實體)
2. BERT-as-Service:
    * 轉句向量
3. XGBoost Classifier

# Note
1. train& test data都要做一樣的處理

In [0]:
import tensorflow as tf
print(tf.__version__)

2.2.0


In [0]:
# 確認用GPU跑
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


## 下一欄可能不用

In [0]:
# 幫助安裝一些需要的package和開啟和Google Drive連動的權限
# !apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# !add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
# !apt-get update -qq 2>&1 > /dev/null
# !apt-get -y install -qq google-drive-ocamlfuse fuse
# from google.colab import auth
# auth.authenticate_user()
# from oauth2client.client import GoogleCredentials
# creds = GoogleCredentials.get_application_default()
# import getpass
# !google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# vcode = getpass.getpass()
# !echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

## 1.載入套件

In [0]:
!pip install transformers
# 有安裝過就可



In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as K
import seaborn as sns
import transformers
import nltk
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

## 2.讀入數據

### 下一欄，有上傳檔案成功即可(左邊看得到)，不用重複跑

In [0]:
# 上傳/下載至Google Drive
# 以下兩段程式碼功能分別為上傳和下載檔案。
# 第一段程式碼執行後會在該cell輸出中顯示按鈕，就可以從本機上選擇檔案，上傳後是傳到虛擬機中。

# from google.colab import files
# uploaded = files.upload()
# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(
#       name=fn, length=len(uploaded[fn])))

In [0]:
import pandas as pd
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [0]:
df_train.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [0]:
df_train['text'].head(3)

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
Name: text, dtype: object

In [0]:
len(df_train['text'])

7613

# 3.文字探勘
用預處理的技術
* 文字雲
* 詞頻
* 相關應用http://www.taiwanjapanese.url.tw/2019/AI03.pdf

## 4.資料清洗
* 正則表達式

### 正則表達式

In [0]:
def clean(text):
    # 移除顏文字和一些怪符號
    reg = re.compile('\\.+?(?=\B|$)')
    text = text.apply(lambda r: re.sub(reg, string = r, repl = ''))
    reg = re.compile('\x89Û_')
    text = text.apply(lambda r: re.sub(reg, string = r, repl = ' '))
    reg = re.compile('\&amp')
    text = text.apply(lambda r: re.sub(reg, string = r, repl = '&'))
    reg = re.compile('\\n')
    text = text.apply(lambda r: re.sub(reg, string = r, repl = ' '))
    # 移除 hashtag 的符號'#'
    text = text.apply(lambda r: r.replace('#', ''))
    # 移除人名標記
    reg = re.compile('@\w+')#\w 匹配字母或数字、英文字母或汉字
    text = text.apply(lambda r: re.sub(reg, string = r, repl = '@'))
    # 移除網址
    reg = re.compile('https?\S+(?=\s|$)')
    text = text.apply(lambda r: re.sub(reg, string = r, repl = 'www'))
    # 全部小寫化
    text = text.apply(lambda r: r.lower())
    return text

In [0]:
# for i in range(len(df_train['text'])):
#     df_train['text'][i] = clean(df_train['text'][i])
# 上面有 ERROR~AttributeError: 'str' object has no attribute 'apply'
    
df_train['text'] = clean(df_train['text'])
df_test['text'] = clean(df_test['text'])

In [0]:
df_train['text'].head(3)

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to 'shelter in place' are ...
Name: text, dtype: object

In [0]:
df_train.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to 'shelter in place' are ...,1


In [0]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### 斷詞法二

In [0]:
# 示範 NLTK兩種斷詞方式
testStr = "This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts."
tokens = nltk.word_tokenize(testStr)
print(tokens)
tokens = nltk.wordpunct_tokenize(testStr) ## 請注意，差異在cut-off
print(tokens)

#第一種方法比較符合需求(較合理)→　所以採用此

['This', 'value', 'is', 'also', 'called', 'cut-off', 'in', 'the', 'literature', '.', 'If', 'float', ',', 'the', 'parameter', 'represents', 'a', 'proportion', 'of', 'documents', ',', 'integer', 'absolute', 'counts', '.']
['This', 'value', 'is', 'also', 'called', 'cut', '-', 'off', 'in', 'the', 'literature', '.', 'If', 'float', ',', 'the', 'parameter', 'represents', 'a', 'proportion', 'of', 'documents', ',', 'integer', 'absolute', 'counts', '.']


In [0]:
# 實做我們資料
# 教學: https://ithelp.ithome.com.tw/articles/10191922
tokens_NCCU = []
for i in range(len(df_train['text'])):
    #print(df_train['text'][i])
    content = df_train['text'][i]
    tokens = nltk.word_tokenize(content)
    #print(tokens)
    tokens_NCCU.append(tokens)

# df_test
tokens_NCCU_test = []
for i in range(len(df_test['text'])):
    #print(df_test['text'][i])
    content = df_test['text'][i]
    tokens = nltk.word_tokenize(content)
    #print(tokens)
    tokens_NCCU_test.append(tokens)

In [0]:
tokens_NCCU[2]

['all',
 'residents',
 'asked',
 'to',
 "'shelter",
 'in',
 'place',
 "'",
 'are',
 'being',
 'notified',
 'by',
 'officers',
 'no',
 'other',
 'evacuation',
 'or',
 'shelter',
 'in',
 'place',
 'orders',
 'are',
 'expected']

In [0]:
tokens_NCCU_test[2]

['there',
 'is',
 'a',
 'forest',
 'fire',
 'at',
 'spot',
 'pond',
 ',',
 'geese',
 'are',
 'fleeing',
 'across',
 'the',
 'street',
 ',',
 'i',
 'can',
 'not',
 'save',
 'them',
 'all']

In [0]:
len(tokens_NCCU[2])

23

## 文字雲呈現

# Data Saving: tokens_NCCU, tokens_NCCU_test
list，由string所組成

### 去停用字

In [0]:
#下載停用字典
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# from nltk.corpus import stopwords
# stopword = stopwords.words('english')

# # 擴充停用字表
# for w in ['!',',','.','?','-s','-ly','</s>','s', '#', ':']:
#     stopword.append(w)
# #print(stopword)

In [0]:
# 移除停用字
# tokens_NCCU_afterstop = []
# for i in range(len(tokens_NCCU)):
#   for j in tokens_NCCU[i]:
#     #print(j) 每個詞
#     if j in stopword:
#       tokens_NCCU[i].remove(j)
#   tokens_NCCU_afterstop.append(tokens_NCCU[i])


# # tokens_NCCU_test
# tokens_NCCU_test_afterstop = []
# for i in range(len(tokens_NCCU_test)):
#   for j in tokens_NCCU_test[i]:
#   #print(j) 每個詞
#     if j in stopword:
#       tokens_NCCU_test[i].remove(j)
#   tokens_NCCU_test_afterstop.append(tokens_NCCU_test[i])


## 下面有ERROR，兩個數量應該要不同

In [0]:
# len(tokens_NCCU_afterstop[2])
# print("確認有去除掉停用詞，原本句子單詞數量{}，去停用字後的單字數量{}".format(len(tokens_NCCU[2]), len(tokens_NCCU_afterstop[2])))

# Data Saving: tokens_NCCU, tokens_NCCU_test
list，由一個個單詞的string所組成

## 把斷詞後的list，變回一個句子(不用care句意): list > string

In [0]:
# 原本做停用詞處理

# tokens_NCCU_set = []
# for i in range(len(tokens_NCCU_afterstop)):
#   #print(tokens_NCCU_afterstop[i])
#   #print(" ".join(tokens_NCCU_afterstop[i]))
#   a = " ".join(tokens_NCCU_afterstop[i])
#   tokens_NCCU_set.append(a)

# # test，來源　tokens_NCCU_test_afterstop
# tokens_NCCU_test_set = []
# for i in range(len(tokens_NCCU_test_afterstop)):
#   #print(tokens_NCCU_test_afterstop[i])
#   #print(" ".join(tokens_NCCU_test_afterstop[i]))
#   a = " ".join(tokens_NCCU_test_afterstop[i])
#   tokens_NCCU_test_set.append(a)

In [0]:
tokens_NCCU_set = []
for i in range(len(tokens_NCCU)):
  #print(tokens_NCCU[i])
  #print(" ".join(tokens_NCCU[i]))
  a = " ".join(tokens_NCCU[i])
  tokens_NCCU_set.append(a)

# test，來源　tokens_NCCU_test
tokens_NCCU_test_set = []
for i in range(len(tokens_NCCU_test)):
  #print(tokens_NCCU_test[i])
  #print(" ".join(tokens_NCCU_test[i]))
  a = " ".join(tokens_NCCU_test[i])
  tokens_NCCU_test_set.append(a)

In [0]:
tokens_NCCU_set[2]

"all residents asked to 'shelter in place ' are being notified by officers no other evacuation or shelter in place orders are expected"

### 前處理後的資料存在: tokens_NCCU_set, tokens_NCCU_test_set
型態為list，裡面是一個個句子(string)




## 5.轉句向量
* Bert-as-Service
* word2vec

### 步驟
* 安裝bert-as-service→ 啟動服務→ 測試服務→ (先繁轉簡在做，會較精確)
* 要求: Python >= 3.5 with Tensorflow >= 1.10(2.0不可行!!), bert-as-service


### 安裝 tf1.14

In [0]:
!pip install tensorflow==1.14



In [0]:
# 測試版本（內建原先版本: 2.2.0)
import tensorflow as tf
print(tf.__version__)

2.2.0


### 安裝Bert-as-Service

In [0]:
!pip install bert-serving-server
!pip install bert-serving-client



In [0]:
# 下載bert的英文模型(簡體訓練的)
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

--2020-05-30 07:25:34--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.71.128, 2a00:1450:400c:c01::80
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.71.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip.1’


2020-05-30 07:25:37 (155 MB/s) - ‘uncased_L-12_H-768_A-12.zip.1’ saved [407727028/407727028]

Archive:  uncased_L-12_H-768_A-12.zip
replace uncased_L-12_H-768_A-12/bert_model.ckpt.meta? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/vocab.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/bert_model.ckpt.index? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace uncased_L-12_H-768_A-12/bert_config.json? [

### 啟動服務
教學文


1.   https://www.gitmemory.com/issue/hanxiao/bert-as-service/380/560435899
2.   https://github.com/hanxiao/bert-as-service/issues/380



## 轉句向量

In [0]:
# try
!nohup bert-serving-start -model_dir ./uncased_L-12_H-768_A-12 > out.file 2>&1 -max_seq_len=NONE &

from bert_serving.client import BertClient
bc = BertClient()
print (bc.encode(['First do it', 'then do it right', 'then do it better']))

# 法二
from bert_serving.client import BertClient
bc = BertClient()
doc_vecs = bc.encode(['First do it', 'then do it right', 'then do it better'])
doc_vecs



[[ 0.13186485  0.32404143 -0.82704383 ... -0.37119576 -0.39250165
  -0.3172185 ]
 [ 0.24873513 -0.12334436 -0.38933846 ... -0.447562   -0.55913544
  -0.11345179]
 [ 0.28627336 -0.18580157 -0.3090683  ... -0.29593667 -0.39310506
   0.07640215]]


array([[ 0.13186485,  0.32404143, -0.82704383, ..., -0.37119576,
        -0.39250165, -0.3172185 ],
       [ 0.24873513, -0.12334436, -0.38933846, ..., -0.447562  ,
        -0.55913544, -0.11345179],
       [ 0.28627336, -0.18580157, -0.3090683 , ..., -0.29593667,
        -0.39310506,  0.07640215]], dtype=float32)

### 跑我們的資料

In [0]:
try_list = []
try_list.append(tokens_NCCU_set[1])

#try_list

### 調整**max_seq_len** (Server API的參數)
maximum length of sequence, longer sequence will be trimmed on the right side. Set it to NONE for dynamically using the longest sequence in a (mini)batch.
https://github.com/hanxiao/bert-as-service


# -max_seq_len=NONE   不要用此，會跑到掛~~
# 改用 -max_seq_len= 200 → 400

## 正式跑句向量啦，存成 doc_vecs

In [0]:
!nohup bert-serving-start -model_dir ./uncased_L-12_H-768_A-12 > out.file 2>&1 -max_seq_len=400 &

from bert_serving.client import BertClient
bc = BertClient()
doc_vecs = bc.encode(tokens_NCCU_set)
#doc_vecs



In [0]:
doc_vecs[1]

array([-8.37910846e-02, -7.49886334e-02, -3.50117832e-02, -2.91408729e-02,
        7.65842378e-01, -2.87871603e-02,  2.10944071e-01,  5.60735404e-01,
       -5.08719862e-01, -7.86611214e-02,  1.55020887e-02, -2.40075767e-01,
        3.03001590e-02,  5.05166471e-01, -7.71939099e-01,  7.69955039e-01,
       -1.89149708e-01,  4.58489694e-02,  4.08051282e-01, -1.84479281e-01,
       -4.26469415e-01, -4.56380874e-01,  2.11968228e-01,  5.46523809e-01,
        8.13089684e-02,  3.53223503e-01,  6.10379204e-02,  2.67759174e-01,
        2.37048455e-02,  4.67865497e-01, -1.19265564e-01,  1.35602027e-01,
        6.67776644e-01, -6.82074845e-01,  2.53042560e-02, -4.04649258e-01,
        2.76614755e-01,  2.50135094e-01, -9.50284898e-02, -2.32418664e-02,
       -1.62865147e-01, -4.41355795e-01,  7.17646256e-02,  4.56245802e-02,
        7.99047649e-01,  4.21877772e-01, -1.06708266e-01, -1.17437385e-01,
        2.88268514e-02, -3.24244052e-01, -3.56220901e-01, -2.19835863e-01,
       -1.88626528e-01, -

In [0]:
!pip install tqdm 



In [0]:
# from tqdm import tqdm 
# from time import sleep
# for i in tqdm(range(10000)): 
#   sleep(0.01) 

In [0]:
!nohup bert-serving-start -model_dir ./uncased_L-12_H-768_A-12 > out.file 2>&1 -max_seq_len=400 &

from bert_serving.client import BertClient
bc = BertClient()
doc_test_vecs = bc.encode(tokens_NCCU_test_set)
#doc_test_vecs



In [0]:
doc_vecs[1].shape

(768,)

In [0]:
doc_vecs[1].size

768

### 句向量存成 dataframe格式，當作 train data的context → 轉成XGBoost可以吃的格式 

## (法二)

In [0]:
df_train['target'].head(3)

0    1
1    1
2    1
Name: target, dtype: int64

In [0]:
#x_train_weight = tf_idf.toarray()  # 训练集TF-IDF权重矩阵 
x_train_weight = doc_vecs
y_train = df_train['target']
x_test_weight = doc_test_vecs

In [0]:
import pandas as pd
import xgboost as xgb

In [0]:
# 參考
#将数据转换为DMatrix类型
#XGBoost 的二进制的缓存文件，加载的数据存储在对象 DMatrix 中。
# 将数据转化为DMatrix类型    

dtrain = xgb.DMatrix(x_train_weight, label=y_train)    
#dtest = xgb.DMatrix(x_test_weight, label=y_test)
dtest = xgb.DMatrix(x_test_weight)

In [0]:
dtrain

<xgboost.core.DMatrix at 0x7f8c9cd690f0>

### 建XGBoost模型
教學文: https://blog.csdn.net/asialee_bird/article/details/94836962

# 先回頭以一樣方式處理 x_test_weight (test data)

In [0]:
#基于XGBoost原生接口的分类
 #xgboost模型构建
# param = {'silent': 0, 'eta': 0.3, 'max_depth': 6, 'objective': 'multi:softmax', 'num_class': 3, 'eval_metric': 'merror'}  # 参数
# evallist = [(dtrain, 'train'), (dtest, 'test')]
# num_round = 100  # 循环次数
# xgb_model = xgb.train(param, dtrain, num_round,evallist)

# 保存训练模型
# xgb_model.save_model('data/xgb_model')
# xgb_model=xgb.Booster(model_file='data/xgb_model') #加载训练好的xgboost模型

In [0]:
#基于Scikit-learn接口的分类
# 训练模型
#xgb_param['num_class'] = 2
model = xgb.XGBClassifier(num_class=2, max_depth=6, learning_rate=0.1, n_estimators=100, silent=True, objective='multi:softmax')
model.fit(x_train_weight, y_train)
y_predict=model.predict(x_test_weight)

In [0]:
y_predict

import pandas as pd
pred_df_XGBoost = pd.DataFrame(y_predict)

In [0]:
pred_df_XGBoost

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
3258,1
3259,1
3260,1
3261,1


In [0]:
df_test = pd.read_csv("test.csv")
df_test.head(3) 

df_test['target'] = pred_df_XGBoost
df_test.head(3)

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",1
2,3,,,"there is a forest fire at spot pond, geese are...",1


In [0]:
pred_df_XGBoost[0].value_counts()
# pred_df_XGBoost.value_counts()

0    2100
1    1163
Name: 0, dtype: int64

In [0]:
df_test[['id', 'target']].to_csv('0530_submission_XGBoost_colab.csv', index=False)

In [0]:
!pwd
# 目前工作位置

/content


In [0]:
model.get_booster().save_model('0530_submission_XGBoost_colab.model')  

In [0]:
# 儲存權重

model.save_weights('./submission_XGBoost_colab0530.h5', overwrite=True)
new_model = make_model(batch_size=None)
new_model.load_weights('./submission_XGBoost_colab0530.h5')
new_model.summary()
#new_model.evaluate(x_test, y_test)

AttributeError: ignored