<a href="https://colab.research.google.com/gist/MarkCreightonQueens/51f9756a98654c7acdb907d91c055c65/machine-learning-training-bert-0_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random as r

# Fix the seed of Python's built-in `random` generator (this call does not
# touch any other library's random state).
r.seed(a=0)

In [None]:
#installing tools
# Each line is a shell escape that pip-installs one third-party package into the runtime.
# No versions are pinned, so a re-run may resolve to newer releases than the ones
# recorded in the saved output of this cell.
!pip install transformers 
!pip install wget
!pip install tika
!pip install ijson

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)
[K     |████████████████████████████████| 757kB 6.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 16.4MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 37.2MB/s 
Collecting tokenizers==0.8.0-rc4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/bd/e5abec46af977c8a1375c1dca7cb1e5b3ec392ef279067af7f6bc50491a0/tokenizers-0.8.0rc4-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K   

In [None]:
#Importing tools
import pandas as pd
import numpy as np
from tika import parser
%tensorflow_version 1.x
import tensorflow as tf
import torch
import os
import shutil
import glob
import nltk
nltk.download('punkt')
import wget
import urllib.request
import re
import gzip
from nltk.tokenize import sent_tokenize
import json

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

TensorFlow 1.x selected.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Ask TensorFlow for the name of the GPU it can see and stop the notebook
# right here when the expected first GPU is not attached.
device_name = tf.test.gpu_device_name()

if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at:{}'.format(device_name))

Found GPU at:/device:GPU:0


In [None]:
#Determining what GPU is available for use
# Select the torch device used by the rest of the notebook: the CUDA GPU when
# PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
  print('There are %d GPU(s) available.' % torch.cuda.device_count())
  print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
  print('No GPU available, using CPU instead.')
  # Device type strings are case-sensitive: "CPU" is rejected with a
  # RuntimeError, so the fallback must use lowercase "cpu".
  device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [None]:
#Download this to drive instead
# Clones the upstream google-research BERT repository into the current working directory.
# NOTE(review): the training/prediction cells further down chdir into
# '/content/drive/My Drive/Thesis/BERT/bert-master' and run run_classifier_0_4.py from
# there, so this fresh clone does not appear to be what is executed - confirm whether
# this cell is still needed.
!git clone https://github.com/google-research/bert.git

Cloning into 'bert'...
remote: Enumerating objects: 340, done.[K
remote: Total 340 (delta 0), reused 0 (delta 0), pack-reused 340[K
Receiving objects: 100% (340/340), 317.20 KiB | 1.00 MiB/s, done.
Resolving deltas: 100% (185/185), done.


In [None]:
# Extract the Yelp review archive stored on Drive. The destination is given
# explicitly because the next cell reads '/content/yelp_review_full_csv/...';
# without extract_dir the files land in whatever the current working directory
# happens to be (later cells change it with os.chdir).
shutil.unpack_archive('/content/drive/My Drive/Thesis/yelp_review_full_csv.tgz', extract_dir='/content')

In [None]:
#Reading in the CSV.
# Both files are read without a header row, so columns are addressed by position:
# column 0 is the rating (filtered against 1..5 below), column 1 is used as the
# review text by later cells.
data_train_full = pd.read_csv('/content/yelp_review_full_csv/train.csv', header = None)
data_test = pd.read_csv('/content/yelp_review_full_csv/test.csv', header = None)

print(data_train_full.shape)
print(data_test.shape)

#Creating a smaller subset of the training dataset due to computational restrictions
# Number of reviews drawn per rating; with five ratings this gives a balanced
# training set of 5 * SAMPLES_PER_RATING rows.
SAMPLES_PER_RATING = 50000
frames = [
    data_train_full[data_train_full[0] == rating].sample(SAMPLES_PER_RATING, random_state=0)
    for rating in range(1, 6)
]
data_train = pd.concat(frames)
#randomizing the order of the observations
# (the concatenated frame is grouped by rating; frac=1 reshuffles every row)
data_train = data_train.sample(frac=1, random_state=0)

#rescaling the ratings
# Shift ratings from 1..5 to 0..4 in both the training subset and the test set.
data_train[0] = (data_train[0] - 1)
data_test[0] = (data_test[0] - 1)

print(data_train[0].unique())
print(data_train.shape)

(650000, 2)
(50000, 2)
[2 1 3 0 4]
(250000, 2)


In [None]:
# Assemble the four-column training frame: a running id, the 0..4 label, a constant
# filler column 'alpha' (always 'a'), and the review text with newlines replaced by spaces.
# NOTE(review): with regex=True the pattern r'\n' matches a real newline character only;
# if the CSV stores line breaks as the two characters backslash + 'n', they are left
# untouched - confirm against the data.
df_bert = pd.DataFrame({
    'id': range(len(data_train)),
    'label': data_train[0],
    'alpha': ['a'] * data_train.shape[0],
    'text': data_train[1].replace(r'\n', ' ', regex = True),
})
print(df_bert.head())

# Hold out 1% of the rows as the dev set. random_state pins the split: seeding Python's
# `random` module elsewhere does not affect scikit-learn, which would otherwise draw a
# different split on every run.
df_bert_train, df_bert_dev = train_test_split(df_bert, test_size=0.01, random_state=0)

        id  label alpha                                               text
116433   0      2     a  The employees/students at EMCC are very friend...
308806   1      1     a  My kids and I went here to grab some take out ...
464848   2      3     a  This place is a 21 and up club that caters to ...
457220   3      1     a  This place is just NOT what it used to be.  Iv...
64604    4      0     a  Despite apprehensions of the 1st review here I...


In [None]:
# Test frame for prediction: a running id plus the review text with newline
# matches replaced by a single space. No label column is included.
test_text = data_test[1].replace(r'\n', ' ', regex=True)
df_bert_test = pd.DataFrame({'id': range(len(data_test)), 'text': test_text})

df_bert_test.tail()

Unnamed: 0,id,text
49995,49995,Just wanted to write a review to chip in with ...
49996,49996,Great ambience. Great drinks. Great food. I lo...
49997,49997,I have been to the other Monks locations so I ...
49998,49998,Don't go here. I know you might want to try i...
49999,49999,Buffet was recently open after renovation so m...


In [None]:
#Saving the dataframes to the required format.
# Every frame is written tab-separated without its index; only the test file
# keeps a header row, train and dev are written without one.
for frame, tsv_path, with_header in (
    (df_bert_train, '/content/drive/My Drive/Thesis/BERT/bert_data/train.tsv', False),
    (df_bert_dev, '/content/drive/My Drive/Thesis/BERT/bert_data/dev.tsv', False),
    (df_bert_test, '/content/drive/My Drive/Thesis/BERT/bert_data/test.tsv', True),
):
    frame.to_csv(tsv_path, sep='\t', index=False, header=with_header)

In [None]:
#training the model
# Only the chdir below is live code: it moves the working directory to the Drive folder
# from which run_classifier_0_4.py is launched. That script is not defined in this notebook.
os.chdir('/content/drive/My Drive/Thesis/BERT/bert-master') 
# The fine-tuning command itself is kept commented out; remove the leading '#' to retrain.
# As written it trains and evaluates on the TSV files in bert_data, starting from the
# uncased_L-12_H-768_A-12 checkpoint, and writes to the 0_4/bert_output directory.
#!python run_classifier_0_4.py --task_name=cola --do_train=true --do_eval=true --data_dir='/content/drive/My Drive/Thesis/BERT/bert_data' --vocab_file='/content/drive/My Drive/Thesis/BERT/uncased_L-12_H-768_A-12/vocab.txt' --bert_config_file='/content/drive/My Drive/Thesis/BERT/uncased_L-12_H-768_A-12/bert_config.json' --init_checkpoint='/content/drive/My Drive/Thesis/BERT/uncased_L-12_H-768_A-12/bert_model.ckpt' --max_seq_length=128 --train_batch_size=32 --learning_rate=2e-5 --num_train_epochs=3.0 --output_dir='/content/drive/My Drive/Thesis/BERT/0_4/bert_output' --do_lower_case=True  --save_checkpoints_steps 16000

In [None]:
#predicting sentiment
# Only the chdir below is live code; it selects the directory that holds run_classifier_0_4.py.
os.chdir('/content/drive/My Drive/Thesis/BERT/bert-master')
# The prediction command is kept commented out. It points --init_checkpoint at
# model.ckpt-23203 inside the 0_4/bert_output directory and reuses that directory as output.
# NOTE(review): the command contains a lone '-' token right before --max_seq_length;
# it looks like a typo - confirm before re-enabling.
#!python run_classifier_0_4.py --task_name=cola --do_predict=true --data_dir='/content/drive/My Drive/Thesis/BERT/bert_data' --vocab_file='/content/drive/My Drive/Thesis/BERT/uncased_L-12_H-768_A-12/vocab.txt' --bert_config_file='/content/drive/My Drive/Thesis/BERT/uncased_L-12_H-768_A-12/bert_config.json' --init_checkpoint='/content/drive/My Drive/Thesis/BERT/0_4/bert_output/model.ckpt-23203' - --max_seq_length=128 --output_dir='/content/drive/My Drive/Thesis/BERT/0_4/bert_output' 


In [None]:
import csv

# Convert the prediction file into hard labels: every row is parsed as a list of
# floats and the position of the largest value becomes the predicted label
# (ties resolve to the first maximum, as list.index does).
label_results = []
# newline='' is the documented way to open files handed to the csv module.
with open('/content/drive/My Drive/Thesis/BERT/0_4/bert_output/test_results.tsv', newline='') as file:
    for row in csv.reader(file, delimiter="\t"):
        if not row:
            # csv.reader yields [] for a blank line; max([]) would raise, so skip it.
            continue
        scores = [float(value) for value in row]
        label_results.append(scores.index(max(scores)))

# Single-column frame (column 0) holding one predicted label per row of the file.
df_results = pd.DataFrame(label_results)
df_results[0]

0        0
1        0
2        0
3        0
4        0
        ..
49995    1
49996    4
49997    3
49998    0
49999    0
Name: 0, Length: 50000, dtype: int64

In [None]:
from sklearn.metrics import confusion_matrix

# True labels come from column 0 of the test set, predictions from column 0 of
# df_results; scikit-learn lays out true classes as rows and predictions as columns.
confusion_matrix(y_true=data_test[0], y_pred=df_results[0])

array([[7626, 1978,  238,   68,   90],
       [2006, 5858, 1834,  219,   83],
       [ 248, 2004, 5487, 1996,  265],
       [  64,  200, 1644, 5706, 2386],
       [  69,   71,  221, 2175, 7464]])

In [None]:
from sklearn import metrics

# Score the predicted labels against the true test labels: plain accuracy and
# the F1 score averaged with average='weighted'.
accuracy = metrics.accuracy_score(data_test[0], df_results[0])
weighted_f1 = metrics.f1_score(data_test[0], df_results[0], average='weighted')
print("Accuracy", accuracy)
print("F1-Score", weighted_f1)

Accuracy 0.64282
F1-Score 0.6422812215369355


In [None]:
# Load the sentence table that is to be scored and report its size.
BERT_0_4_data_temp = pd.read_csv('/content/drive/My Drive/Thesis/Output_backup.csv')
print(BERT_0_4_data_temp.shape)

# Keep only the two columns written to the prediction input, renamed to id / text.
BERT_0_4_data = pd.DataFrame({
    'id': BERT_0_4_data_temp['Meta Sentence ID'],
    'text': BERT_0_4_data_temp['Sentences'],
})
# This writes to the same bert_data/test.tsv path used for the review test set
# earlier in the notebook, so that file is replaced by these sentences.
BERT_0_4_data.to_csv('/content/drive/My Drive/Thesis/BERT/bert_data/test.tsv', sep='\t', index=False, header=True)

(42083, 15)


In [None]:
#os.chdir('/content/drive/My Drive/Thesis/BERT/bert-master')
!python run_classifier_0_4.py --task_name=cola --do_predict=true --data_dir='/content/drive/My Drive/Thesis/BERT/bert_data' --vocab_file='/content/drive/My Drive/Thesis/BERT/uncased_L-12_H-768_A-12/vocab.txt' --bert_config_file='/content/drive/My Drive/Thesis/BERT/uncased_L-12_H-768_A-12/bert_config.json' --init_checkpoint='/content/drive/My Drive/Thesis/BERT/0_4/bert_output/model.ckpt-23203' - --max_seq_length=128 --output_dir='/content/drive/My Drive/Thesis/BERT/0_4/bert_output' 





W0702 19:38:42.854340 139915296675712 module_wrapper.py:139] From run_classifier_0_4.py:784: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W0702 19:38:42.854593 139915296675712 module_wrapper.py:139] From run_classifier_0_4.py:784: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W0702 19:38:42.855247 139915296675712 module_wrapper.py:139] From /content/drive/My Drive/Thesis/BERT/bert-master/modeling.py:93: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


W0702 19:38:43.559525 139915296675712 module_wrapper.py:139] From run_classifier_0_4.py:808: The name tf.gfile.MakeDirs is deprecated. Please use tf.io.gfile.makedirs instead.

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addon

In [None]:
# Re-read the prediction file (overwritten by the prediction run on the new sentences)
# and reduce each row of floats to the index of its largest value.
label_results = []
# newline='' is the documented way to open files handed to the csv module.
with open('/content/drive/My Drive/Thesis/BERT/0_4/bert_output/test_results.tsv', newline='') as file:
    for row in csv.reader(file, delimiter="\t"):
        if not row:
            # csv.reader yields [] for a blank line; max([]) would raise, so skip it.
            continue
        scores = [float(value) for value in row]
        label_results.append(scores.index(max(scores)))

df_results = pd.DataFrame(label_results)
# Mean predicted label (on the 0..4 scale) over all scored rows.
df_results[0].mean()


1.5209942256968372