In [None]:
!pip install tensorflow_hub
!pip install keras tf-models-official pydot graphviz

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
import os

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub

from keras.utils import np_utils

import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization as tokenization

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

# Turn on memory growth for every physical GPU, then report the environment.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # The memory-growth setting has to be identical on all GPUs.
    for device in gpus:
      tf.config.experimental.set_memory_growth(device, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as err:
    # Raised when memory growth is configured after the GPUs were initialized.
    print(err)

# Environment summary: library versions, eager mode and GPU visibility.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
gpu_status = "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE"
print("GPU is", gpu_status)

1 Physical GPUs, 1 Logical GPUs
Version:  2.4.1
Eager mode:  True
Hub version:  0.11.0
GPU is available


In [None]:
# Switch to the data directory and read the combined sentiment table.
# NOTE(review): the file is decoded as ISO-8859-1; the first header then shows
# up with a garbled prefix that a later cell renames — confirm the real encoding.
os.chdir('/notebooks/Data')
combined_df = pd.read_csv(
    'Combined_sentiment_analysis.csv',
    encoding='ISO-8859-1',
)

In [None]:
# The first header carries a garbled prefix (it looks like a byte-order mark
# decoded as ISO-8859-1 — presumably from the CSV's encoding); restore the
# plain column name.
combined_df = combined_df.rename({'ï»¿Datetime': 'Datetime'}, axis='columns')

In [None]:
# Parse the timestamps and derive the weekday index (Monday=0 ... Sunday=6).
combined_df['Datetime'] = pd.to_datetime(combined_df['Datetime'])
combined_df['wkd_value'] = combined_df['Datetime'].dt.dayofweek

# Weekday labels, positioned so that the list index equals `wkd_value`.
values = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Translate the weekday index straight into its label. Unlike np.select with
# seven boolean masks, this does not depend on an implicit integer default
# mixed with string choices: rows without a valid timestamp simply get NaN
# instead of a spurious '0' label (or a dtype error on recent NumPy).
combined_df['wkd'] = combined_df['wkd_value'].map(dict(enumerate(values)))

# display updated DataFrame
combined_df.head(5)

Unnamed: 0,Datetime,Tweet Id,Text,Retweets,LikeCount,QuoteCount,TextClean,TextCleanLemm,tb_score,afinn_score,vader_com,vader_pos,vader_neg,vader_neu,Brand,wkd_value,wkd
0,2013-10-01 23:59:47+00:00,3.851924e+17,TRIPP LITE 6SP 6 Outlets Power Strip $28.99 ht...,0.0,0.0,0.0,tripp lite sp outlets power strip,tripp lite sp outlet power strip,0.0,0.0,0.0,0.0,-0.0,1.0,TrippLite,1,Tue
1,2013-10-01 23:17:54+00:00,3.851818e+17,2yr Warranty Bonus-Tripplite Tripp Lite v1.3 2...,0.0,0.0,0.0,yr warranty bonus tripplite tripp lite v port ...,yr warranty bonus tripplite tripp lite v port ...,0.0,0.0,0.5423,0.304,-0.0,0.696,TrippLite,1,Tue
2,2013-10-01 23:17:22+00:00,3.851817e+17,Tripp Lite Lite ProtectIT SUPER7COAX 120VAC Su...,0.0,0.0,0.0,tripp lite lite protectit super coax vac surge...,tripp lite lite protectit super coax vac surge...,0.333333,3.0,0.5994,0.281,-0.0,0.719,TrippLite,1,Tue
3,2013-10-01 23:13:22+00:00,3.851807e+17,Tripp Lite Surge Protector Strip 120V 6 Outlet...,0.0,0.0,0.0,tripp lite surge protector strip v outlet ft c...,tripp lite surge protector strip v outlet ft c...,-0.166667,0.0,0.0,0.0,-0.0,1.0,TrippLite,1,Tue
4,2013-10-01 22:54:06+00:00,3.851758e+17,Tripp Lite P516-001 VGA/XVGA Monitor Y Splitte...,0.0,0.0,0.0,tripp lite p vga xvga monitor splitter hd xf,tripp lite p vga xvga monitor splitter hd xf,0.0,0.0,0.0,0.0,-0.0,1.0,TrippLite,1,Tue


## Load label encoder

In [None]:
# Rebuild the sentiment-label encoder from the class array saved next to the model.
os.chdir('/notebooks/BERT_model')
my_wd = os.getcwd()
encoder_fname = 'twitter_classes.npy'

# NOTE: allow_pickle=True unpickles the file contents; only load files you trust.
label_classes = np.load(os.path.join(my_wd, encoder_fname), allow_pickle=True)

encoder = LabelEncoder()
encoder.classes_ = label_classes

## Load feature encoder

In [None]:
# Rebuild the weekday-feature encoder from its saved class array.
os.chdir('/notebooks/BERT_model')
my_wd = os.getcwd()
encoder_fname = 'twitter_wkd.npy'

# NOTE: allow_pickle=True unpickles the file contents; only load files you trust.
weekday_classes = np.load(os.path.join(my_wd, encoder_fname), allow_pickle=True)

featureEncoderSaved = LabelEncoder()
featureEncoderSaved.classes_ = weekday_classes

In [None]:
# Checkpoint
print(combined_df['wkd'].value_counts(ascending=True))

print(encoder.classes_)

print(featureEncoderSaved.classes_)

Sun    43835
Sat    44176
Mon    45984
Thu    46395
Fri    46732
Wed    46822
Tue    47328
Name: wkd, dtype: int64
[0 4]
['Fri' 'Mon' 'Sat' 'Sun' 'Thu' 'Tue' 'Wed']


## Load model

In [None]:
# Disabled setup step: when uncommented, this extracts the archive
# /notebooks/twitter_BERT_wWKD.zip into /notebooks/BERT_model.
# Presumably only needed once, before the model directory exists — confirm.
#import zipfile
#os.chdir('/notebooks/BERT_model')
#with zipfile.ZipFile('/notebooks/twitter_BERT_wWKD.zip', 'r') as zip_ref:
#    zip_ref.extractall('/notebooks/BERT_model')

In [None]:
# Create the tokenizer from the vocabulary file stored inside the saved model.
os.chdir('/notebooks/BERT_model')
my_wd = os.getcwd()
model_fname = 'twitter_BERT_wWKD'

vocab_path = os.path.join(my_wd, model_fname, 'assets/vocab.txt')
# Case is preserved (do_lower_case=False); presumably this matches how the
# model was trained — confirm against the training notebook.
tokenizerSaved = tokenization.FullTokenizer(vocab_file=vocab_path,
                                            do_lower_case=False)

In [None]:
# Restore the trained Keras model from its saved directory.
# NOTE(review): a "Dst tensor is not initialized" InternalError at this point
# usually indicates that the GPU ran out of memory — confirm and free the GPU.
saved_model_dir = os.path.join(my_wd, model_fname)
new_model = tf.keras.models.load_model(saved_model_dir)

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run Identity: Dst tensor is not initialized. [Op:Identity]

In [None]:
def encode_names(n, tokenizer):
  """Tokenize one text, add a trailing '[SEP]' token and return the token ids.

  Args:
    n: the text to encode.
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

  Returns:
    Whatever `tokenizer.convert_tokens_to_ids` returns for the token list.
  """
  tokens_with_sep = [*tokenizer.tokenize(n), '[SEP]']
  return tokenizer.convert_tokens_to_ids(tokens_with_sep)

def bert_encode(string_list,
                tokenizer, 
                new_feature,  # [NEW]
                new_feature_class_count,  # [NEW] 
                max_seq_length):  
  """Encode texts plus one additional feature into the model's input dict.

  Every text is tokenized with `encode_names` (tokens followed by '[SEP]')
  and prefixed with the '[CLS]' id.

  Args:
    string_list: texts to encode, one example per element.
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
    new_feature: per-example rows of the additional feature (for example a
      one-hot encoded category); passed through `tf.ragged.constant`.
    new_feature_class_count: number of columns of the 'additional_feature'
      tensor.
    max_seq_length: number of columns of the three token-level tensors.

  Returns:
    Dict with the keys 'input_word_ids', 'input_mask', 'input_type_ids' and
    'additional_feature', each converted to a dense tensor whose second
    dimension is fixed by `max_seq_length` / `new_feature_class_count`.
  """
  string_tokens = tf.ragged.constant([
      encode_names(n, tokenizer) for n in np.array(string_list)])

  # One '[CLS]' id per example, prepended to each token sequence.
  cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*string_tokens.shape[0]
  input_word_ids = tf.concat([cls, string_tokens], axis=-1)

  # Mask is 1 wherever a real token is present.
  input_mask = tf.ones_like(input_word_ids).to_tensor(shape=(None, max_seq_length))

  # Type id 0 for the '[CLS]' position, 1 for the text tokens (incl. '[SEP]').
  type_cls = tf.zeros_like(cls)
  type_tokens = tf.ones_like(string_tokens)
  input_type_ids = tf.concat(
      [type_cls, type_tokens], axis=-1).to_tensor(shape=(None, max_seq_length))
  feature = tf.ragged.constant(new_feature).to_tensor(shape=(None, new_feature_class_count))  # [NEW]

  inputs = {
      'input_word_ids': input_word_ids.to_tensor(shape=(None, max_seq_length)),
      'input_mask': input_mask,
      'input_type_ids': input_type_ids,
      'additional_feature': feature}  # [NEW]

  return inputs

In [None]:
# Draw a reproducible sample of 20% of the rows (fixed random_state).
sample_size = int(0.2 * len(combined_df))
sampleDf = combined_df.sample(n=sample_size, random_state=4222)

# Brand distribution within the sample.
print(sampleDf['Brand'].value_counts(ascending=True))

# Renumber rows 0..n-1; the previous row labels are kept in an 'index' column.
sampleDf = sampleDf.reset_index()
print(sampleDf.shape)

BossAudio        460
Sony            4446
Pwr+            6237
TrippLite       6273
Garmin          6326
PolkAudio       6335
Sangean         6446
Belkin          6499
Apple           7129
YamahaAudio    14103
Name: Brand, dtype: int64
(64254, 18)


In [None]:
# Score the sampled tweets in batches instead of issuing one model.predict
# call per tweet; the per-call overhead dominates when done row by row.
LOG_EVERY_N = 100
BATCH_SIZE = 256       # tweets encoded and scored per predict call
MAX_SEQ_LENGTH = 240
POSITIVE_LABEL = 4     # decoded class value that is flagged as positive

# pandas reads empty CSV fields back as NaN; treat such tweets as empty text
# so the tokenizer always receives a string.
tweets = sampleDf['TextCleanLemm'].fillna('').tolist()

# Encode every weekday once, up front.
wkd_ids = featureEncoderSaved.transform(sampleDf['wkd'].to_numpy())
n_wkd_classes = len(featureEncoderSaved.classes_)

result = []
for start in range(0, len(tweets), BATCH_SIZE):
  stop = start + BATCH_SIZE

  # Fix the one-hot width explicitly rather than relying on later padding.
  dummy_wkd = np_utils.to_categorical(wkd_ids[start:stop],
                                      num_classes=n_wkd_classes)  # encodes weekday

  inputs = bert_encode(string_list=tweets[start:stop],
                       tokenizer=tokenizerSaved,
                       new_feature=dummy_wkd,
                       new_feature_class_count=n_wkd_classes,
                       max_seq_length=MAX_SEQ_LENGTH)

  # assumes predict returns one row of class scores per tweet — TODO confirm
  prediction = new_model.predict(inputs)
  is_positive = encoder.classes_[np.argmax(prediction, axis=-1)] == POSITIVE_LABEL
  result.extend(int(flag) for flag in is_positive)

  # Same progress message as before: one line for every LOG_EVERY_N-th tweet.
  for offset, flag in enumerate(is_positive):
    i = start + offset
    if i % LOG_EVERY_N == 0:
      print('Tweet ', i, ' is', 'positive' if flag else 'negative')

sampleDf["Sentiment"] = result

In [None]:
# Write the scored sample (including the new "Sentiment" column) to the data
# directory, without the DataFrame index.
os.chdir('/notebooks/Data')
sampleDf.to_csv(path_or_buf="Brand_BERT_sentiment2.csv", index=False)