<a href="https://colab.research.google.com/github/Lrraymond13/collab/blob/master/TensorFlow_with_GPU_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate ELMo Embeddings with GPU

* This notebook generates ELMo embeddings in chunks, reading the input text from Google Drive and writing the embeddings back to Google Drive.



## Enabling and testing the GPU

First, you'll need to enable GPUs for the notebook:

- Navigate to Edit→Notebook Settings
- Select GPU from the Hardware Accelerator drop-down

* To avoid saving output, check the "Omit code cell output when saving this notebook" option

- At the moment, the ELMo TensorFlow Hub module is not compatible with TensorFlow 2, so we need to explicitly select TensorFlow 1.x before importing it

In [0]:
# 
# tf.report_tensor_allocations_upon_oom(True)
# Select TensorFlow 1.x for this runtime; this must run before tensorflow is imported.
%tensorflow_version 1.x
import tensorflow as tf
# Log the device each op is placed on.
tf.debugging.set_log_device_placement(True)
device_name = tf.test.gpu_device_name()
# Abort early unless TensorFlow reports the first GPU; the rest of the notebook expects one.
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [0]:
import pandas as pd
import numpy as np
import os
import tensorflow_hub as hub

In [0]:
# Memory-footprint support: install and import the packages used by printm().
# Symlink nvidia-smi into /usr/bin (presumably so GPUtil can locate it -- TODO confirm).
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: Colab provides only one GPU and does not guarantee it;
# GPUs[0] raises IndexError when no GPU was detected.
gpu = GPUs[0]
def printm():
    """Print the current host-RAM and GPU-memory usage.

    The first line reports the available system RAM and the resident size of
    this Python process. The second line reports free/used/total memory and
    the memory utilisation of the first GPU.

    GPU statistics are queried again on every call: the objects returned by
    ``GPU.getGPUs()`` are snapshots taken at query time, so reusing one
    captured when the cell was first run would keep printing stale numbers.

    Returns:
        None. Everything is written to stdout.
    """
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available),
          " | Proc size: " + humanize.naturalsize(process.memory_info().rss))
    # Take a fresh snapshot instead of relying on the module-level `gpu`.
    current_gpus = GPU.getGPUs()
    if not current_gpus:
        # Colab does not guarantee a GPU; report that instead of raising IndexError.
        print("GPU RAM: no GPU detected")
        return
    first_gpu = current_gpus[0]
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(
        first_gpu.memoryFree, first_gpu.memoryUsed, first_gpu.memoryUtil*100, first_gpu.memoryTotal))
printm() 

In [0]:
printm() 

Mount Google Drive Folder with Collab Data

In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used by this notebook live under it.
drive.mount('/content/drive')

In [0]:
# Session configuration passed to tf.Session in fetch_elmo_embeds.
config = tf.ConfigProto()
# Let the GPU memory allocation grow over time as it is needed.
config.gpu_options.allow_growth = True

In [0]:
# Root of the project data on the mounted Google Drive.
HOME_ROOT_DIR = '/content/drive/My Drive/collab_data'
# Directory holding the per-year parsed CSV files read by feed_text_to_elmo.
PARSED_PATVIEW_DOWNLOADS_DIR = os.path.join(HOME_ROOT_DIR, 'parsed_api_downloads')

print(PARSED_PATVIEW_DOWNLOADS_DIR)
# Output path template, relative to HOME_ROOT_DIR; filled with (year, chunk number).
embeds_filename = 'elmo_embeds/{}/chunk_{}.csv'

Define the Code that Generates ELMo Embeddings

In [0]:

def fetch_elmo_embeds(year, chunk_number, ids, list_sentences, column_prefix):
    """Embed one chunk of sentences with ELMo and write the result to a CSV file.

    Args:
        year: Year of the data; its string form names the output sub-directory.
        chunk_number: Identifier of the chunk, inserted into the output file name.
        ids: Row labels for the embeddings, one per sentence.
        list_sentences: List of untokenized sentences (strings). The module's
            default signature tokenizes each string by splitting on spaces.
        column_prefix: Prefix for the embedding column names; columns are named
            ``<column_prefix>_<zero-padded position>``.

    Returns:
        None. The embeddings are written to
        ``HOME_ROOT_DIR/elmo_embeds/<year>/chunk_<chunk_number>.csv`` with the
        ids as the first data column.

    Raises:
        tf.errors.ResourceExhaustedError: If the GPU runs out of memory.
        ValueError: If ``ids`` does not match the number of embedded rows.
    """
    try:
        # trainable=True lets the 4 scalar layer weights (as described in the
        # paper) be trained; all other module parameters stay fixed.
        elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)
        # The "default" output is a fixed mean-pooling of all contextualized
        # word representations, with shape [batch_size, 1024].
        embeddings = elmo(
            list_sentences, signature="default", as_dict=True)["default"]
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            embeds = sess.run(embeddings)
    finally:
        # Absolutely crucial: without this, tf holds on to memory allocated in
        # previous sessions and throws OOM. Running it in `finally` also cleans
        # up after a failed chunk, so a caller that catches the error and moves
        # on to the next chunk does not inherit the old graph.
        tf.keras.backend.clear_session()

    # Derive the number of columns from the result instead of hard-coding 1024.
    column_names = ['{}_{}'.format(column_prefix, create_column_name(i))
                    for i in range(embeds.shape[1])]
    embeds_df = pd.DataFrame(index=ids, data=embeds, columns=column_names)
    print(embeds.shape)

    output_path = os.path.join(HOME_ROOT_DIR, embeds_filename.format(str(year), chunk_number))
    # to_csv does not create missing directories, so make sure the per-year
    # output folder exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    embeds_df.reset_index(drop=False).to_csv(output_path)
    return None


def create_column_name(n):
    """Return ``str(n)`` left-padded with zeros to four characters.

    Strings of one, two or three characters receive three, two or one leading
    zeros respectively; any other length is returned unchanged.
    """
    text = str(n)
    # Leading zeros to prepend, keyed by the length of the unpadded text.
    padding_by_length = {1: '000', 2: '00', 3: '0'}
    return padding_by_length.get(len(text), '') + text


def combine_title_abstract(x):
    """Merge a (title, abstract) pair into a single text string.

    ``x`` is a two-element pandas Series whose values are, in order, the title
    and the abstract. When both are present they are joined with ' . '; when
    only one is present that one is returned as is.
    """
    title, abstract = x.values
    if pd.isnull(abstract):
        # Not expected in practice: only patents with abstracts are processed.
        return title
    if pd.isnull(title):
        return abstract
    return ' . '.join((title, abstract))


def feed_text_to_elmo(yr, column_prefix='abstract_embeds', n_chunks=500):
    """Embed the title + abstract text of one year's patents with ELMo, chunk by chunk.

    Reads ``patview_citations_df4_<yr>.csv`` from PARSED_PATVIEW_DOWNLOADS_DIR,
    keeps the rows that have a cleaned abstract, combines title and abstract
    into one string per patent, splits the rows into ``n_chunks`` pieces and
    calls ``fetch_elmo_embeds`` on each piece, which writes one CSV per chunk.

    Args:
        yr: Year used to build the input file name and passed on for the output path.
        column_prefix: Prefix for the embedding column names in the output files.
        n_chunks: Number of pieces the rows are split into (default 500).

    Returns:
        None. Out-of-memory and ValueError failures of a single chunk are
        reported on stdout and the loop moves on to the next chunk.
    """
    parsed_df_format4 = 'patview_citations_df4_{}.csv'
    input_filename = os.path.join(PARSED_PATVIEW_DOWNLOADS_DIR, parsed_df_format4.format(str(yr)))
    print('Reading in ', input_filename)
    df = pd.read_csv(input_filename, usecols=['title_text', 'first_claim_text_cleaned',
                                              'abstract_text_cleaned', 'patent_id_integer'])
    abstract_colname = 'abstract_text_cleaned'
    df = df.sort_values('patent_id_integer').set_index('patent_id_integer')

    # Only patents that have an abstract are embedded.
    df_text = df.loc[~pd.isnull(df[abstract_colname]), ['title_text', abstract_colname]].copy()
    print('Original df ', df.shape, ' abstract text shape ', df_text.shape)
    del df  # free the full frame before the memory-heavy embedding loop
    if df_text.empty:
        print('Done with embeddings')
        return None

    df_text['title_plus_abstract'] = df_text[['title_text', abstract_colname]].apply(
        combine_title_abstract, axis=1)
    print('Text df shape ', df_text.shape)

    # Split the underlying values/index rather than the Series itself
    # (the Series split hit a numpy bug).
    text_splits = np.array_split(df_text[['title_plus_abstract']].values, n_chunks)
    index_splits = np.array_split(df_text[['title_plus_abstract']].index, n_chunks)
    for i, (chunk_index, chunk_values) in enumerate(zip(index_splits, text_splits)):
        # The values come from a one-column frame, so take that column to get
        # the 1-D array of strings that ELMo needs.
        chunk_text = chunk_values[:, 0] if len(chunk_values.shape) > 1 else chunk_values
        print('On iteration ', i)
        if len(chunk_text) == 0:
            # Happens when there are fewer rows than chunks; nothing to embed.
            print('Skipping empty chunk')
            continue
        try:
            fetch_elmo_embeds(yr, i, chunk_index, chunk_text.tolist(), column_prefix)
        except tf.errors.ResourceExhaustedError:
            print('Chunk number OOM Error')
            # printm() prints the memory report itself and returns None.
            printm()
        except ValueError:
            print('Value Error from Dimension issue', 'Chunk shape ', chunk_text.shape,
                  'chunk strings', chunk_text)
    print('Done with embeddings')
    return None
              


In [0]:
# Generate and save the ELMo embeddings for the 2009 input file.
feed_text_to_elmo(2009)

In [0]:
# Clear the Keras/TensorFlow global state so memory held by earlier sessions is released.
tf.keras.backend.clear_session()

In [0]:
# Scratch cell: load the 2003 parsed CSV with the same four columns that
# feed_text_to_elmo reads (presumably for ad-hoc inspection -- TODO confirm it is still needed).
parsed_df_format4 = 'patview_citations_df4_{}.csv'
yr = 2003
input_filename = os.path.join(PARSED_PATVIEW_DOWNLOADS_DIR, parsed_df_format4.format(str(yr)))
df = pd.read_csv(input_filename, usecols=['title_text', 'first_claim_text_cleaned',
                                    'abstract_text_cleaned', 'patent_id_integer'])