# Text Similarity Analysis with tf.transform, Dataflow, and BigQuery

This tutorial shows how to perform document similarity analysis as follows:

* Raw text data is stored in GCS (or local file system)
* Data is processed using Apache Beam (on Dataflow) + tf.transform to produce
  * TF.IDF
  * Text Embedding using tf.hub text_embeddings modules
  * Bag of Words (BOW) indices (vocab ids)
* Each doc + its Embeddings is inserted into BigQuery
* Similarity Analysis between docs in BigQuery is performed using Cosine Similarity

The dataset used is **Reuters-21578**. Details of this corpus can be found at this link: http://www.daviddlewis.com/resources/testcollections/reuters21578/




## Import Libraries

In [1]:
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.coders as tft_coders

from tensorflow_transform.beam import impl
from tensorflow_transform.beam.tft_beam_io import transform_fn_io

from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import dataset_metadata

import apache_beam as beam

Instructions for updating:
Use the retry module or similar alternatives.


## (Optional) Download Reuters Corpus

The **Reuters-21578** corpus can be downloaded from a publicly accessible Google Cloud Storage bucket: **gs://cloudml-textanalysis/data/reuters.tar.gz**. After downloading and unzipping the archive folder, the corpus is found divided into **train** and **eval** directories. Each directory contains documents, one for each article, organized in sub-directories, one for each **topic**.

The following is the structure of the unzipped directories:

* train
    * ...
    * coconut **# topic**
    * coffee
    * ..
    * gas **# topic**
        * 0000218 **# document**
        * 0000223
        * ...
    * gold
        * 0000319
        * ...
    * ...

In [2]:
# %%bash 

# gsutil cp gs://cloudml-textanalysis/data/reuters.tar.gz ./data
# gunzip -c data/reuters.tar.gz | tar xopf -

## Set Global Flags

In [3]:
import os

class Params:
    """Namespace holding the global settings used by the notebook cells."""

    # tf.hub text embedding module used to encode the titles
    MODULE_URL = 'https://tfhub.dev/google/universal-sentence-encoder/1'
    # MODULE_URL = 'https://tfhub.dev/google/nnlm-en-dim50/1'

    # BigQuery sink
    GCP_PROJECT_ID = 'ksalama-gcp-playground'
    BQ_DATASET = 'playground_ds'
    BQ_TABLE = 'reuters_embeddings'

    # root of all data and model locations
    ROOT = '.'  # 'gs://cloudml-textanalysis' | '.'

    # raw input data
    DATA_DIR = os.path.join(ROOT, 'data/reuters')
    RAW_TRAIN_DATA_DIR = os.path.join(DATA_DIR, 'train')
    RAW_EVAL_DATA_DIR = os.path.join(DATA_DIR, 'eval')

    # transformed output data
    TRANSFORMED_DATA_DIR = os.path.join(DATA_DIR, 'transformed')
    TRANSFORMED_TRAIN_DATA_FILE_PREFIX = os.path.join(TRANSFORMED_DATA_DIR, 'train/docs-encoding')
    TRANSFORMED_EVAL_DATA_FILE_PREFIX = os.path.join(TRANSFORMED_DATA_DIR, 'eval/docs-encoding')
    TRANSFORMED_ENCODING = 'text'

    TEMP_DIR = os.path.join(DATA_DIR, 'tmp')

    # transformation artefacts
    MODELS_DIR = os.path.join(ROOT, 'models/reuters')
    TRANSFORM_ARTEFACTS_DIR = os.path.join(MODELS_DIR, 'transform')

    RUNNER = 'DirectRunner'  # DirectRunner | DataflowRunner

    TEST_MODE = True  # process only few docs for testing
    TEST_MODE_SAMPLE = 3  # number of topics & number of docs in each topic to process

## Create Sink BigQuery Table Schema

In [4]:
def create_bigquey_schema():
    """Build the schema of the BigQuery table that receives the embeddings.

    Returns:
        A ``bigquery.TableSchema`` with three fields, in this order:
        ``topic`` (nullable string), ``title`` (nullable string) and
        ``embeddings`` (repeated float).
    """
    from apache_beam.io.gcp.internal.clients import bigquery

    # (name, type, mode) of every column, in table order
    columns = (
        ('topic', 'string', 'nullable'),
        ('title', 'string', 'nullable'),
        ('embeddings', 'float', 'repeated'),
    )

    schema = bigquery.TableSchema()
    for column_name, column_type, column_mode in columns:
        field = bigquery.TableFieldSchema()
        field.name = column_name
        field.type = column_type
        field.mode = column_mode
        schema.fields.append(field)

    return schema

## Define Data Processing Pipeline

In [5]:
import os

# Characters passed to tf.string_split in preprocessing_fn to tokenize the clean titles
DELIMITERS = '.,!?() '
# Passed as top_k when mapping tokens to ids in preprocessing_fn;
# VOCAB_SIZE + 1 is passed as the vocabulary size to tft.tfidf
VOCAB_SIZE = 50000

def get_paths(directory):
    """List the entries of `directory` as paths prefixed with `directory`.

    Entries whose name contains '.DS_' (e.g. macOS '.DS_Store' files) are
    ignored. When Params.TEST_MODE is on, at most Params.TEST_MODE_SAMPLE
    entries are returned.

    Args:
        directory: path of the directory to list; it is read through
            tf.gfile.ListDirectory.

    Returns:
        A list of paths, one per retained entry.
    """
    import tensorflow as tf

    # Drop the metadata entries *before* sampling, so that they cannot use up
    # one of the TEST_MODE_SAMPLE slots and shrink the sample.
    entries = [
        entry for entry in tf.gfile.ListDirectory(directory)
        if '.DS_' not in entry
    ]

    if Params.TEST_MODE:
        entries = entries[:Params.TEST_MODE_SAMPLE]

    return [os.path.join(directory, entry) for entry in entries]

def get_name_and_content(file_name):
    """Read a whole file and return it together with its name.

    Args:
        file_name: path of the file to read; it is opened with
            tf.gfile.GFile.

    Returns:
        A (file_name, content) tuple.
    """
    import tensorflow as tf

    # The context manager closes the file handle once the content is read,
    # instead of leaving it open until garbage collection.
    with tf.gfile.GFile(file_name) as reader:
        content = reader.read()

    return file_name, content

def get_title_and_topic(name_and_content):
    """Extract the topic and the title of a document.

    The topic is the name of the directory holding the file; the title is the
    second '\\r'-separated segment of the content, with newlines removed.

    Args:
        name_and_content: a (file_name, content) pair. It is received as one
            argument and unpacked in the body, which keeps the original
            one-argument call convention while being valid on both
            Python 2 and Python 3 (tuple parameters were removed in Python 3).

    Returns:
        A (topic, title) tuple.

    Raises:
        IndexError: if file_name has fewer than two '/'-separated components
            or content has fewer than two '\\r'-separated segments.
    """
    file_name, content = name_and_content

    topic = file_name.split('/')[-2]
    title = content.split('\r')[1].replace('\n', '')
    return topic, title

def clean_text(text):
    """Normalize a piece of text for vocabulary and embedding extraction.

    The text is lower-cased and tokenized with NLTK; non-alphanumeric
    characters are removed from every token; empty tokens and English stop
    words are dropped.

    Args:
        text: the raw text (a document title in this pipeline).

    Returns:
        The retained tokens joined by single spaces.
    """
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    import nltk

    # download the NLTK resources only when they are not available locally
    try:
        nltk.data.find('corpora/stopwords')
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
        nltk.download('stopwords')

    # a set gives constant-time membership tests for the per-token check below
    stop_words = set(stopwords.words('english'))

    tokens = word_tokenize(text.lower())
    alnum_tokens = [''.join(c for c in token if c.isalnum()) for token in tokens]

    kept_tokens = [
        token.strip() for token in alnum_tokens
        if token != '' and token not in stop_words
    ]
    return ' '.join(kept_tokens)

def to_dictionary(input_tuple):
    """Turn a (topic, raw_title, clean_title) sequence into a feature dict.

    Args:
        input_tuple: indexable whose first three items are the topic, the raw
            title and the clean title, in that order.

    Returns:
        A dict with the keys 'topic', 'raw_title' and 'clean_title'.
    """
    return {
        'topic': input_tuple[0],
        'raw_title': input_tuple[1],
        'clean_title': input_tuple[2],
    }

def get_raw_metadata():
    """Describe the raw input features for tf.transform.

    Returns:
        A DatasetMetadata whose schema declares 'topic', 'raw_title' and
        'clean_title' as tf.string columns with an empty shape and a fixed
        column representation.
    """
    feature_names = ('topic', 'raw_title', 'clean_title')

    # all raw features share the same column definition
    schema = dataset_schema.Schema({
        name: dataset_schema.ColumnSchema(
            tf.string, [], dataset_schema.FixedColumnRepresentation())
        for name in feature_names
    })

    return dataset_metadata.DatasetMetadata(schema)

def get_embeddings(text):
    """Apply the tf.hub module at Params.MODULE_URL to `text`.

    Args:
        text: the input handed to the module (the clean titles in
            preprocessing_fn).

    Returns:
        The output of the module for `text`.
    """
    import tensorflow_hub as hub

    text_encoder = hub.Module(Params.MODULE_URL)
    return text_encoder(text)

def preprocessing_fn(input_features):
    """tf.transform preprocessing function of the pipeline.

    Args:
        input_features: dict of input tensors with the keys 'topic',
            'raw_title' and 'clean_title'.

    Returns:
        A dict with the keys 'topic', 'title', 'bow', 'tf_idf' and
        'embeddings'.
    """
    clean_titles = input_features['clean_title']

    # text embeddings extracted with the tf.hub module
    title_embeddings = tft.apply_function(get_embeddings, clean_titles)

    # split the titles into tokens, then map the tokens to vocabulary ids
    tokens = tf.string_split(clean_titles, DELIMITERS)
    token_ids = tft.string_to_int(tokens, top_k=VOCAB_SIZE)

    # bag of words (bow) indices and their tf.idf weights
    bow_indices, tf_idf_weights = tft.tfidf(token_ids, VOCAB_SIZE + 1)

    return {
        'topic': input_features['topic'],
        'title': input_features['raw_title'],
        'bow': bow_indices,
        'tf_idf': tf_idf_weights,
        'embeddings': title_embeddings,
    }


def to_bq_row(entry):
    """Convert a transformed record into a row for the BigQuery table.

    Args:
        entry: mapping with the keys 'topic', 'title' and 'embeddings'; other
            keys are ignored.

    Returns:
        A dict with 'topic' and 'title' copied from `entry`, and 'embeddings'
        as a list of Python floats rounded to 3 decimals.
    """
    rounded_embeddings = []
    for value in entry['embeddings']:
        rounded_embeddings.append(round(float(value), 3))

    return dict(
        topic=entry['topic'],
        title=entry['title'],
        embeddings=rounded_embeddings,
    )


############ Beam Pipeline Functions ####################################

def read_raw_data(pipeline, source, step):
    """Add the steps that read and prepare the raw documents to `pipeline`.

    The topic directories under `source` are listed when the pipeline is
    built; the files are listed, read and parsed by the pipeline itself.

    Args:
        pipeline: the Beam pipeline the steps are added to.
        source: directory holding one sub-directory per topic.
        step: prefix of the step labels (e.g. 'Train' or 'Eval').

    Returns:
        A PCollection of dicts with the keys 'topic', 'raw_title' and
        'clean_title'.
    """
    raw_data = (
        pipeline
        | '{} - Get Directories'.format(step) >> beam.Create(get_paths(source))
        | '{} - Get Files'.format(step) >> beam.FlatMap(get_paths)
        | '{} - Read Content'.format(step) >> beam.Map(get_name_and_content)
        | '{} - Get Title & Topic'.format(step) >> beam.Map(get_title_and_topic)
        # The (topic, title) pair is indexed rather than unpacked in the
        # lambda signature: tuple parameters are Python 2-only syntax.
        | '{} - Clean Title'.format(step) >> beam.Map(
            lambda topic_title: (topic_title[0], topic_title[1], clean_text(topic_title[1])))
        | '{} - To Dictionary'.format(step) >> beam.Map(to_dictionary)
    )

    return raw_data

def write_to_files(dataset, sink, encoding, step):
    """Add a step that writes the data of `dataset` to files.

    Args:
        dataset: a (data, metadata) pair.
        sink: prefix of the output file paths.
        encoding: 'tfrecords' writes '.tfrecords' files of example protos
            encoded with the metadata schema; any other value writes '.txt'
            text files.
        step: prefix of the step label (e.g. 'Train' or 'Eval').
    """
    data, metadata = dataset

    # pick the sink matching the requested encoding
    if encoding == 'tfrecords':
        writer = beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix=sink,
            file_name_suffix=".tfrecords",
            coder=tft_coders.example_proto_coder.ExampleProtoCoder(metadata.schema))
    else:
        writer = beam.io.textio.WriteToText(
            file_path_prefix=sink,
            file_name_suffix=".txt")

    _ = data | '{} - Write Transformed Data'.format(step) >> writer
        
def write_to_bq(dataset, project_id, bq_dataset_name, bq_table_name, bq_table_schema, step):
    """Add the steps that write the data of `dataset` to a BigQuery table.

    The table is created if needed and its content is truncated before the
    write.

    Args:
        dataset: a (data, metadata) pair; only the data is used.
        project_id: project of the target table.
        bq_dataset_name: BigQuery dataset of the target table.
        bq_table_name: name of the target table.
        bq_table_schema: schema of the target table.
        step: prefix of the step labels (e.g. 'Train' or 'Eval').
    """
    data, _ = dataset

    bq_rows = data | '{} - Convert to Valid BQ Row'.format(step) >> beam.Map(to_bq_row)

    bq_sink = beam.io.WriteToBigQuery(
        project=project_id,
        dataset=bq_dataset_name,
        table=bq_table_name,
        schema=bq_table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
    )
    _ = bq_rows | '{} - Write to BigQuery'.format(step) >> bq_sink
    
#####################################################################################################

def run_transformation_pipeline(runner, options):
    """Build and run the Beam pipeline that encodes the train and eval docs.

    The train data is analyzed and transformed with preprocessing_fn; the
    eval data is transformed with the transform function produced from the
    train data. Both transformed datasets are written to files and to
    BigQuery, and the transform function is written to
    Params.TRANSFORM_ARTEFACTS_DIR.

    Args:
        runner: the Beam runner handed to beam.Pipeline.
        options: dict of pipeline options, expanded as keyword arguments of
            beam.pipeline.PipelineOptions.
    """
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    with beam.Pipeline(runner, options=pipeline_options) as pipeline:
        with impl.Context(Params.TEMP_DIR):

            ################## train data ##################

            step = 'Train'

            ### read raw train data
            raw_train_data = read_raw_data(pipeline, Params.RAW_TRAIN_DATA_DIR, step)

            ### create a dataset from the train data and schema
            raw_train_dataset = (raw_train_data, get_raw_metadata())

            ### analyze and transform raw_train_dataset to produce transformed_train_dataset and transform_fn
            transformed_train_dataset, transform_fn = (
                raw_train_dataset
                | 'Analyze & Transform' >> impl.AnalyzeAndTransformDataset(
                    preprocessing_fn
                )
            )

            ### write transformed train data to files
            write_to_files(
                transformed_train_dataset,
                Params.TRANSFORMED_TRAIN_DATA_FILE_PREFIX,
                Params.TRANSFORMED_ENCODING,
                step,
            )

            ### write train embeddings to BigQuery
            # NOTE(review): the train and eval writes target the same table and
            # write_to_bq uses WRITE_TRUNCATE, so one write may clobber the
            # other - confirm this is intended.
            write_to_bq(
                transformed_train_dataset,
                Params.GCP_PROJECT_ID,
                Params.BQ_DATASET,
                Params.BQ_TABLE,
                create_bigquey_schema(),
                step
            )

            ################## eval data ##################

            step = 'Eval'

            ### read raw eval data
            raw_eval_data = read_raw_data(pipeline, Params.RAW_EVAL_DATA_DIR, step)

            ### create a dataset from the eval data and schema
            raw_eval_dataset = (raw_eval_data, get_raw_metadata())

            ### transform eval data based on produced transform_fn (from analyzing train_data)
            transformed_eval_dataset = (
                (raw_eval_dataset, transform_fn)
                | '{} - Transform'.format(step) >> impl.TransformDataset()
            )

            ### write transformed eval data to files
            # the eval sink must receive the eval dataset, not the train one
            write_to_files(
                transformed_eval_dataset,
                Params.TRANSFORMED_EVAL_DATA_FILE_PREFIX,
                Params.TRANSFORMED_ENCODING,
                step,
            )

            ### write eval embeddings to BigQuery
            write_to_bq(
                transformed_eval_dataset,
                Params.GCP_PROJECT_ID,
                Params.BQ_DATASET,
                Params.BQ_TABLE,
                create_bigquey_schema(),
                step
            )

            ################## write transformation artefacts ##################

            (
                transform_fn
                | 'Write Transform Artefacts' >> transform_fn_io.WriteTransformFn(
                    Params.TRANSFORM_ARTEFACTS_DIR
                )
            )


In [6]:
%%writefile requirements.txt
tensorflow>=1.7 
tensorflow-transform==0.6.0 
tensorflow-hub

Overwriting requirements.txt


In [7]:
# Show which documents will be processed. Every print call takes a single
# argument, so the output is the same on Python 2 and Python 3.
print('Source data location: {}'.format(Params.RAW_TRAIN_DATA_DIR))

if Params.TEST_MODE:
    print("Test Mode is ON")
    print("")

    # list the sampled topic directories and the sampled docs of each one
    doc_count = 0
    for directory in get_paths(Params.RAW_TRAIN_DATA_DIR):
        print(directory)
        print("-------------------------")
        for doc in get_paths(directory):
            print(doc)
            doc_count += 1
        print("")

    print('Topic: {}'.format(Params.TEST_MODE_SAMPLE))
    print('Docs to process: {}'.format(doc_count))

else:
    # this message used to be a bare string expression and was never displayed
    print('Test Mode is Off. All the docs will be processed')

./data/reuters/train
Test Mode is ON
./data/reuters/train/tin
-------------------------
./data/reuters/train/tin/0007261
./data/reuters/train/tin/0000552
./data/reuters/train/tin/0005647

./data/reuters/train/l-cattle
-------------------------
./data/reuters/train/l-cattle/0003173
./data/reuters/train/l-cattle/0004107
./data/reuters/train/l-cattle/0000316

./data/reuters/train/linseed
-------------------------
./data/reuters/train/linseed/0000002
./data/reuters/train/linseed/0008699

Topic: 3
Docs to process: 8


## Run Pipeline

In [None]:
from datetime import datetime

# Remove the outputs of previous runs. Each directory is handled on its own,
# so a missing directory does not prevent the remaining ones from being
# deleted, and genuine deletion errors are no longer silently swallowed.
for stale_dir in (
        Params.TRANSFORM_ARTEFACTS_DIR,
        Params.TRANSFORMED_DATA_DIR,
        Params.TEMP_DIR):
    if tf.gfile.Exists(stale_dir):
        tf.gfile.DeleteRecursively(stale_dir)

print('Transform directories are removed...')
print('')

tf.logging.set_verbosity(tf.logging.ERROR)

job_name = 'process-reuters-docs-{}'.format(datetime.utcnow().strftime('%y%m%d-%H%M%S'))
print('Launching {} job {} ... hang on'.format(Params.RUNNER, job_name))
print("")

# pipeline options, expanded into beam PipelineOptions by run_transformation_pipeline
options = {
    'region': 'europe-west1',
    'staging_location': os.path.join(Params.TEMP_DIR,'staging'),
    'temp_location': Params.TEMP_DIR,
    'job_name': job_name,
    'project': Params.GCP_PROJECT_ID,
    'worker_machine_type': 'n1-standard-1',
    'max_num_workers': 50,
    'requirements_file': 'requirements.txt'
}

if Params.RUNNER == 'DirectRunner':

    # the elapsed time of the job is only reported for the DirectRunner
    time_start = datetime.utcnow()
    print("Transformation job started at {}".format(time_start.strftime("%H:%M:%S")))
    print(".......................................")

    run_transformation_pipeline(Params.RUNNER, options)
    time_end = datetime.utcnow()
    print(".......................................")
    print("Transformation job finished at {}".format(time_end.strftime("%H:%M:%S")))
    print("")
    time_elapsed = time_end - time_start
    print("Job elapsed time: {} ".format(time_elapsed.total_seconds()))

else:

    run_transformation_pipeline(Params.RUNNER, options)
    print("Dataflow job submitted successfully...")

Transform directories are removed...

Launching DirectRunner job process-reuters-docs-180527-141215 ... hang on

Transformation job started at 14:12:15
.......................................


