In [19]:
from pyspark.sql import SparkSession, Row
from pyspark.ml.feature import Tokenizer
import re
from tqdm import tqdm
import tensorflow as tf

In [2]:
# create a local SparkSession
spark = SparkSession.builder \
                .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
                .appName("Normalize data") \
                .getOrCreate()

# define a streaming query
bronze_df = spark.read.format("com.mongodb.spark.sql.DefaultSource") \
                    .option('spark.mongodb.input.uri', 'mongodb://admin:nhanbui@localhost:27017/imcp.bronze.parquet?authSource=admin') \
                    .load()

print(bronze_df.printSchema())

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- caption: string (nullable = true)
 |-- created_time: timestamp (nullable = true)
 |-- howpublished: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- short_caption: string (nullable = true)
 |-- url: string (nullable = true)
 |-- year: string (nullable = true)

None


In [3]:
bronze_rdd = bronze_df.rdd
bronze_rdd.take(5)

[Row(_id=Row(oid='66cdae3c475cfb405938d033'), caption='a kitchen with wooden cabinets on the walls, a stove, multiple drawers, a refrigerator, a counter with fruits, and a well-organized layout for cooking and storage needs.', created_time=datetime.datetime(2024, 8, 28, 0, 45, 15, 155000), howpublished='https://huggingface.co/datasets/laion/220k-GPT4Vision-captions-from-LIVIS', publisher='HuggingFace', short_caption='Well-organized kitchen with wooden cabinets, a stove, multiple drawers, a refrigerator, counter space with fruits, and a clutter-free layout for efficient cooking and storage needs.', url='http://images.cocodataset.org/val2017/000000037777.jpg', year='2023'),
 Row(_id=Row(oid='66cdae3c475cfb405938d035'), caption='multiple people wearing sweatshirts, a person on a bicycle performing tricks, and another person mid-jump off a skateboarding ramp. The backdrop consists of buildings with graffiti artworks, adding a vibrant feel to the urban setting. The image appears to be set i

In [4]:
# lower case captions
def lower_case(row):
    lowered_caption = row['caption'].lower()
    lowered_shrtcaption = row['short_caption'].lower()
    return Row(_id=row['_id'], caption=lowered_caption, created_time=row['created_time'], howpublished=row['howpublished'],
               publisher=row['publisher'], short_caption=lowered_shrtcaption, url=row['url'], year=row['year'])

bronze_rdd = bronze_rdd.map(lower_case)
bronze_rdd.take(5)

[Row(_id=Row(oid='66cdae3c475cfb405938d033'), caption='a kitchen with wooden cabinets on the walls, a stove, multiple drawers, a refrigerator, a counter with fruits, and a well-organized layout for cooking and storage needs.', created_time=datetime.datetime(2024, 8, 28, 0, 45, 15, 155000), howpublished='https://huggingface.co/datasets/laion/220k-GPT4Vision-captions-from-LIVIS', publisher='HuggingFace', short_caption='well-organized kitchen with wooden cabinets, a stove, multiple drawers, a refrigerator, counter space with fruits, and a clutter-free layout for efficient cooking and storage needs.', url='http://images.cocodataset.org/val2017/000000037777.jpg', year='2023'),
 Row(_id=Row(oid='66cdae3c475cfb405938d035'), caption='multiple people wearing sweatshirts, a person on a bicycle performing tricks, and another person mid-jump off a skateboarding ramp. the backdrop consists of buildings with graffiti artworks, adding a vibrant feel to the urban setting. the image appears to be set i

In [5]:
# remove punctuation
def remove_punctuations(row):
    caption = re.sub(r'[^\w\d\s]', '', row['caption'])
    shrt_caption = re.sub(r'[^\w\d\s]', '', row['short_caption'])
    return Row(_id=row['_id'], caption=caption, created_time=row['created_time'], howpublished=row['howpublished'],
            publisher=row['publisher'], short_caption=shrt_caption, url=row['url'], year=row['year'])
    
bronze_rdd = bronze_rdd.map(remove_punctuations)
bronze_rdd.take(5)

[Row(_id=Row(oid='66cdae3c475cfb405938d033'), caption='a kitchen with wooden cabinets on the walls a stove multiple drawers a refrigerator a counter with fruits and a wellorganized layout for cooking and storage needs', created_time=datetime.datetime(2024, 8, 28, 0, 45, 15, 155000), howpublished='https://huggingface.co/datasets/laion/220k-GPT4Vision-captions-from-LIVIS', publisher='HuggingFace', short_caption='wellorganized kitchen with wooden cabinets a stove multiple drawers a refrigerator counter space with fruits and a clutterfree layout for efficient cooking and storage needs', url='http://images.cocodataset.org/val2017/000000037777.jpg', year='2023'),
 Row(_id=Row(oid='66cdae3c475cfb405938d035'), caption='multiple people wearing sweatshirts a person on a bicycle performing tricks and another person midjump off a skateboarding ramp the backdrop consists of buildings with graffiti artworks adding a vibrant feel to the urban setting the image appears to be set in an urban skate park

In [30]:
# tokenize the caption and short_caption
def tokenize(row):
    caption = row['caption'].split()
    shrt_caption = row['short_caption'].split()
    return Row(_id=row['_id'], caption=caption, created_time=row['created_time'], howpublished=row['howpublished'],
            publisher=row['publisher'], short_caption=shrt_caption, url=row['url'], year=row['year'])

temp = bronze_rdd.map(tokenize)
temp.take(5)

[Row(_id=Row(oid='66cdae3c475cfb405938d033'), caption=['a', 'kitchen', 'with', 'wooden', 'cabinets', 'on', 'the', 'walls', 'a', 'stove', 'multiple', 'drawers', 'a', 'refrigerator', 'a', 'counter', 'with', 'fruits', 'and', 'a', 'wellorganized', 'layout', 'for', 'cooking', 'and', 'storage', 'needs'], created_time=datetime.datetime(2024, 8, 28, 0, 45, 15, 155000), howpublished='https://huggingface.co/datasets/laion/220k-GPT4Vision-captions-from-LIVIS', publisher='HuggingFace', short_caption=['wellorganized', 'kitchen', 'with', 'wooden', 'cabinets', 'a', 'stove', 'multiple', 'drawers', 'a', 'refrigerator', 'counter', 'space', 'with', 'fruits', 'and', 'a', 'clutterfree', 'layout', 'for', 'efficient', 'cooking', 'and', 'storage', 'needs'], url='http://images.cocodataset.org/val2017/000000037777.jpg', year='2023'),
 Row(_id=Row(oid='66cdae3c475cfb405938d035'), caption=['multiple', 'people', 'wearing', 'sweatshirts', 'a', 'person', 'on', 'a', 'bicycle', 'performing', 'tricks', 'and', 'another'

In [7]:
# create corpus from words in caption and short_caption

# TODO: collect words from caption and short_caption
sentences = []
for row in tqdm(bronze_rdd.collect()):
    sentences += [row['caption'], row['short_caption']]
    
# TODO: use Tensorflow to create corpus
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='!“"”#$%&()*+,-./:;<=>?@[\]^`{|}~ ')
# create vocabulary (corpus) for each word in N sentences
tokenizer.fit_on_texts(sentences)

print(len(tokenizer.word_index))
tokenizer.word_index

100%|██████████| 217868/217868 [00:00<00:00, 390455.56it/s]


In [28]:
def tokenize_sequence(row, corpus):
    sentences = [row['caption'], row['short_caption']]
    tokens= []
    for sentence in sentences:
        words = sentence.split()
        tokens.append([corpus[word] for word in words])
    return Row(_id=row['_id'], 
                caption=tokens[0], 
                created_time=row['created_time'], 
                howpublished=row['howpublished'],
                publisher=row['publisher'],
                short_caption=tokens[1], 
                url=row['url'], year=row['year'])

In [29]:
corpus = dict(tokenizer.word_index)
temp = bronze_rdd.map(lambda x: tokenize_sequence(x, corpus))
print(temp.take(5))

[Row(_id=Row(oid='66cdae3c475cfb405938d033'), caption=[1, 109, 4, 95, 363, 7, 2, 337, 1, 439, 30, 484, 1, 361, 1, 563, 4, 710, 3, 1, 1937, 1595, 29, 729, 3, 719, 3816], created_time=datetime.datetime(2024, 8, 28, 0, 45, 15, 155000), howpublished='https://huggingface.co/datasets/laion/220k-GPT4Vision-captions-from-LIVIS', publisher='HuggingFace', short_caption=[1937, 109, 4, 95, 363, 1, 439, 30, 484, 1, 361, 563, 136, 4, 710, 3, 1, 10495, 1595, 29, 2940, 729, 3, 719, 3816], url='http://images.cocodataset.org/val2017/000000037777.jpg', year='2023'), Row(_id=Row(oid='66cdae3c475cfb405938d035'), caption=[30, 52, 23, 2760, 1, 36, 7, 1, 362, 519, 1386, 3, 50, 36, 5179, 622, 1, 688, 1075, 2, 97, 504, 6, 125, 4, 1242, 2556, 1033, 1, 391, 1621, 10, 2, 121, 32, 2, 14, 43, 10, 19, 107, 5, 13, 121, 1375, 230, 9, 1, 1828, 46, 29, 891, 445, 242, 1, 273], created_time=datetime.datetime(2024, 8, 28, 0, 45, 15, 155000), howpublished='https://huggingface.co/datasets/laion/220k-GPT4Vision-captions-from-L

In [31]:
spark.stop()