In [38]:
from pyspark.sql import SparkSession, Row
from pyspark.ml.feature import Tokenizer
import re
from tqdm import tqdm
import tensorflow as tf
import pymongo

In [39]:
# create a local SparkSession with the MongoDB Spark connector package configured
spark = SparkSession.builder \
                .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
                .appName("Normalize data") \
                .getOrCreate()

# define a batch read (spark.read, not a streaming query) of the bronze data
# NOTE(review): the connection URI below embeds a username and password directly
# in the notebook; load it from configuration or the environment before sharing.
bronze_df = spark.read.format("com.mongodb.spark.sql.DefaultSource") \
                    .option('spark.mongodb.input.uri', 'mongodb://admin:nhanbui@localhost:27017/imcp.bronze.parquet?authSource=admin') \
                    .load()

# printSchema() writes the schema to stdout itself and returns None, so wrapping
# it in print() only adds a stray "None" line to the cell output.
bronze_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- caption: string (nullable = true)
 |-- created_time: timestamp (nullable = true)
 |-- howpublished: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- short_caption: string (nullable = true)
 |-- url: string (nullable = true)
 |-- year: string (nullable = true)

None


In [40]:
# Switch from the DataFrame to its underlying RDD so that each record can be
# transformed with plain Python functions via .map().
bronze_rdd = bronze_df.rdd
# Preview the first five records; as the last expression it becomes the cell output.
bronze_rdd.take(5)

[Row(_id=Row(oid='66cdae3c475cfb405938d033'), caption='a kitchen with wooden cabinets on the walls, a stove, multiple drawers, a refrigerator, a counter with fruits, and a well-organized layout for cooking and storage needs.', created_time=datetime.datetime(2024, 8, 28, 0, 45, 15, 155000), howpublished='https://huggingface.co/datasets/laion/220k-GPT4Vision-captions-from-LIVIS', publisher='HuggingFace', short_caption='Well-organized kitchen with wooden cabinets, a stove, multiple drawers, a refrigerator, counter space with fruits, and a clutter-free layout for efficient cooking and storage needs.', url='http://images.cocodataset.org/val2017/000000037777.jpg', year='2023'),
 Row(_id=Row(oid='66cdae3c475cfb405938d035'), caption='multiple people wearing sweatshirts, a person on a bicycle performing tricks, and another person mid-jump off a skateboarding ramp. The backdrop consists of buildings with graffiti artworks, adding a vibrant feel to the urban setting. The image appears to be set i

In [41]:
# lower case captions
def lower_case(row):
    """Return a new record whose ``caption`` and ``short_caption`` are lower-cased.

    Args:
        row: Record supporting key access for ``caption``, ``short_caption``,
            ``created_time``, ``howpublished``, ``publisher``, ``url`` and ``year``.

    Returns:
        A ``Row`` holding exactly those seven fields. Any other field on the
        input (for example ``_id``) is not copied over.
    """
    # The printed bronze schema marks both caption columns as nullable; treat a
    # null as an empty string instead of failing with AttributeError on None.
    lowered_caption = (row['caption'] or '').lower()
    lowered_shrtcaption = (row['short_caption'] or '').lower()
    return Row(caption=lowered_caption, created_time=row['created_time'], howpublished=row['howpublished'],
               publisher=row['publisher'], short_caption=lowered_shrtcaption, url=row['url'], year=row['year'])

# Apply lower_case to every record. NOTE: this rebinds bronze_rdd, so every
# later cell sees the lower-cased records rather than the raw bronze rows.
bronze_rdd = bronze_rdd.map(lower_case)
# Preview the first five transformed records as the cell output.
bronze_rdd.take(5)

[Row(caption='a kitchen with wooden cabinets on the walls, a stove, multiple drawers, a refrigerator, a counter with fruits, and a well-organized layout for cooking and storage needs.', created_time=datetime.datetime(2024, 8, 28, 0, 45, 15, 155000), howpublished='https://huggingface.co/datasets/laion/220k-GPT4Vision-captions-from-LIVIS', publisher='HuggingFace', short_caption='well-organized kitchen with wooden cabinets, a stove, multiple drawers, a refrigerator, counter space with fruits, and a clutter-free layout for efficient cooking and storage needs.', url='http://images.cocodataset.org/val2017/000000037777.jpg', year='2023'),
 Row(caption='multiple people wearing sweatshirts, a person on a bicycle performing tricks, and another person mid-jump off a skateboarding ramp. the backdrop consists of buildings with graffiti artworks, adding a vibrant feel to the urban setting. the image appears to be set in an urban skate park or a designated area for extreme sports within a city.', cre

In [42]:
# remove punctuation
def remove_punctuations(row):
    """Return a new record with punctuation stripped from both caption fields.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted, not replaced by a space, so
    hyphenated words are fused ("well-organized" -> "wellorganized").

    Args:
        row: Record supporting key access for ``caption``, ``short_caption``,
            ``created_time``, ``howpublished``, ``publisher``, ``url`` and ``year``.

    Returns:
        A ``Row`` holding exactly those seven fields with the cleaned captions.
    """
    # \w already matches digits, so the former [^\w\d\s] class is equivalent.
    non_word_or_space = r'[^\w\s]'
    # A null caption is treated as '' because re.sub raises TypeError on None.
    caption = re.sub(non_word_or_space, '', row['caption'] or '')
    shrt_caption = re.sub(non_word_or_space, '', row['short_caption'] or '')
    return Row(caption=caption, created_time=row['created_time'], howpublished=row['howpublished'],
            publisher=row['publisher'], short_caption=shrt_caption, url=row['url'], year=row['year'])
    
# Apply remove_punctuations to every record, again rebinding bronze_rdd to the
# transformed RDD.
bronze_rdd = bronze_rdd.map(remove_punctuations)
# Preview the first five transformed records as the cell output.
bronze_rdd.take(5)

[Row(caption='a kitchen with wooden cabinets on the walls a stove multiple drawers a refrigerator a counter with fruits and a wellorganized layout for cooking and storage needs', created_time=datetime.datetime(2024, 8, 28, 0, 45, 15, 155000), howpublished='https://huggingface.co/datasets/laion/220k-GPT4Vision-captions-from-LIVIS', publisher='HuggingFace', short_caption='wellorganized kitchen with wooden cabinets a stove multiple drawers a refrigerator counter space with fruits and a clutterfree layout for efficient cooking and storage needs', url='http://images.cocodataset.org/val2017/000000037777.jpg', year='2023'),
 Row(caption='multiple people wearing sweatshirts a person on a bicycle performing tricks and another person midjump off a skateboarding ramp the backdrop consists of buildings with graffiti artworks adding a vibrant feel to the urban setting the image appears to be set in an urban skate park or a designated area for extreme sports within a city', created_time=datetime.dat

In [43]:
# tokenize the caption and short_caption
def tokenize(row):
    """Return a new record whose caption fields are split into word lists.

    Args:
        row: Record supporting key access for ``caption``, ``short_caption``,
            ``created_time``, ``howpublished``, ``publisher``, ``url`` and ``year``.

    Returns:
        A ``Row`` holding exactly those seven fields, where ``caption`` and
        ``short_caption`` are lists of whitespace-separated tokens.
    """
    # A null caption becomes an empty token list instead of raising
    # AttributeError on None.
    caption = (row['caption'] or '').split()
    shrt_caption = (row['short_caption'] or '').split()
    return Row(caption=caption, created_time=row['created_time'], howpublished=row['howpublished'],
            publisher=row['publisher'], short_caption=shrt_caption, url=row['url'], year=row['year'])

# Tokenize every record into a separate RDD (temp); bronze_rdd itself is left
# bound to the punctuation-free string captions.
temp = bronze_rdd.map(tokenize)
# Preview the first five tokenized records as the cell output.
temp.take(5)

[Row(caption=['a', 'kitchen', 'with', 'wooden', 'cabinets', 'on', 'the', 'walls', 'a', 'stove', 'multiple', 'drawers', 'a', 'refrigerator', 'a', 'counter', 'with', 'fruits', 'and', 'a', 'wellorganized', 'layout', 'for', 'cooking', 'and', 'storage', 'needs'], created_time=datetime.datetime(2024, 8, 28, 0, 45, 15, 155000), howpublished='https://huggingface.co/datasets/laion/220k-GPT4Vision-captions-from-LIVIS', publisher='HuggingFace', short_caption=['wellorganized', 'kitchen', 'with', 'wooden', 'cabinets', 'a', 'stove', 'multiple', 'drawers', 'a', 'refrigerator', 'counter', 'space', 'with', 'fruits', 'and', 'a', 'clutterfree', 'layout', 'for', 'efficient', 'cooking', 'and', 'storage', 'needs'], url='http://images.cocodataset.org/val2017/000000037777.jpg', year='2023'),
 Row(caption=['multiple', 'people', 'wearing', 'sweatshirts', 'a', 'person', 'on', 'a', 'bicycle', 'performing', 'tricks', 'and', 'another', 'person', 'midjump', 'off', 'a', 'skateboarding', 'ramp', 'the', 'backdrop', 'co

In [44]:
def tokenize_sequence(row, corpus):
    """Encode both caption fields of ``row`` through a ``corpus`` lookup.

    ``caption`` and ``short_caption`` are each split on whitespace and every
    word is replaced by ``corpus[word]``; a word absent from ``corpus`` makes
    that lookup raise.

    Args:
        row: Record supporting key access for ``_id``, ``caption``,
            ``short_caption``, ``created_time``, ``howpublished``,
            ``publisher``, ``url`` and ``year``.
        corpus: Object indexable by word.

    Returns:
        A ``Row`` with the same eight fields, where the two caption fields
        hold the looked-up values.
    """
    raw_caption, raw_short_caption = row['caption'], row['short_caption']
    encoded_caption = [corpus[word] for word in raw_caption.split()]
    encoded_short_caption = [corpus[word] for word in raw_short_caption.split()]
    return Row(
        _id=row['_id'],
        caption=encoded_caption,
        created_time=row['created_time'],
        howpublished=row['howpublished'],
        publisher=row['publisher'],
        short_caption=encoded_short_caption,
        url=row['url'],
        year=row['year'],
    )

In [45]:
# Materialize the tokenized RDD as a Python list of Row objects on the driver.
refined_data = temp.collect()

In [46]:
# Turn each collected Row into a plain dict (top level only: recursive=False),
# with tqdm reporting progress over the collected rows.
lst = [row.asDict(recursive=False) for row in tqdm(refined_data)]

# Show the first converted record as a sanity check.
print(lst[0])

100%|██████████| 217868/217868 [00:01<00:00, 129087.05it/s]

{'caption': ['a', 'kitchen', 'with', 'wooden', 'cabinets', 'on', 'the', 'walls', 'a', 'stove', 'multiple', 'drawers', 'a', 'refrigerator', 'a', 'counter', 'with', 'fruits', 'and', 'a', 'wellorganized', 'layout', 'for', 'cooking', 'and', 'storage', 'needs'], 'created_time': datetime.datetime(2024, 8, 28, 0, 45, 15, 155000), 'howpublished': 'https://huggingface.co/datasets/laion/220k-GPT4Vision-captions-from-LIVIS', 'publisher': 'HuggingFace', 'short_caption': ['wellorganized', 'kitchen', 'with', 'wooden', 'cabinets', 'a', 'stove', 'multiple', 'drawers', 'a', 'refrigerator', 'counter', 'space', 'with', 'fruits', 'and', 'a', 'clutterfree', 'layout', 'for', 'efficient', 'cooking', 'and', 'storage', 'needs'], 'url': 'http://images.cocodataset.org/val2017/000000037777.jpg', 'year': '2023'}





In [47]:
# with pymongo.MongoClient("mongodb+srv://nhanbuimongogcp:nhanbui@mongdb-gcp-cluster.eozg9.mongodb.net/?retryWrites=true&w=majority&appName=mongdb-gcp-cluster") as client:
#     db = client["imcp"]
#     resp = db['refined'].insert_many(lst)

In [59]:
import json

# Read the shared JSON settings file and pick out the MongoDB connection string.
with open("../airflow/config/env.json", "r") as env_file:
    config = json.load(env_file)

mongo_url = config['mongodb']['MONGO_ATLAS_PYTHON_GCP']


# lower case captions
def lower_case(row):
    """Return a new record whose ``caption`` and ``short_caption`` are lower-cased.

    Args:
        row: Record supporting key access for ``caption``, ``short_caption``,
            ``created_time``, ``howpublished``, ``publisher``, ``url`` and ``year``.

    Returns:
        A ``Row`` holding exactly those seven fields. Any other field on the
        input (for example ``_id``) is not copied over.
    """
    # A null caption (possible when a source document lacks the field) is
    # treated as an empty string instead of raising AttributeError on None.
    lowered_caption = (row['caption'] or '').lower()
    lowered_shrtcaption = (row['short_caption'] or '').lower()
    return Row(caption=lowered_caption,
                created_time=row['created_time'],
                howpublished=row['howpublished'],
                publisher=row['publisher'],
                short_caption=lowered_shrtcaption,
                url=row['url'], year=row['year'])

def remove_punctuations(row):
    """Return a new record with punctuation stripped from both caption fields.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted, not replaced by a space, so
    hyphenated words are fused ("well-organized" -> "wellorganized").

    Args:
        row: Record supporting key access for ``caption``, ``short_caption``,
            ``created_time``, ``howpublished``, ``publisher``, ``url`` and ``year``.

    Returns:
        A ``Row`` holding exactly those seven fields with the cleaned captions.
    """
    # \w already matches digits, so the former [^\w\d\s] class is equivalent.
    non_word_or_space = r'[^\w\s]'
    # A null caption is treated as '' because re.sub raises TypeError on None.
    caption = re.sub(non_word_or_space, '', row['caption'] or '')
    shrt_caption = re.sub(non_word_or_space, '', row['short_caption'] or '')
    return Row(caption=caption,
                created_time=row['created_time'],
                howpublished=row['howpublished'],
                publisher=row['publisher'],
                short_caption=shrt_caption,
                url=row['url'], year=row['year'])
    
def tokenize(row):
    """Return a new record whose caption fields are split into word lists.

    Args:
        row: Record supporting key access for ``caption``, ``short_caption``,
            ``created_time``, ``howpublished``, ``publisher``, ``url`` and ``year``.

    Returns:
        A ``Row`` holding those seven fields, with ``caption`` and
        ``short_caption`` as lists of whitespace-separated tokens, plus a
        constant ``text_technique`` field set to ``'T01'``.
    """
    # A null caption becomes an empty token list instead of raising
    # AttributeError on None.
    caption = (row['caption'] or '').split()
    shrt_caption = (row['short_caption'] or '').split()
    return Row(caption=caption,
                created_time=row['created_time'],
                howpublished=row['howpublished'],
                publisher=row['publisher'],
                short_caption=shrt_caption,
                url=row['url'], year=row['year'],
                text_technique='T01')


def audit_log(start_time, end_time, status, error_message="", affected_rows=0, action=""):
    """Insert one audit document describing a run against the silver ``refined`` table.

    Opens a ``pymongo.MongoClient`` on the module-level ``mongo_url`` and
    inserts a single document into the ``audit`` collection of the ``imcp``
    database. The client is used as a context manager for the insert.

    Args:
        start_time: Stored under ``start_time``.
        end_time: Stored under ``end_time``.
        status: Stored under ``status``.
        error_message: Stored under ``error_message``; defaults to ``""``.
        affected_rows: Stored under ``affected_rows``; defaults to ``0``.
        action: Stored under ``action``; defaults to ``""``.
    """
    with pymongo.MongoClient(mongo_url) as mongo:
        audit_collection = mongo['imcp']['audit']
        # "layer" and "table_name" are fixed for this notebook's output table.
        entry = {"layer": "silver", "table_name": "refined"}
        entry["start_time"] = start_time
        entry["end_time"] = end_time
        entry["status"] = status
        entry["error_message"] = error_message
        entry["affected_rows"] = affected_rows
        entry["action"] = action
        audit_collection.insert_one(entry)


def normalize_caption():
    """Read the bronze captions from MongoDB, normalize them and print a preview.

    Each record is passed through ``lower_case``, ``remove_punctuations`` and
    ``tokenize``; the result is converted to a DataFrame, collected to the
    driver, turned into plain dicts, and the first record is printed in both
    forms. Nothing is written back: the write to the ``refined`` collection is
    still commented out.

    Returns:
        None.
    """
    # create a local SparkSession
    # NOTE(review): getOrCreate() hands back an already-running session when
    # one exists, in which case the stop() below also ends that session, and
    # the maxResultSize setting may not take effect — confirm.
    spark = SparkSession.builder \
                    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
                    .config("spark.driver.maxResultSize", "1g") \
                    .appName("readExample") \
                    .getOrCreate()

    try:
        # define a batch query
        bronze_df = spark.read.format("com.mongodb.spark.sql.DefaultSource") \
                            .option('spark.mongodb.input.uri', mongo_url) \
                            .option('spark.mongodb.input.database', 'imcp') \
                            .option('spark.mongodb.input.collection', 'bronze_layer') \
                            .load()

        # clean the data record by record on the RDD, then go back to a DataFrame
        silver_df = bronze_df.rdd \
                             .map(lower_case) \
                             .map(remove_punctuations) \
                             .map(tokenize) \
                             .toDF()

        # TODO: persist the silver layer once the output collection is ready:
        # silver_df.write.format("com.mongodb.spark.sql.DefaultSource") \
        #                 .option('spark.mongodb.output.uri', mongo_url) \
        #                 .option('spark.mongodb.output.database', 'imcp') \
        #                 .option('spark.mongodb.output.collection', 'refined') \
        #                 .mode('append') \
        #                 .save()

        # Collect once and reuse the result for both previews; calling take(1)
        # and then collect() would evaluate the uncached pipeline twice.
        datarows = silver_df.collect()
        print(datarows[:1])

        # Convert to list of dicts
        refined_data = [row.asDict(recursive=False) for row in datarows]
        print(refined_data[:1])
    finally:
        # Stop spark instance even when reading or transforming fails
        spark.stop()
    
    
# Run the batch normalization; the function prints its own previews.
normalize_caption()

[Row(caption=['a', 'kitchen', 'with', 'wooden', 'cabinets', 'on', 'the', 'walls', 'a', 'stove', 'multiple', 'drawers', 'a', 'refrigerator', 'a', 'counter', 'with', 'fruits', 'and', 'a', 'wellorganized', 'layout', 'for', 'cooking', 'and', 'storage', 'needs'], created_time=datetime.datetime(2024, 9, 10, 5, 25, 56, 734000), howpublished='https://huggingface.co/datasets/laion/220k-GPT4Vision-captions-from-LIVIS', publisher='HuggingFace', short_caption=['wellorganized', 'kitchen', 'with', 'wooden', 'cabinets', 'a', 'stove', 'multiple', 'drawers', 'a', 'refrigerator', 'counter', 'space', 'with', 'fruits', 'and', 'a', 'clutterfree', 'layout', 'for', 'efficient', 'cooking', 'and', 'storage', 'needs'], url='http://images.cocodataset.org/val2017/000000037777.jpg', year='2023', text_technique='T01')]
[{'caption': ['a', 'kitchen', 'with', 'wooden', 'cabinets', 'on', 'the', 'walls', 'a', 'stove', 'multiple', 'drawers', 'a', 'refrigerator', 'a', 'counter', 'with', 'fruits', 'and', 'a', 'wellorganize

In [51]:
# Stop the notebook-level Spark session.
# NOTE(review): this cell's execution count (51) is lower than the preceding
# cell's (59), so the cells were run out of order; re-check with Restart & Run All.
spark.stop()