## JSONL Filters and Transformations

In [46]:
import pandas as pd
import jsonlines
import re

def remove_emojis(text):
    """Return ``text`` with every run of emoji/pictograph characters removed.

    Only characters inside the ranges listed below (plus the single code
    point U+2705) are stripped; all other characters are left untouched.
    """
    # Each entry becomes one member of the regex character class built below.
    stripped_ranges = (
        "\U0001F600-\U0001F64F",  # Emoticons
        "\U0001F300-\U0001F5FF",  # Miscellaneous Symbols and Pictographs
        "\U0001F680-\U0001F6FF",  # Transport and Map Symbols
        "\U0001F700-\U0001F77F",  # Alchemical Symbols
        "\U0001F780-\U0001F7FF",  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF",  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F",  # Chess Symbols
        "\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
        "\U00002705",             # White Heavy Check Mark
    )
    matcher = re.compile("[" + "".join(stripped_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub("", text)

def clean_description(val):
    """Flatten a list value into one string and strip emojis from it.

    A list is joined with single spaces first. A value that pandas reports
    as missing is returned unchanged; anything else is converted with
    ``str()`` and passed through ``remove_emojis``.
    """
    flattened = ' '.join(val) if isinstance(val, list) else val
    if pd.notnull(flattened):
        return remove_emojis(str(flattened))
    return flattened

def process_amazon_reviews(meta_file, output_file, min_average_rating=4.5,
                           min_rating_number=200, sample_every=1):
    """Filter and clean a product-metadata JSONL file and save it as CSV.

    Steps:
      1. Read ``meta_file`` (one JSON object per line), keeping every
         ``sample_every``-th object.
      2. Drop rows whose 'description' or 'features' is empty or missing.
      3. Keep only the identifier, rating, category and text columns.
      4. Keep rows with average rating > ``min_average_rating`` and rating
         count > ``min_rating_number``.
      5. Join list values, strip emojis and replace curly apostrophes in
         'title', 'description' and 'features'.
      6. Write the result to ``output_file`` and print the number of rows.

    Args:
        meta_file: Path to the input JSONL file.
        output_file: Path of the CSV file to write.
        min_average_rating: Exclusive lower bound on 'average_rating'.
        min_rating_number: Exclusive lower bound on 'rating_number'.
        sample_every: Keep one object out of every ``sample_every`` read
            (1 keeps everything).

    Raises:
        ValueError: If ``sample_every`` is smaller than 1.
    """
    if sample_every < 1:
        raise ValueError("sample_every must be >= 1")

    # Read the "meta" JSONL file, optionally subsampling while streaming
    with jsonlines.open(meta_file) as reader:
        records = [obj for i, obj in enumerate(reader) if i % sample_every == 0]

    df = pd.DataFrame(records)

    def has_content(value):
        # Missing values (None/NaN) have no length; treat them as empty
        # instead of letting len() raise a TypeError.
        return hasattr(value, '__len__') and len(value) != 0

    # Keep only rows that have both a non-empty description and features
    non_empty = (
        df['description'].apply(has_content).astype(bool)
        & df['features'].apply(has_content).astype(bool)
    )
    kept_columns = ['parent_asin', 'average_rating', 'rating_number',
                    'main_category', 'title', 'description', 'features']
    df = df.loc[non_empty, kept_columns]

    # Coerce ratings to numbers: missing or unparseable values become NaN,
    # compare as False and are dropped, rather than crashing astype(int).
    average_rating = pd.to_numeric(df['average_rating'], errors='coerce')
    rating_number = pd.to_numeric(df['rating_number'], errors='coerce')
    well_rated = (average_rating > min_average_rating) & (rating_number > min_rating_number)

    # Explicit copy so the column assignments below act on an independent
    # frame (avoids SettingWithCopyWarning on a filtered slice).
    df = df[well_rated].copy()

    for column in ('title', 'description', 'features'):
        cleaned = df[column].apply(clean_description)
        # regex=False: the apostrophe is a literal, independent of the
        # pandas-version-specific default of str.replace.
        df[column] = cleaned.str.replace('’', "'", regex=False)

    # Save the final DataFrame to a CSV file
    df.to_csv(output_file, index=False)

    print(len(df))

# Run the pipeline on the Video Games metadata file and write the filtered result to CSV
process_amazon_reviews('Data/Input/meta_Video_Games.jsonl', 'Data/Output/meta_Video_Games.csv')


4043


### Special case for the Clothing dataset: keeps only one object out of every 3
This dataset is very large, so it is subsampled while it is being read.

In [38]:
import pandas as pd
import jsonlines
import re


def remove_emojis(text):
    """Return ``text`` with every run of emoji/pictograph characters removed.

    Only characters inside the ranges listed below (plus the single code
    point U+2705) are stripped; all other characters are left untouched.
    """
    # Each entry becomes one member of the regex character class built below.
    stripped_ranges = (
        "\U0001F600-\U0001F64F",  # Emoticons
        "\U0001F300-\U0001F5FF",  # Miscellaneous Symbols and Pictographs
        "\U0001F680-\U0001F6FF",  # Transport and Map Symbols
        "\U0001F700-\U0001F77F",  # Alchemical Symbols
        "\U0001F780-\U0001F7FF",  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF",  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF",  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F",  # Chess Symbols
        "\U0001FA70-\U0001FAFF",  # Symbols and Pictographs Extended-A
        "\U00002705",             # White Heavy Check Mark
    )
    matcher = re.compile("[" + "".join(stripped_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub("", text)

def clean_description(val):
    """Flatten a list value into one string and strip emojis from it.

    A list is joined with single spaces first. A value that pandas reports
    as missing is returned unchanged; anything else is converted with
    ``str()`` and passed through ``remove_emojis``.
    """
    flattened = ' '.join(val) if isinstance(val, list) else val
    if pd.notnull(flattened):
        return remove_emojis(str(flattened))
    return flattened


def process_amazon_reviews(meta_file, output_file, min_average_rating=4.5,
                           min_rating_number=2000, sample_every=3):
    """Filter and clean a large product-metadata JSONL file and save it as CSV.

    Variant for very large inputs: by default only one object out of every
    three is read and the rating-count threshold is 2000.

    Steps:
      1. Read ``meta_file`` (one JSON object per line), keeping every
         ``sample_every``-th object.
      2. Drop rows whose 'description' or 'features' is empty or missing.
      3. Keep only the identifier, rating, category and text columns.
      4. Keep rows with average rating > ``min_average_rating`` and rating
         count > ``min_rating_number``.
      5. Join list values, strip emojis and replace curly apostrophes in
         'title', 'description' and 'features'.
      6. Write the result to ``output_file`` and print the number of rows.

    Args:
        meta_file: Path to the input JSONL file.
        output_file: Path of the CSV file to write.
        min_average_rating: Exclusive lower bound on 'average_rating'.
        min_rating_number: Exclusive lower bound on 'rating_number'.
        sample_every: Keep one object out of every ``sample_every`` read
            (1 keeps everything).

    Raises:
        ValueError: If ``sample_every`` is smaller than 1.
    """
    if sample_every < 1:
        raise ValueError("sample_every must be >= 1")

    # Read the "meta" JSONL file, subsampling while streaming so the full
    # file never has to be held in memory
    with jsonlines.open(meta_file) as reader:
        records = [obj for i, obj in enumerate(reader) if i % sample_every == 0]

    df = pd.DataFrame(records)

    def has_content(value):
        # Missing values (None/NaN) have no length; treat them as empty
        # instead of letting len() raise a TypeError.
        return hasattr(value, '__len__') and len(value) != 0

    # Keep only rows that have both a non-empty description and features
    non_empty = (
        df['description'].apply(has_content).astype(bool)
        & df['features'].apply(has_content).astype(bool)
    )
    kept_columns = ['parent_asin', 'average_rating', 'rating_number',
                    'main_category', 'title', 'description', 'features']
    df = df.loc[non_empty, kept_columns]

    # Coerce ratings to numbers: missing or unparseable values become NaN,
    # compare as False and are dropped, rather than crashing astype(int).
    average_rating = pd.to_numeric(df['average_rating'], errors='coerce')
    rating_number = pd.to_numeric(df['rating_number'], errors='coerce')
    well_rated = (average_rating > min_average_rating) & (rating_number > min_rating_number)

    # Explicit copy so the column assignments below act on an independent
    # frame (avoids SettingWithCopyWarning on a filtered slice).
    df = df[well_rated].copy()

    for column in ('title', 'description', 'features'):
        cleaned = df[column].apply(clean_description)
        # regex=False: the apostrophe is a literal, independent of the
        # pandas-version-specific default of str.replace.
        df[column] = cleaned.str.replace('’', "'", regex=False)

    # Save the final DataFrame as a CSV file
    df.to_csv(output_file, index=False)

    print(len(df))


# Run the pipeline on the Clothing, Shoes and Jewelry metadata file and write the filtered result to CSV
process_amazon_reviews('Data/Input/meta/meta_Clothing_Shoes_and_Jewelry.jsonl', 'Data/Output/meta_Clothing_Shoes_and_Jewelry.csv')


2933


## Concatenation of the final CSV files

In [50]:
import os
import pandas as pd

# Folder containing the per-category CSV files to merge
csv_folder = 'Data/Output/'

# Destination folder for the merged file. It lives inside csv_folder, but
# os.listdir is not recursive, so previously merged files are never re-read.
output_folder = 'Data/Output/LLM_Ready/'

# Sort the file names: os.listdir returns them in arbitrary order, and the
# row order of the merged file should be reproducible between runs.
csv_files = sorted(f for f in os.listdir(csv_folder) if f.endswith('.csv'))
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(f"No CSV files found in '{csv_folder}'")

# Read and concatenate all the CSV files
df_list = [pd.read_csv(os.path.join(csv_folder, file)) for file in csv_files]
final_df = pd.concat(df_list, ignore_index=True)

# Make sure the destination exists; to_csv does not create missing folders
os.makedirs(output_folder, exist_ok=True)

# Build the output path once so the saved file and the message always agree
output_path = f'{output_folder}concatenated_{len(final_df)}_products_list.csv'
final_df.to_csv(output_path, index=False)

# Print message confirming the concatenation
print(f"All CSV files have been concatenated into '{output_path}'")


All CSV files have been concatenated into 'Output/concatenated_28657_products_list.csv'
