In [20]:
import pandas as pd

from tools.save_data import export_dataframe
from tools.type_check import print_detailed_info
from src.path import DataPaths


In [21]:
# Read only the three text columns this notebook works with, rather than
# loading the whole file and discarding the rest afterwards.
df = pd.read_parquet(
    DataPaths.file_parquet_original,
    columns=['product_title', 'description', 'product_summary'],
)
# Project helper; prints a per-column overview of the frame.
print_detailed_info(df)

Original dataset: 21,946 rows
Number of columns in the DataFrame: 3
Data columns (total 3 columns):
 #   Column                               Actual type     Preview
---  ------------------------------------ ---------------- --------------------------------------------------
 0   product_title                        str             Glimakra Warping Board (8m)
 1   description                          str             The "Warping Board" is designed for use ...
 2   product_summary                      str             The Glimakra Warping Board is designed f...


In [22]:
def merge_and_drop_descriptions(df):
    """
    Collapse `description` and `product_summary` into one `product_description`
    column, keeping whichever of the two texts is longer for each row.

    Missing values (None/NaN) are treated as empty text *before* lengths are
    compared, so a missing entry can never win over real text. When both texts
    have the same length, `description` is kept. Rows where both are missing
    end up with an empty string.

    Args:
        df: DataFrame containing `description` and `product_summary` columns
            (any other columns, e.g. `product_title`, are left untouched).

    Returns:
        The same DataFrame object that was passed in. It is modified in place:
        `product_description` is added and the two source columns are dropped.

    Raises:
        KeyError: if `description` or `product_summary` is not present
            (for example when the function is run a second time on the same frame).
    """
    # Normalise missing values first; comparing len(str(None)) / len(str(nan))
    # would count them as 4- or 3-character texts.
    description = df['description'].fillna('')
    summary = df['product_summary'].fillna('')

    # Lengths are measured on the string form so non-string values are still comparable
    description_len = description.astype(str).str.len()
    summary_len = summary.astype(str).str.len()

    # Keep description where it is at least as long, otherwise fall back to the summary
    df['product_description'] = description.where(description_len >= summary_len, summary)

    # Drop the original description and product_summary columns
    df.drop(columns=['description', 'product_summary'], inplace=True)

    return df  # Same object as the input, returned for convenience

In [23]:
# The helper mutates the frame in place and hands the same object back,
# so rebinding `df` leaves the name pointing at the very same DataFrame.
df = merge_and_drop_descriptions(df)
print_detailed_info(df)

Original dataset: 21,946 rows
Number of columns in the DataFrame: 2
Data columns (total 2 columns):
 #   Column                               Actual type     Preview
---  ------------------------------------ ---------------- --------------------------------------------------
 0   product_title                        str             Glimakra Warping Board (8m)
 1   product_description                  str             The Glimakra Warping Board is designed f...


In [24]:
# Write the merged frame to the test folder as parquet. The call stays the
# last expression of the cell so the returned path is displayed.
export_dataframe(
    df,
    DataPaths.test_folder,
    "test_description",
    file_format="parquet",
)

Exported data to: E:\veridion_deduplication\data\test\test_description.snappy.parquet


WindowsPath('E:/veridion_deduplication/data/test/test_description.snappy.parquet')