In [28]:
import re
from difflib import SequenceMatcher

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from src.path import DataPaths
from src.merge import merge_dataframe_rows

from tools.type_check import print_detailed_info
from tools.save_data import export_dataframe


# Read the cleaned parquet dataset, then print the per-column type/preview report.
df = pd.read_parquet(path=DataPaths.file_parquet_clean)
print_detailed_info(df)

Original dataset: 21,946 rows
Number of columns in the DataFrame: 31
Data columns (total 31 columns):
 #   Column                               Actual type     Preview
---  ------------------------------------ ---------------- --------------------------------------------------
 0   unspsc                               str             Sewing and stitchery and weaving equipme...
 1   root_domain                          str             studio-atcoat.com
 2   page_url                             str             https://studio-atcoat.com/1372696759/?id...
 3   product_title                        str             Glimakra Warping Board (8m)
 4   product_summary                      str             The Glimakra Warping Board is designed f...
 5   product_name                         str             Warping Board
 6   product_identifier                   ndarray(0,)     []
 7   brand                                str             CST
 8   intended_industries                  ndarray(1,)     [

In [29]:
# Side-by-side look at the three free-text columns for the first ten products.
# NOTE: despite its name, df_copy is a column selection of df, not an explicit .copy().
df_copy = df.loc[:, ['product_title', 'product_summary', 'description']]
df_copy.head(10)

Unnamed: 0,product_title,product_summary,description
0,Glimakra Warping Board (8m),The Glimakra Warping Board is designed for use...,"The ""Warping Board"" is designed for use with f..."
1,NMRV Worm Gearbox Motor,The NMRV Worm Gearbox Motor is a high-efficien...,"The ""Worm Gearbox Motor"" is a high-efficiency ..."
2,Nissan R33 GTR Car Cover,A custom car cover designed for the Nissan R33...,"The ""Car Cover"" is a custom-designed cover tai..."
3,Flexible Fittings,"Flexible fittings for plumbing applications, a...","""Flexible Fittings"" are designed for plumbing ..."
4,CST-HGD-33103 Hinged Closet Door,The CST-HGD-33103 Hinged Closet Door is a meti...,"The ""Hinged Closet Door"" is a storage solution..."
5,Deep Faucets,"Faucets with a deep design, providing a secure...","""Deep Faucets"" are designed with a deep design..."
6,10K Dry Shut-Off Gun Handle Assembly,The 10K Dry Shut-Off Gun Handle Assembly is a ...,"The ""Dry Shut-Off Gun Handle Assembly"" is a co..."
7,Cranberry Cart Systems,Cranberry Cart Systems are part of the Armstro...,"""Cranberry Cart Systems"" from the Armstrong Me..."
8,10K Air Operated Control Gun,An air operated control gun designed for water...,"The ""Air Operated Control Gun"" is designed for..."
9,5 THINGS YOU SHOULD KNOW ABOUT MY NURSE PRACTI...,A long sleeve tee with the message '5 THINGS Y...,"The ""Long Sleeve Tee"" is a long sleeve t-shirt..."


In [30]:
# Same token shape as CountVectorizer's default token_pattern: words of 2+ characters.
_WORD_PATTERN = re.compile(r"\b\w\w+\b")


def _cosine_text_similarity(text_a, text_b):
    """Return the bag-of-words cosine similarity (0.0-1.0) of two non-empty strings."""
    try:
        term_counts = CountVectorizer().fit_transform([text_a, text_b]).toarray()
    except ValueError:
        # CountVectorizer raises ValueError ("empty vocabulary") when neither
        # text contains a usable token, e.g. "5" vs "X". Nothing to compare.
        return 0.0

    # A text without any usable token yields an all-zero row; score it as 0.
    if not term_counts.any(axis=1).all():
        return 0.0
    return float(cosine_similarity(term_counts[:1], term_counts[1:])[0][0])


def _jaccard_text_similarity(text_a, text_b):
    """Return |A ∩ B| / |A ∪ B| over the lower-cased word sets of two strings."""
    tokens_a = set(_WORD_PATTERN.findall(text_a.lower()))
    tokens_b = set(_WORD_PATTERN.findall(text_b.lower()))
    all_tokens = tokens_a | tokens_b
    if not all_tokens:
        return 0.0
    return len(tokens_a & tokens_b) / len(all_tokens)


def _sequence_text_similarity(text_a, text_b):
    """Return the case-insensitive, character-level difflib ratio of two strings."""
    # autojunk=False: the default heuristic discards frequent characters once a
    # text reaches 200 characters, which would distort scores for long texts.
    matcher = SequenceMatcher(None, text_a.lower(), text_b.lower(), autojunk=False)
    return matcher.ratio()


def calculate_similarity_scores(df, method='cosine'):
    """
    Calculate similarity scores between product_summary and description columns.

    Every row is scored independently. Rows where either text is missing or
    empty receive a score of 0.

    Parameters:
    df (pandas.DataFrame): DataFrame containing 'product_summary' and 'description' columns
    method (str): Similarity method to use:
        'cosine'   - cosine similarity of per-row word-count vectors
        'jaccard'  - overlap of the two word sets (intersection / union)
        'sequence' - character-level difflib.SequenceMatcher ratio

    Returns:
    pandas.DataFrame: Copy of df (the input is not modified) in which both text
        columns are NaN-filled and cast to str, plus an additional
        'similarity_percentage' column (0-100, rounded to 2 decimals)

    Raises:
    ValueError: If method is not one of the supported names
    """
    scorers = {
        'cosine': _cosine_text_similarity,
        'jaccard': _jaccard_text_similarity,
        'sequence': _sequence_text_similarity,
    }
    if method not in scorers:
        raise ValueError(
            f"Unsupported similarity method {method!r}; expected one of {sorted(scorers)}"
        )
    score_pair = scorers[method]

    result_df = df.copy()

    # Ensure both columns are strings
    result_df['product_summary'] = result_df['product_summary'].fillna('').astype(str)
    result_df['description'] = result_df['description'].fillna('').astype(str)

    # Iterate the two columns directly; cheaper than building a Series per row.
    similarity_scores = [
        score_pair(summary, description) if summary and description else 0.0
        for summary, description in zip(result_df['product_summary'], result_df['description'])
    ]

    # Convert similarity to percentage
    result_df['similarity_percentage'] = [round(score * 100, 2) for score in similarity_scores]

    return result_df

# Score every row of the loaded dataset (`df`) and report the mean percentage.
result_df = calculate_similarity_scores(df, method='cosine')
average_similarity = result_df.loc[:, 'similarity_percentage'].mean()
print(f"Average similarity between product summaries and descriptions: {average_similarity:.2f}%")

Average similarity between product summaries and descriptions: 69.51%


In [31]:
# Compare the raw title with the extracted product name for the first ten rows.
# NOTE: this rebinds df_copy, replacing the three-column selection made earlier.
df_copy = df.loc[:, ['product_title', 'product_name']]
df_copy.head(10)

Unnamed: 0,product_title,product_name
0,Glimakra Warping Board (8m),Warping Board
1,NMRV Worm Gearbox Motor,Worm Gearbox Motor
2,Nissan R33 GTR Car Cover,Car Cover
3,Flexible Fittings,Flexible Fittings
4,CST-HGD-33103 Hinged Closet Door,Hinged Closet Door
5,Deep Faucets,Deep Faucets
6,10K Dry Shut-Off Gun Handle Assembly,Dry Shut-Off Gun Handle Assembly
7,Cranberry Cart Systems,Cranberry Cart Systems
8,10K Air Operated Control Gun,Air Operated Control Gun
9,5 THINGS YOU SHOULD KNOW ABOUT MY NURSE PRACTI...,Long Sleeve Tee


In [32]:
def calculate_title_name_similarity(df):
    """
    Calculate cosine similarity between product_title and product_name columns.

    Each row is scored independently from the word counts of its two texts.
    Rows where either text is missing, empty, or contains no usable token
    receive a score of 0. The mean percentage is printed as a side effect.

    Parameters:
    df (pandas.DataFrame): DataFrame containing 'product_title' and 'product_name' columns

    Returns:
    pandas.DataFrame: Copy of df (the input is not modified) in which both text
        columns are NaN-filled and cast to str, plus an additional
        'similarity_percentage' column (0-100, rounded to 2 decimals)
    """
    result_df = df.copy()

    # Ensure both columns are strings
    result_df['product_title'] = result_df['product_title'].fillna('').astype(str)
    result_df['product_name'] = result_df['product_name'].fillna('').astype(str)

    similarity_scores = []

    # Iterate the two columns directly; cheaper than building a Series per row.
    for title, name in zip(result_df['product_title'], result_df['product_name']):
        if not title or not name:
            similarity_scores.append(0.0)
            continue

        try:
            term_counts = CountVectorizer().fit_transform([title, name]).toarray()
        except ValueError:
            # CountVectorizer raises ValueError ("empty vocabulary") when neither
            # text contains a usable token, e.g. "X" vs "5". Nothing to compare.
            similarity_scores.append(0.0)
            continue

        # A text without any usable token yields an all-zero row; score it as 0.
        if not term_counts.any(axis=1).all():
            similarity_scores.append(0.0)
        else:
            similarity = cosine_similarity(term_counts[:1], term_counts[1:])[0][0]
            similarity_scores.append(float(similarity))

    # Convert similarity to percentage
    result_df['similarity_percentage'] = [round(score * 100, 2) for score in similarity_scores]

    # Calculate the average similarity
    average_similarity = result_df['similarity_percentage'].mean()
    print(f"Average similarity between product_title and product_name: {average_similarity:.2f}%")

    return result_df

# Example usage on the two-column selection (the function prints the average itself).
df_with_similarity = calculate_title_name_similarity(df_copy)

# Same scoring on the full dataset. This rebinds result_df and average_similarity,
# replacing the summary/description values computed in the earlier cell.
result_df = calculate_title_name_similarity(df)
average_similarity = result_df['similarity_percentage'].mean()
# Display the results (label fixed: this is the title/name score, not summary/description)
print(f"Average similarity between product_title and product_name: {average_similarity:.2f}%")

Average similarity between product_title and product_name: 71.54%
Average similarity between product_title and product_name: 71.54%
Average similarity between product summaries and descriptions: 71.54%


In [33]:
# Keep only the rows whose manufacturing_year differs from -1
# (presumably -1 is a placeholder for "unknown" - confirm against the cleaning step).
other_values = df.loc[df['manufacturing_year'].ne(-1)]

# Report how many such rows exist
print(f"Number of rows where manufacturing_year is not -1: {len(other_values)}")

if len(other_values) == 0:
    print("All manufacturing_year values are -1")
else:
    # List the distinct years that are present
    unique_years = other_values['manufacturing_year'].unique()
    print(f"Unique manufacturing year values other than -1: {unique_years}")

    # Frequency of each year, ordered by year
    year_counts = other_values['manufacturing_year'].value_counts().sort_index()
    print("\nDistribution of manufacturing years:")
    print(year_counts)

Number of rows where manufacturing_year is not -1: 0
All manufacturing_year values are -1
