### Cleanup Items

* Get all csv files in '/dev/clean_data folder'
* For each file retain only one-hot encoded columns in each category
* Merge to all categories to itemset

In [3]:
import os
import pandas as pd
folder_path = os.path.join(os.getcwd(), os.pardir)+ '/dataset/clean_data'

save_to_dir = "../dataset/utility/items"
if not os.path.exists(save_to_dir):
    os.mkdir(save_to_dir)

In [4]:
def get_csv_files(directory):
    """
    Get all csv files in a directory
    """
    csv_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                csv_files.append(os.path.join(root, file))
    return csv_files

def get_itemset():
    """
    Return merged itemset df across all categories
    """
    files = get_csv_files(folder_path)
    df_itemset = pd.DataFrame()

    for file in files:
        if 'reviewers' not in file.lower():
            filtered_df = pd.DataFrame()
            category = os.path.basename(file).split('_')[1].split('.')[0].lower()
            print('Merging :',  category)
            df = pd.read_csv(file, low_memory=False)
            df['category'] = category
            # Keep only 'ASIN' and one hot encoded columns
            columns_to_keep = ['ASIN','category'] + [col for col in df.columns if col != 'ASIN' and
                                        all(df[col].dropna().unique() == [1])]
            filtered_df = df[columns_to_keep]
            # Merge filtered df to df itemset
            if df_itemset.empty:
                # Direct assignment for the first DataFrame
                df_itemset = filtered_df
            else:
                # Merge filtered df to the itemset on common columns
                df_itemset = pd.merge(df_itemset, filtered_df, how='outer')
    df_itemset = df_itemset.drop_duplicates()
    return df_itemset
df_itemset = get_itemset()
df_itemset

Merging : bedroom
Merging : computer components
Merging : bathroom
Merging : cleaning material
Merging : car stuff
Merging : mobile accessories
Merging : fashion
Merging : books
Merging : peripheral devices
Merging : electronic devices
Merging : personal care
Merging : office supplies
Merging : travel essentials
Merging : kitchen
Merging : children
Merging : living room


Unnamed: 0,ASIN,category,Home & Kitchen,Bedding,Comforters & Sets,Comforter Sets,Kids' Bedding,Baby Products,Nursery,Toddler Bedding,...,Torches,Pest Control,Bug Zappers,Bistro Sets,Outdoor Curtains,Patio Furniture Covers,Furniture Set Covers,Figurine Lights,Storage Benches,Boot & Shoe Boxes
0,B0CMSW6JNM,bedroom,1.0,1.0,1.0,1.0,,,,,...,,,,,,,,,,
1,B0CH9Y8CBQ,bedroom,1.0,1.0,1.0,1.0,,,,,...,,,,,,,,,,
2,B0CC6N7XR9,bedroom,1.0,1.0,1.0,1.0,1.0,,,,...,,,,,,,,,,
3,B0B4ZCYXMX,bedroom,,1.0,,,,1.0,1.0,1.0,...,,,,,,,,,,
4,B0B2DG8QRL,bedroom,1.0,1.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34614,B07QXK9WG5,living room,1.0,,,,,,,,...,,,,,,,,,,
34615,B0CHXTR17D,living room,1.0,,,,,,,,...,,,,,,,,,,
34616,B0C9MZWQ1D,living room,1.0,,,,,,,,...,,,,,,,,,,
34617,B0CLRHN99R,living room,1.0,,,,,,,,...,,,,,,,,,,


There are items that belong to multiple categories:

In [5]:
duplicates = df_itemset[df_itemset.duplicated('ASIN', keep=False)]
df_categories = duplicates.groupby('ASIN')['category'].apply(lambda x: ', '.join(x.unique())).reset_index(name='Categories')
df_categories

Unnamed: 0,ASIN,Categories
0,B00A6V1H2S,"bathroom, fashion"
1,B00APVXSM6,"bedroom, children"
2,B00BEUDWJQ,"bedroom, cleaning material, personal care, liv..."
3,B00HSC9F2C,"personal care, children"
4,B00KUPS3JU,"office supplies, living room"
...,...,...
182,B0CS4CP75C,"office supplies, living room"
183,B0CSSRBG48,"office supplies, living room"
184,B0CT2CC1YY,"office supplies, living room"
185,B0CTH3XT3D,"office supplies, living room"


### Merge ASINs with more than 1 row

In [6]:
df_unique = df_itemset.copy()
df_unique.set_index(["ASIN"], inplace=True)
df_unique.drop( ["category"], inplace=True, axis=1)

In [7]:
import numpy as np

dupe_asins = df_categories.set_index(["ASIN"]).index.tolist()
feats = df_unique.columns.tolist()
for asin in dupe_asins:
    print(f"processing {asin}")
    asin_indiv_df = df_unique.loc[asin]
    indices, series_list = zip(*asin_indiv_df.iterrows())

    ind = indices[0]
    final_series = series_list[0]
    for i in range(1, len(series_list)):
        for f in feats:
            if ~np.isnan(series_list[i][f]):
                final_series[f] = 1.0
    
    asin_indiv_df_filtered = pd.DataFrame(final_series)
    df_unique.drop(index=asin, inplace=True)
    df_unique = pd.concat([asin_indiv_df_filtered.T, df_unique])
display(df_unique)

processing B00A6V1H2S
processing B00APVXSM6
processing B00BEUDWJQ
processing B00HSC9F2C
processing B00KUPS3JU
processing B00R92CL5E
processing B00WAIQA7C
processing B01AKWNMJI
processing B01ER18L58
processing B01GIJBK50
processing B01H6GUCCQ
processing B01N6KTU29
processing B072K59NYZ
processing B074W7SKZ2
processing B077S6KQ88
processing B07B7K7N3P
processing B07BDFW1Y7
processing B07D1XCKWW
processing B07D1XJH2W
processing B07GKVBQDG
processing B07GW9TJ3G
processing B07HYX9P88
processing B07JMKB2GQ
processing B07KJYY9BD
processing B07ND3WR64
processing B07NM4MRRL
processing B07PGLBCFG
processing B07RS1G6XW
processing B07RW6Z692
processing B07SDLR1BC
processing B07SS4GMLW
processing B07V4R4Z77
processing B07WC7QWWW
processing B07WTS8T2W
processing B07X6C9RMF
processing B07Y8BXBX8
processing B07YLJCW2P
processing B07Z5NQPDS
processing B07ZVC6DMM
processing B081H3Y5NW
processing B086ZLGLV3
processing B087CDBKCH
processing B088NHGC48
processing B088T2KNZ4
processing B088YKV9FW
processing

Unnamed: 0,Home & Kitchen,Bedding,Comforters & Sets,Comforter Sets,Kids' Bedding,Baby Products,Nursery,Toddler Bedding,Bedding Sets,Blankets & Throws,...,Torches,Pest Control,Bug Zappers,Bistro Sets,Outdoor Curtains,Patio Furniture Covers,Furniture Set Covers,Figurine Lights,Storage Benches,Boot & Shoe Boxes
B0CTM6P5TW,,,,,,,,,,,...,,,,,,,,,,
B0CTH3XT3D,1.0,,,,,,,,,,...,,,,,,,,,,
B0CT2CC1YY,1.0,,,,,,,,,,...,,,,,,,,,,
B0CSSRBG48,1.0,,,,,,,,,,...,,,,,,,,,,
B0CS4CP75C,1.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B07QXK9WG5,1.0,,,,,,,,,,...,,,,,,,,,,
B0CHXTR17D,1.0,,,,,,,,,,...,,,,,,,,,,
B0C9MZWQ1D,1.0,,,,,,,,,,...,,,,,,,,,,
B0CLRHN99R,1.0,,,,,,,,,,...,,,,,,,,,,


In [8]:
df_unique.to_csv("../dataset/utility/items/itemset.csv")

### Exploratory Data Analysis

In [9]:
import plotly.graph_objects as go
import plotly.express as px
import numpy as np

# Count the frequency of each category
category_counts = df_itemset['category'].value_counts().reset_index()
category_counts.columns = ['category', 'count']

# Sort categories by count for a consistent gradient effect
category_counts.sort_values('count', ascending=True, inplace=True)

# Normalize count values for color mapping
count_normalized = (category_counts['count'] - category_counts['count'].min()) / (category_counts['count'].max() - category_counts['count'].min())

# Manually interpolate colors from Viridis scale
viridis_scale = px.colors.sequential.Aggrnyl
colors = [viridis_scale[int(np.floor(x * (len(viridis_scale) - 1)))] for x in count_normalized]

# Create a horizontal bar chart
fig = go.Figure()

# Add bars with mapped colors
for i, row in category_counts.iterrows():
    fig.add_trace(go.Bar(
        x=[row['count']],
        y=[row['category']],
        orientation='h',
        marker=dict(color=colors[i]),  # Use the manually mapped color
        name=row['category']
    ))

# Update layout
fig.update_layout(
    title="Frequency Distribution of Categories",
    xaxis_title="Count",
    yaxis_title="Category",
    yaxis=dict(categoryorder='total ascending'),  # Ensure correct order after sorting
    showlegend=False
)

fig.show()


In [10]:
df_features = df_itemset.drop(['ASIN','category'], axis = 1)
feature_frequencies = df_features.sum()

In [11]:
# Count the frequency of each feature
category_counts = feature_frequencies[:15].reset_index()
category_counts.columns = ['features', 'count']

# Sort feature by count for a consistent gradient effect
category_counts.sort_values('count', ascending=True, inplace=True)

# Normalize count values for color mapping
count_normalized = (category_counts['count'] - category_counts['count'].min()) / (category_counts['count'].max() - category_counts['count'].min())

# Manually interpolate colors from Viridis scale
viridis_scale = px.colors.sequential.Viridis
colors = [viridis_scale[int(np.floor(x * (len(viridis_scale) - 1)))] for x in count_normalized]

# Create a horizontal bar chart
fig = go.Figure()

# Add bars with mapped colors
for i, row in category_counts.iterrows():
    fig.add_trace(go.Bar(
        x=[row['count']],
        y=[row['features']],
        orientation='h',
        marker=dict(color=colors[i]),  # Use the manually mapped color
        name=row['features']
    ))

# Update layout
fig.update_layout(
    title="Top 10 Features Across All Items",
    xaxis_title="Count",
    yaxis_title="Feature",
    yaxis=dict(categoryorder='total ascending'),  # Ensure correct order after sorting
    showlegend=False
)

fig.show()

In [12]:
# Sum across rows to find how many feature each item has
df_features_count = df_features.copy()
df_features_count['Feature_Count'] = df_features.sum(axis=1)

# Analyze the distribution of category counts
feature_count_distribution = df_features_count['Feature_Count'].value_counts().sort_index()

In [13]:
fig = px.histogram(df_features_count, x='Feature_Count',
                   title='Distribution of Feature count of Items',
                   labels={'Feature_Count': 'Number of Features'},
                   text_auto=True,  # Automatically add text labels on bars
                   category_orders={"Category_Count": sorted(df_features_count['Feature_Count'].unique())})  # Ensure ordered categories

# Customize layout
fig.update_layout(
    xaxis=dict(title='Feature count of Items', tickmode='linear'),
    yaxis_title='Count of Items',
    bargap=0.2,  # Adjust the gap between bars
)

# Customize the bars to show the exact count above them
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)

fig.show()

In [14]:
df_cleaned = df_cleaned.apply(lambda x: pd.to_numeric(x, errors='ignore'))

# Check dtypes to confirm conversion
df_cleaned.dtypes
# Check dtypes to confirm conversion
print(df_cleaned.dtypes)

# Analyze feature frequency (summing each one-hot encoded column to get the frequency)
feature_frequencies = df_cleaned.drop(columns=['ASIN', 'category']).sum().sort_values(ascending=False)

# Identify the top 15 most common features
top_15_features = feature_frequencies.head(15).index

# Create a co-occurrence matrix for the top 15 features
df_top_15_features = df_cleaned[top_15_features]

# Calculate the co-occurrence matrix for these top features
co_occurrence_matrix_top_15 = np.dot(df_top_15_features.T, df_top_15_features)

# Convert to a DataFrame for easier handling
co_occurrence_df_top_15 = pd.DataFrame(co_occurrence_matrix_top_15, 
                                       index=top_15_features, 
                                       columns=top_15_features)

# Visualize the co-occurrence matrix as a heatmap using Plotly
fig = px.imshow(co_occurrence_df_top_15,
                labels=dict(x="Feature", y="Feature", color="Co-occurrence"),
                x=top_15_features,
                y=top_15_features,
                title="Heatmap of Feature Co-occurrence for Top 15 Features")
fig.update_xaxes(side="top")
fig.show()



NameError: name 'df_cleaned' is not defined