In [1]:
import pandas as pd
import numpy as np

In [64]:
df = pd.read_csv("train.csv")

In [65]:
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'maximum_weight_recommendation': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre',
        'cubic foot',
        'cubic inch',
        'cup',
        'decilitre',
        'fluid ounce',
        'gallon',
        'imperial gallon',
        'litre',
        'microlitre',
        'millilitre',
        'pint',
        'quart'}
}

allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}

In [66]:
df

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram
...,...,...,...,...
263854,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,height,5.0 centimetre
263855,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,height,8.5 inch
263856,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,height,43.2 centimetre
263857,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,height,9.1 centimetre


In [67]:
import pandas as pd
import re



# Drop the 'image_link' column
df = df.drop(columns=['image_link'])

# Split 'entity_value' into 'value' and 'unit'
def split_entity_value(value):
    # Regex to extract the numeric part and the unit
    match = re.match(r"([\d\.]+)\s*(\w+)", value)
    if match:
        return match.groups()
    return (None, None)

df[['value', 'unit']] = df['entity_value'].apply(split_entity_value).apply(pd.Series)

# Drop the original 'entity_value' column
df = df.drop(columns=['entity_value'])

print(df)


        group_id  entity_name  value        unit
0         748919  item_weight  500.0        gram
1         916768  item_volume    1.0         cup
2         459516  item_weight  0.709        gram
3         459516  item_weight  0.709        gram
4         731432  item_weight   1400   milligram
...          ...          ...    ...         ...
263854    558806       height    5.0  centimetre
263855    470067       height    8.5        inch
263856    204245       height   43.2  centimetre
263857    752266       height    9.1  centimetre
263858    416664       height   27.5  centimetre

[263859 rows x 4 columns]


In [69]:
len(df["unit"].unique())

34

In [6]:

df_clean = df.dropna()

# Step 2: One-hot encode the 'group_id' column
df_encoded = pd.get_dummies(df_clean, columns=['group_id'], prefix='group')



In [7]:
df_encoded = pd.get_dummies(df_encoded, columns=['unit'], prefix='unit')


In [8]:
df_encoded

Unnamed: 0,entity_name,value,group_101697,group_104874,group_106003,group_107694,group_107915,group_108478,group_110833,group_111572,...,unit_millimetre,unit_nits,unit_ounce,unit_person,unit_pint,unit_pound,unit_quart,unit_ton,unit_volt,unit_watt
0,item_weight,500.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,item_volume,1.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,item_weight,0.709,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,item_weight,0.709,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,item_weight,1400,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263854,height,5.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
263855,height,8.5,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
263856,height,43.2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
263857,height,9.1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [70]:

# Step 3: Define the feature set (X) and target (y)
X = df_encoded.drop(columns=['entity_name'])  # Drop the target column 'unit' from features
y = df_encoded['entity_name']  # Target column 'unit'


In [71]:
X

Unnamed: 0,group_id,value,unit
0,748919,500.0,gram
1,916768,1.0,cup
2,459516,0.709,gram
3,459516,0.709,gram
4,731432,1400,milligram
...,...,...,...
263854,558806,5.0,centimetre
263855,470067,8.5,inch
263856,204245,43.2,centimetre
263857,752266,9.1,centimetre


In [74]:
y.unique()

array(['item_weight', 'item_volume', 'voltage', 'wattage',
       'maximum_weight_recommendation', 'height', 'depth', 'width'],
      dtype=object)

In [73]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Split the data into training and testing sets


# Step 5: Stratified train-test split to maintain class distribution in training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a RandomForestClassifier
model = RandomForestClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


ValueError: could not convert string to float: 'kilogram'

In [13]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Generate the classification report (this includes precision, recall, F1-score, and support)
print("Classification Report:\n")
print(classification_report(y_test, y_pred))

# Compute and print additional metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # 'weighted' handles class imbalance
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nAdditional Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted): {recall:.4f}")
print(f"F1-Score (weighted): {f1:.4f}")


Classification Report:

                               precision    recall  f1-score   support

                        depth       0.68      0.70      0.69      9025
                       height       0.65      0.67      0.66      8719
                  item_volume       1.00      0.99      1.00      1506
                  item_weight       1.00      1.00      1.00     20303
maximum_weight_recommendation       0.88      0.91      0.90       650
                      voltage       1.00      1.00      1.00      1538
                      wattage       1.00      1.00      1.00      1539
                        width       0.56      0.53      0.55      8837

                     accuracy                           0.81     52117
                    macro avg       0.85      0.85      0.85     52117
                 weighted avg       0.81      0.81      0.81     52117


Additional Metrics:
Accuracy: 0.8096
Precision (weighted): 0.8086
Recall (weighted): 0.8096
F1-Score (weighted): 0.8089


In [39]:
test_df = pd.read_csv("test_out3.csv")

In [40]:
test_df.head(10)

Unnamed: 0,index,image_link,group_id,entity_name,ocr_output,output
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height,{'<OCR>': '2.63in6.68cm36in - 78in91.44cm - 19...,"2.63 in, 6.68 cm, 36.0 in, 78.0 in, 91.44 cm, ..."
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width,{'<OCR>': 'One Size'},
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height,{'<OCR>': 'One Size'},
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth,{'<OCR>': 'One Size'},
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth,{'<OCR>': 'One Size'},
5,5,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,height,{'<OCR>': 'One Size'},
6,6,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,width,{'<OCR>': 'One Size'},
7,7,https://m.media-amazon.com/images/I/11lshEUmCr...,156839,height,{'<OCR>': '1.4in3.56cm54in137.16cm'},"1.4 in, 3.56 cm, 54.0 in, 137.16 cm"
8,8,https://m.media-amazon.com/images/I/21+i52HRW4...,478357,width,{'<OCR>': '40 cm30 cm15 cm'},"40.0 cm, 30.0 cm, 15.0 cm"
9,9,https://m.media-amazon.com/images/I/21-LmSmehZ...,478357,height,{'<OCR>': '40 cm30 cm15 cm'},"40.0 cm, 30.0 cm, 15.0 cm"


In [41]:
test_df = test_df[["index", "group_id","entity_name", "output"]]

In [53]:
df = test_df

In [54]:

# Step 1: Define a function to split the 'output' and replicate the row
def expand_row(row):
    if pd.notna(row['output']):  # Only process rows where 'output' is not NaN
        split_values = [val.strip() for val in row['output'].split(',')]
        return pd.DataFrame({
            'index': [row['index']] * len(split_values),
            'group_id': [row['group_id']] * len(split_values),
            'entity_name': [row['entity_name']] * len(split_values),
            'output': split_values
        })
    else:
        return pd.DataFrame([row])  # Return the row as is if 'output' is NaN

# Step 2: Apply the function to each row and concatenate the results
expanded_df = pd.concat([expand_row(row) for _, row in df.iterrows()], ignore_index=True)

# Step 3: Sort the DataFrame by 'index' to maintain the original row order
expanded_df = expanded_df.sort_values('index').reset_index(drop=True)

# Display the expanded DataFrame
print(expanded_df)


         index  group_id                    entity_name    output
0            0    156839                         height   2.63 in
1            0    156839                         height   6.68 cm
2            0    156839                         height   36.0 in
3            0    156839                         height   78.0 in
4            0    156839                         height  91.44 cm
...        ...       ...                            ...       ...
435811  131283    721522  maximum_weight_recommendation  500.0 lb
435812  131284    603688                    item_weight       NaN
435813  131285    603688  maximum_weight_recommendation       NaN
435814  131286    853009                    item_weight       NaN
435815  131287    853009  maximum_weight_recommendation       NaN

[435816 rows x 4 columns]


In [55]:
expanded_df

Unnamed: 0,index,group_id,entity_name,output
0,0,156839,height,2.63 in
1,0,156839,height,6.68 cm
2,0,156839,height,36.0 in
3,0,156839,height,78.0 in
4,0,156839,height,91.44 cm
...,...,...,...,...
435811,131283,721522,maximum_weight_recommendation,500.0 lb
435812,131284,603688,item_weight,
435813,131285,603688,maximum_weight_recommendation,
435814,131286,853009,item_weight,


In [56]:
expanded_df = expanded_df.dropna()


In [57]:
import pandas as pd
import re

# Corrected function to split 'output' into 'value' and 'unit'
def split_entity_value(value):
    # Enhanced regex to handle numbers and unit with potential special characters
    match = re.match(r"([\d\.]+)\s*(\S+)", value)
    if match:
        return match.groups()
    return (None, None)  # Handle cases where regex doesn't match

# Step 1: Apply the split_entity_value function to the 'output' column
expanded_df[['value', 'unit']] = expanded_df['output'].apply(split_entity_value).apply(pd.Series)

# Step 2: Drop the original 'output' column (as it's now split into 'value' and 'unit')
expanded_df = expanded_df.drop(columns=['output'])

# Display the resulting DataFrame
print(expanded_df)


         index  group_id                    entity_name  value unit
0            0    156839                         height   2.63   in
1            0    156839                         height   6.68   cm
2            0    156839                         height   36.0   in
3            0    156839                         height   78.0   in
4            0    156839                         height  91.44   cm
...        ...       ...                            ...    ...  ...
435806  131280    695048                    item_weight    3.0   lb
435807  131280    695048                    item_weight   1.59    g
435808  131281    724618                    item_weight   90.0   lb
435809  131281    724618                    item_weight  160.0    g
435811  131283    721522  maximum_weight_recommendation  500.0   lb

[419850 rows x 5 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  expanded_df[['value', 'unit']] = expanded_df['output'].apply(split_entity_value).apply(pd.Series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  expanded_df[['value', 'unit']] = expanded_df['output'].apply(split_entity_value).apply(pd.Series)


In [78]:
expanded_df.head(30)

Unnamed: 0,index,group_id,entity_name,value,unit
0,0,156839,height,2.63,inch
1,0,156839,height,6.68,centimetre
2,0,156839,height,36.0,inch
3,0,156839,height,78.0,inch
4,0,156839,height,91.44,centimetre
5,0,156839,height,199.39,centimetre
12,7,156839,height,137.16,centimetre
13,7,156839,height,54.0,inch
14,7,156839,height,3.56,centimetre
15,7,156839,height,1.4,inch


In [76]:
len(expanded_df["unit"].unique())

20

In [58]:

# Define abbreviations for units
unit_abbreviations = {
    "centimetre": ["cm", "centimeter"],
    "foot": ["ft", "'"],
    "millimetre": ["mm", "millimeter"],
    "metre": ["m", "meter"],
    "inch": ["in", '"'],
    "yard": ["yd"],
    "milligram": ["mg"],
    "kilogram": ["kg"],
    "microgram": ["μg", "ug", "mcg"],
    "gram": ["gm", "g"],
    "ounce": ["oz"],
    "ton": ["t"],
    "pound": ["lb", "lbs"],
    "millivolt": ["mV","mv"],
    "kilovolt": ["kV", "kv"],
    "volt": ["V", "v"],
    "kilowatt": ["kW", "kw"],
    "watt": ["W", "w"],
    "cubic foot": ["ft", "cu ft"],
    "microlitre": ["μL", "uL", "microliter"],
    "cup": ["c"],
    "fluid ounce": ["fl oz"],
    "centilitre": ["cL", "centiliter"],
    "imperial gallon": ["imp gal"],
    "pint": ["pt"],
    "decilitre": ["dL", "deciliter"],
    "litre": ["L", "liter"],
    "millilitre": ["mL", "milliliter"],
    "quart": ["qt"],
    "cubic inch": [ "cu i","cu in"],
    "gallon": ["gal"]
}

# Step 1: Reverse the dictionary to map abbreviations to full names
reverse_unit_map = {abbr: full_name for full_name, abbr_list in unit_abbreviations.items() for abbr in abbr_list}

# Step 2: Replace the abbreviations in the 'unit' column using the reverse map
expanded_df['unit'] = expanded_df['unit'].map(reverse_unit_map)

# Display the resulting DataFrame
print(expanded_df)


         index  group_id                    entity_name  value        unit
0            0    156839                         height   2.63        inch
1            0    156839                         height   6.68  centimetre
2            0    156839                         height   36.0        inch
3            0    156839                         height   78.0        inch
4            0    156839                         height  91.44  centimetre
...        ...       ...                            ...    ...         ...
435806  131280    695048                    item_weight    3.0       pound
435807  131280    695048                    item_weight   1.59        gram
435808  131281    724618                    item_weight   90.0       pound
435809  131281    724618                    item_weight  160.0        gram
435811  131283    721522  maximum_weight_recommendation  500.0       pound

[419850 rows x 5 columns]


In [34]:

# Step 2: One-hot encode the 'group_id' column
expanded_df = pd.get_dummies(expanded_df, columns=['group_id'], prefix='group')
df_encoded = pd.get_dummies(expanded_df, columns=['unit'], prefix='unit')


In [35]:
df_encoded

Unnamed: 0,index,entity_name,value,group_100951,group_102234,group_103688,group_104874,group_106003,group_106902,group_107694,...,unit_quarts,unit_ton,unit_tons,unit_v,unit_w,unit_watt,unit_watts,unit_yard,unit_yd,unit_ın
0,0,height,2.63,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,height,6.68,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,height,36.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,height,78.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,height,91.44,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435806,131280,item_weight,3.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
435807,131280,item_weight,1.59,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
435808,131281,item_weight,90.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
435809,131281,item_weight,160.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [36]:

# Step 3: Define the feature set (X) and target (y)
X = df_encoded.drop(columns=['entity_name'])  # Drop the target column 'unit' from features
y = df_encoded['entity_name']  # Target column 'unit'


In [38]:
y.unique()

array(['height', 'width', 'depth', 'maximum_weight_recommendation',
       'item_weight', 'voltage', 'item_volume', 'wattage'], dtype=object)