In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("/kaggle/input/dataset/train.csv")
df

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram
...,...,...,...,...
263854,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,height,5.0 centimetre
263855,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,height,8.5 inch
263856,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,height,43.2 centimetre
263857,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,height,9.1 centimetre


In [3]:
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'maximum_weight_recommendation': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre',
        'cubic foot',
        'cubic inch',
        'cup',
        'decilitre',
        'fluid ounce',
        'gallon',
        'imperial gallon',
        'litre',
        'microlitre',
        'millilitre',
        'pint',
        'quart'}
}

allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}

In [4]:
def convert_to_standard_unit(entity_type, value, unit):
    if(pd.isnull(value)):
        return value, unit
    
    # Define conversion factors
    length_conversions = {
        'centimetre': 1,
        'foot': 30.48,
        'inch': 2.54,
        'metre': 100,
        'millimetre': 0.1,
        'yard': 91.44
    }
    
    weight_conversions = {
        'gram': 1,
        'kilogram': 1000,
        'microgram': 1e-6,
        'milligram': 0.001,
        'ounce': 28.3495,
        'pound': 453.592,
        'ton': 1e6
    }
    
    voltage_conversions = {
        'kilovolt': 1000,
        'millivolt': 0.001,
        'volt': 1
    }
    
    wattage_conversions = {
        'kilowatt': 1000,
        'watt': 1
    }
    
    volume_conversions = {
        'centilitre': 10,
        'cubic foot': 28316.8,
        'cubic inch': 16.3871,
        'cup': 236.588,
        'decilitre': 100,
        'fluid ounce': 29.5735,
        'gallon': 3785.41,
        'imperial gallon': 4546.09,
        'litre': 1000,
        'microlitre': 0.001,
        'millilitre': 1,
        'pint': 473.176,
        'quart': 946.353,
        'ounce': 29.5735
    }
    
    # Determine the category based on entity_type
    if entity_type in ['width', 'depth', 'height']:
        category = 'length'
        conversions = length_conversions
        standard_unit = 'centimetre'
    elif entity_type in ['item_weight', 'maximum_weight_recommendation']:
        category = 'weight'
        conversions = weight_conversions
        standard_unit = 'gram'
    elif entity_type == 'voltage':
        category = 'voltage'
        conversions = voltage_conversions
        standard_unit = 'volt'
    elif entity_type == 'wattage':
        category = 'wattage'
        conversions = wattage_conversions
        standard_unit = 'watt'
    elif entity_type == 'item_volume':
        category = 'volume'
        conversions = volume_conversions
        standard_unit = 'millilitre'
    else:
        raise ValueError(f"Unknown entity type: {entity_type}")
    
    # Convert to standard unit only if it's not already in the standard unit
    if unit == standard_unit:
        return round(value, 6), standard_unit
    elif unit not in conversions:
#         print("given: ",entity_type, value, unit)
#         raise ValueError(f"Unknown unit '{unit}' for {category}")
        return value, unit
    
    converted_value = value * conversions[unit]
    
    # For volume, convert from millilitre to litre
    if category == 'volume':
        converted_value /= 1000
        standard_unit = 'litre'
    
    return converted_value, standard_unit


In [5]:
df = pd.read_csv("/kaggle/input/dataset/train.csv")
df
import pandas as pd
import re
import ast


# Drop the 'image_link' column
df = df.drop(columns=['image_link'])

# Split 'entity_value' into 'value' and 'unit'
def split_entity_value(value):
    # Regex to detect range format like "[100.0, 240.0]" or single values
    range_match = re.match(r"\[\s*([\d.]+)\s*,\s*([\d.]+)\s*\]\s*(.+)", value)
    if range_match:
        # Take the average of the range for the value, and the third group as the unit
        val = (float(range_match.group(1)) + float(range_match.group(2))) / 2
        unit = range_match.group(3)
    else:
        # Handle single value case
        value = value.split()
        val = float(value[0]) if value[0].replace('.', '', 1).isdigit() else None
        unit = ' '.join(value[1:])
    return val, unit

df[['value', 'unit']] = df['entity_value'].apply(split_entity_value).apply(pd.Series)

# Convert the 'value' column to SI units using the provided function
df['value'] = df.apply(lambda row: convert_to_standard_unit(row['entity_name'], float(row['value'] if row['value'] else 0), row['unit']), axis=1)

# Drop the original 'entity_value' column
df = df.drop(columns=['entity_value'])


# Function to split 'value' tuple into numeric value and unit
def split_value_unit(row):
    value_tuple = row['value']  # Directly access the tuple
    val = value_tuple[0]  # Extract the numeric value
    unit_from_value = value_tuple[1]  # Extract the unit from the value column
    return pd.Series([val, unit_from_value])

# Apply the split_value_unit function to create new 'value' and 'unit' columns
df[['value', 'unit_from_value']] = df.apply(split_value_unit, axis=1)

# If you want to overwrite the 'unit' column with the one from the value tuple
df['unit'] = df['unit_from_value']

# Drop the temporary 'unit_from_value' column
df = df.drop(columns=['unit_from_value'])

print(df)


        group_id  entity_name       value        unit
0         748919  item_weight  500.000000        gram
1         916768  item_volume    0.236588       litre
2         459516  item_weight    0.709000        gram
3         459516  item_weight    0.709000        gram
4         731432  item_weight    1.400000        gram
...          ...          ...         ...         ...
263854    558806       height    5.000000  centimetre
263855    470067       height   21.590000  centimetre
263856    204245       height   43.200000  centimetre
263857    752266       height    9.100000  centimetre
263858    416664       height   27.500000  centimetre

[263859 rows x 4 columns]


In [6]:
df = df.drop(columns=['unit'])
df

Unnamed: 0,group_id,entity_name,value
0,748919,item_weight,500.000000
1,916768,item_volume,0.236588
2,459516,item_weight,0.709000
3,459516,item_weight,0.709000
4,731432,item_weight,1.400000
...,...,...,...
263854,558806,height,5.000000
263855,470067,height,21.590000
263856,204245,height,43.200000
263857,752266,height,9.100000


In [7]:


# List of entity names to delete
entity_names_to_delete = ['item_weight', 'item_volume', 'voltage', 'wattage', 'maximum_weight_recommendation']

# Filter the DataFrame to keep rows where entity_name is NOT in the list
df = df[~df['entity_name'].isin(entity_names_to_delete)]

# Print the updated DataFrame
print(df)
df_clean = df.dropna()
# Step 2: One-hot encode the 'group_id' column
df_encoded = pd.get_dummies(df_clean, columns=['group_id'], prefix='group')



        group_id entity_name  value
536       442321      height  95.00
670       630869       depth  21.00
977       630869       depth  20.50
1740      507467      height   4.65
2306      237000      height  71.00
...          ...         ...    ...
263854    558806      height   5.00
263855    470067      height  21.59
263856    204245      height  43.20
263857    752266      height   9.10
263858    416664      height  27.50

[132907 rows x 3 columns]


In [8]:
df_encoded

Unnamed: 0,entity_name,value,group_101697,group_104874,group_106003,group_107694,group_107915,group_108478,group_110833,group_111572,...,group_992107,group_992707,group_993498,group_994005,group_995106,group_995200,group_995842,group_997176,group_997333,group_998545
536,height,95.00,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
670,depth,21.00,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
977,depth,20.50,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1740,height,4.65,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2306,height,71.00,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263854,height,5.00,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
263855,height,21.59,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
263856,height,43.20,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
263857,height,9.10,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:

# Step 3: Define the feature set (X) and target (y)
X = df_encoded.drop(columns=['entity_name'])  # Drop the target column 'unit' from features
y = df_encoded['entity_name']  # Target column 'unit'


In [10]:
X

Unnamed: 0,value,group_101697,group_104874,group_106003,group_107694,group_107915,group_108478,group_110833,group_111572,group_113134,...,group_992107,group_992707,group_993498,group_994005,group_995106,group_995200,group_995842,group_997176,group_997333,group_998545
536,95.00,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
670,21.00,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
977,20.50,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1740,4.65,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2306,71.00,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263854,5.00,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
263855,21.59,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
263856,43.20,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
263857,9.10,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
y.unique()

array(['height', 'depth', 'width'], dtype=object)

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import pickle

def train_and_evaluate_models(X, y):
    # Step 1: Stratified train-test split to maintain class distribution in training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Step 2: Define the models, including Neural Network and Naive Bayes
    models = {
#         'Random Forest': RandomForestClassifier(),
#         'Gradient Boosting': GradientBoostingClassifier(),
#         'SVM': SVC(),
#         'Logistic Regression': LogisticRegression(max_iter=1000),
#         'Decision Tree': DecisionTreeClassifier(),
        'Neural Network (MLP)': MLPClassifier(max_iter=1000),
#         'Naive Bayes': GaussianNB()
    }

    # Step 3: Train and evaluate each model
    for name, model in models.items():
        print(f"Training and evaluating {name}...")

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Evaluate the model
        print(f"Results for {name}:")
        print(classification_report(y_test, y_pred))
#         model_filename = f"Neural Network.pkl"  # e.g., 'Random_Forest.pkl'
        with open({name}, 'wb') as file:
            pickle.dump(model, file)
        print("-" * 80)

# Example usage:
# Assuming X and y are already defined in your environment as feature matrix and target variable respectively
train_and_evaluate_models(X, y)
# Training and evaluating Random Forest...
# Results for Random Forest:
#               precision    recall  f1-score   support

#        depth       0.68      0.71      0.69      9026
#       height       0.66      0.69      0.67      8719
#        width       0.58      0.54      0.56      8837

#     accuracy                           0.64     26582
#    macro avg       0.64      0.64      0.64     26582
# weighted avg       0.64      0.64      0.64     26582

# --------------------------------------------------------------------------------
# Training and evaluating Gradient Boosting...
# Results for Gradient Boosting:
#               precision    recall  f1-score   support

#        depth       0.52      0.72      0.61      9026
#       height       0.55      0.51      0.53      8719
#        width       0.50      0.34      0.41      8837

#     accuracy                           0.53     26582
#    macro avg       0.53      0.53      0.52     26582
# weighted avg       0.53      0.53      0.52     26582



Training and evaluating Neural Network (MLP)...
Results for Neural Network (MLP):
              precision    recall  f1-score   support

       depth       0.67      0.80      0.73      9026
      height       0.69      0.68      0.68      8719
       width       0.63      0.52      0.57      8837

    accuracy                           0.66     26582
   macro avg       0.66      0.66      0.66     26582
weighted avg       0.66      0.66      0.66     26582



TypeError: expected str, bytes or os.PathLike object, not set

In [16]:
model

NameError: name 'model' is not defined

In [13]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Generate the classification report (this includes precision, recall, F1-score, and support)
print("Classification Report:\n")
print(classification_report(y_test, y_pred))

# Compute and print additional metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # 'weighted' handles class imbalance
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nAdditional Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted): {recall:.4f}")
print(f"F1-Score (weighted): {f1:.4f}")


Classification Report:



NameError: name 'y_test' is not defined

In [None]:
test_df = pd.read_csv("/content/processed_chunk_0_output (1).csv")

In [None]:
test_df

In [None]:
test_df = test_df[["index", "group_id","entity_name", "extracted_value"]]

In [None]:
test_df