In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [65]:
import pandas as pd
import numpy as np
import json
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
import matplotlib.pyplot as plt

In [53]:
# Read the data from the csv as a pandas dataframe
data = pd.read_csv('/kaggle/input/10000-records/63210b83-9b6e-46d4-8543-b013295fa2f9.csv', encoding='ISO-8859-1', on_bad_lines='skip')

In [54]:
# Original data features
data.columns

Index(['scannable_id', 'marketplace_id', 'fulfillment_shipment_id',
       'ship_date', 'ship_option', 'origin', 'items', 'item_count',
       'slam_event', 'shipment_events', 'status'],
      dtype='object')

In [55]:
# One row or record in the dataset
data.loc[1]

scannable_id                                                SB2RMLG6BF_001_v
marketplace_id                                                         44571
fulfillment_shipment_id                                      385667322702815
ship_date                                            2024-11-10 11:00:00.000
ship_option                                              Std IN EZ Metro COD
origin                     {"warehouse_id":"43VHCDQMVAGH37FPVWGAG129HM6W1...
items                      [{"asin":"B0CY8FTBQK","marketplace_id":44571,"...
item_count                                                                 1
slam_event                 {"event_date":"2024-11-10 05:15:48.234","notif...
shipment_events            [{"id":"EVENT_301-D1-FP-FRONT_PORCH_OR_FRONT_D...
status                                                         Non-compliant
Name: 1, dtype: object

In [56]:
# Original data dimensions
data.shape

(10000, 11)

In [57]:
# copy of the data for future manipulations
df = pd.DataFrame(data)

# Few of the features have data in dictionaries and list of dictionaries.
# But the dictionaries are in text format.
# Converting the text into dictionaries and then creating new columns.
# The key postfixed with the column name becomes the new column, 
# ex: col = {key_1:value, key, key_2:value}, this single entry converts into two two new cols
# col_key_1, col_key_2. The 'col' feature will be deleted from the original data



# Safely load JSON from strings or handle already-parsed data
def safe_json_loads(value):
    if isinstance(value, (list, dict)):  # Already parsed or if not a dictionary
        return value
    try:
        return json.loads(value)  # Parse stringified dictionary
    except (json.JSONDecodeError, TypeError):
        return None

# Function to expand list entries horizontally
def expand_list_horizontally(column, prefix):
    # Process one column at a time
    # Finding the max cols that can be created from the given column 

    # Check if the column is a list of dictionaries
    max_length = column.apply(lambda x: len(x) if isinstance(x, list) else 0).max()
    expanded_data = pd.DataFrame(
        column.apply(lambda x: x if isinstance(x, list) else [None] * max_length).tolist(),
        # Giving each column name a unique name by appending with a number
        columns=[f"{prefix}_{i+1}" for i in range(max_length)]
    )
    # Normalize nested dictionaries in each expanded column
    # If there are nested dictionaries then expand them further
    # Concatenate each column at the end.
    for col in expanded_data.columns:
        # The json_normalize function is used for flattening complex, nested JSON structures into a tabular format.
        expanded_data = pd.concat(
            [expanded_data, pd.json_normalize(expanded_data[col]).add_prefix(f"{col}_")],
            axis=1
        ).drop(columns=[col])
    return expanded_data

# Process `items` column (expand horizontally as specified above)
items_expanded = expand_list_horizontally(df['items'].apply(safe_json_loads), 'items')

# Process `shipment_events` column (expand horizontally as specified above)
shipment_events_expanded = expand_list_horizontally(df['shipment_events'].apply(safe_json_loads), 'shipment_events')

# Process other columns (e.g., `origin` and `slam_event`)
origin_flat = pd.json_normalize(df['origin'].apply(safe_json_loads)).add_prefix('origin_')
slam_event_flat = pd.json_normalize(df['slam_event'].apply(safe_json_loads)).add_prefix('slam_event_')

# Combine expanded data with the original DataFrame (dropping old processed columns which have dictionaries in them)
df_flattened = pd.concat([
    df.drop(columns=['origin', 'items', 'slam_event', 'shipment_events']),
    origin_flat,
    slam_event_flat,
    items_expanded,
    shipment_events_expanded
], axis=1)

# Displaying the flattened DataFrame
print(df_flattened)

          scannable_id  marketplace_id  fulfillment_shipment_id  \
0     SU3D6tmBqW_001_v           44571           95247083013202   
1     SB2RMLG6BF_001_v           44571          385667322702815   
2     SU1fjs1yGW_001_v           44571           95459826990202   
3     SBFq2q4N8D_001_v           44571          386454219702815   
4     SUNbSL3SHW_001_v           44571           95539395380202   
...                ...             ...                      ...   
9995       spQhl7dD6J5               1          368497290250201   
9996       spQs491th45               1          368473926805201   
9997       spRvK4dH1xD               1          368508832637201   
9998       spNF4szQ5Rw               1          368431508390201   
9999  SBYSLgJM8F_001_v           44571          387205086002815   

                    ship_date               ship_option  item_count  \
0     2024-11-08 04:17:45.284                std-in-10k           1   
1     2024-11-10 11:00:00.000       Std IN EZ Metro C

In [58]:
# Flattened data dimensions
df_flattened.shape

(10000, 614)

In [59]:
# New column names
df_flattened.columns

Index(['scannable_id', 'marketplace_id', 'fulfillment_shipment_id',
       'ship_date', 'ship_option', 'item_count', 'status',
       'origin_warehouse_id', 'origin_country_code', 'origin_org_unit',
       ...
       'shipment_events_1_responses.dangerous_goods.packaging_option.packaging_measurements.package_total_weight.value',
       'shipment_events_1_responses.dangerous_goods.packaging_option.packaging_measurements.package_total_weight.unit',
       'shipment_events_1_responses.dangerous_goods.packaging_option.packaging_measurements.cube_usage',
       'shipment_events_1_responses.dangerous_goods.packaging_option.packaging_measurements',
       'shipment_events_1_responses.medical_devices.rx_devices.status',
       'shipment_events_1_responses.medical_devices.rx_devices.evaluation_date',
       'shipment_events_1_responses.medical_devices.rx_devices.warehouse_id',
       'shipment_events_1_responses.medical_devices.rx_devices.authorized',
       'shipment_events_1_responses.medical

In [60]:
# Separate features and target
target_column = 'status'  
X = df_flattened.drop(columns=[target_column])
y = df_flattened[target_column]

# Handle missing values (fill with mean for numerical)
X = X.fillna(X.mean(numeric_only=True)).fillna('Unknown')  # Fill missing values
y = y.fillna('Unknown')

# Handle List Columns by converting them to strings 
def handle_mixed_types(df):
    for col in df.columns:
        # If the column contains lists, convert lists to strings
        if df[col].apply(lambda x: isinstance(x, list)).any():
            df[col] = df[col].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)
        # If the column contains booleans, convert them to integers (0/1)
        elif df[col].dtype == 'bool':
            df[col] = df[col].astype(int)
    return df

X = handle_mixed_types(X)

# Encode categorical columns using LabelEncoder
# Used to encode target labels with value between 0 and n_classes-1.
categorical_cols = X.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    # Ensure column is treated as string before encoding
    X[col] = X[col].astype(str)
    le = LabelEncoder()
    # Transforming as per the label encoder functionality
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Scale numerical features
# It ensures that features with different ranges contribute equally to the model's learning process.
scaler = StandardScaler()
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [63]:
# Feature Selection

# Correlation-based Selection
# Correlation-based feature selection is a technique used to identify and 
# select relevant features in a dataset by analyzing the correlation between 
# features and the target variable. 
# The goal is to reduce the dimensionality of the dataset 
# while retaining the most informative features.

# Create a Correlation Matrix
correlation_matrix = pd.DataFrame(X_train, columns=X.columns).corr()
# Identifies features that have a correlation coefficient greater than 0.9 with at least one other feature.
high_corr_features = correlation_matrix.columns[
    (correlation_matrix.abs() > 0.9).any() & (correlation_matrix.columns != target_column)
]

# The identified highly correlated features are dropped from both the training and testing datasets.
# This helps to reduce multicollinearity
X_train_corr = X_train.drop(columns=high_corr_features)
X_test_corr = X_test.drop(columns=high_corr_features)



# Output Selected Features
print("Correlation-based Removed Features:", high_corr_features.tolist())



Correlation-based Removed Features: ['scannable_id', 'marketplace_id', 'fulfillment_shipment_id', 'ship_date', 'ship_option', 'item_count', 'origin_warehouse_id', 'origin_country_code', 'origin_org_unit', 'origin_region', 'origin_warehouse_management_system', 'origin_warehouse_node_types', 'slam_event_event_date', 'slam_event_notification_date', 'slam_event_processing_date', 'slam_event_packaging_option_identifier', 'slam_event_packaging_option_type', 'slam_event_battery_statements', 'slam_event_unid_statements', 'slam_event_pack_labels', 'slam_event_items', 'slam_event_scale_weight.value', 'slam_event_scale_weight.unit', 'slam_event_ship_plan.uber.source.id', 'slam_event_ship_plan.uber.source.postal_code', 'slam_event_ship_plan.uber.source.country_code', 'slam_event_ship_plan.uber.destination.postal_code', 'slam_event_ship_plan.uber.destination.country_code', 'slam_event_ship_plan.uber.ship_carrier_id', 'slam_event_ship_plan.uber.ship_method', 'slam_event_ship_plan.uber.ship_method_gr

In [69]:
# Train the model
# A Random Forest Classifier is a ensemble learning method that combines multiple decision trees 
# to improve prediction accuracy and reduce overfitting.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_corr, y_train)

# Make predictions
y_pred = model.predict(X_test_corr)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the model: {accuracy * 100:.2f}%')

Accuracy of the model: 82.10%
