In [None]:
!pip install pandas scikit-learn keras




# Section 1: Run classification on Text only

## Section 1.1: Train one ANN with separate output heads for hazard-category and product-category



In [None]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input, Dense
from keras.utils import to_categorical
from keras.initializers import RandomNormal
from sklearn.metrics import confusion_matrix, classification_report
from keras.optimizers import Adam


# Load the training data from the CSV file
df = pd.read_csv('/content/final_cleaned_train.csv')

# Section 1 classifies on the 'text' column only; 'title' is not used here.
df['combined_text'] =  df['text']

# Clean the text
def clean_text(text):
    """Lowercase ``text`` and remove every ASCII punctuation character.

    Args:
        text: The raw document. ``None`` and float NaN (how pandas represents
            an empty cell) are treated as an empty document; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The lowercased string with all characters of ``string.punctuation``
        removed.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    # A translation table removes each punctuation character literally.
    # The previous regex character class left backslashes in place because
    # the unescaped "\]" inside it was parsed as an escaped bracket.
    return text.lower().translate(str.maketrans("", "", string.punctuation))

# Normalize every document with clean_text before vectorizing.
df['cleaned_text'] = df['combined_text'].apply(clean_text)

# Count Vectorization
# The vocabulary is fitted on the full dataset, i.e. before the train/test
# split below, so words that only occur in test rows are still features.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_text']).toarray()  # Convert to dense array
y_hazard = df['hazard'].values  # Use hazard for labels
y_product = df['product'].values  # Use product for labels

# Encode labels
# String labels -> integer class ids -> one-hot rows (one column per class).
label_encoder_hazard = LabelEncoder()
y_hazard_encoded = label_encoder_hazard.fit_transform(y_hazard)
y_hazard_categorical = to_categorical(y_hazard_encoded)

label_encoder_product = LabelEncoder()
y_product_encoded = label_encoder_product.fit_transform(y_product)
y_product_categorical = to_categorical(y_product_encoded)

# Split the dataset into training and testing sets
# One call splits the features and both label arrays together so their rows
# stay aligned; 20% is held out and random_state fixes the shuffle.
X_train, X_test, y_hazard_train, y_hazard_test, y_product_train, y_product_test = train_test_split(
    X, y_hazard_categorical, y_product_categorical, test_size=0.2, random_state=42
)

# Shared trunk: the bag-of-words vector feeds two ReLU layers (64 -> 32 units).
input_layer = Input(shape=(X_train.shape[1],))
hidden_layer = input_layer
for units in (64, 32):
    hidden_layer = Dense(
        units,
        activation='relu',
        # A fresh initializer instance is created for every layer.
        kernel_initializer=RandomNormal(mean=0.0, stddev=0.05),
    )(hidden_layer)

# Two softmax heads on top of the shared trunk, one unit per class.
n_hazard_classes = y_hazard_categorical.shape[1]
n_product_classes = y_product_categorical.shape[1]

hazard_output = Dense(
    n_hazard_classes,
    activation='softmax',
    kernel_initializer=RandomNormal(mean=0.0, stddev=0.05),
    name='hazard_output',
)(hidden_layer)
product_output = Dense(
    n_product_classes,
    activation='softmax',
    kernel_initializer=RandomNormal(mean=0.0, stddev=0.05),
    name='product_output',
)(hidden_layer)

# One model, one input, two outputs (hazard first, product second).
model = Model(inputs=input_layer, outputs=[hazard_output, product_output])


# Learning rate for Adam. The string alias 'adam' would use Keras' default
# rate instead, so the optimizer *instance* must be handed to compile().
# NOTE(review): an earlier comment targeted 0.0001 while the code sets 0.01;
# the coded value is kept -- confirm which one is intended.
custom_learning_rate = 0.01
optimizer = Adam(learning_rate=custom_learning_rate)

# Compile the model
# The single loss string is applied to both output heads.
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy', 'accuracy'])  # Specify accuracy for both outputs

# Train the model
# validation_data is an (inputs, targets) tuple. Note that the same held-out
# split is used for per-epoch validation here and for evaluate() below.
history = model.fit(X_train, [y_hazard_train, y_product_train],
                    epochs=10,
                    batch_size=32,
                    validation_data=(X_test, [y_hazard_test, y_product_test]))

# Evaluate the model
results = model.evaluate(X_test, [y_hazard_test, y_product_test])



Epoch 1/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 110ms/step - hazard_output_accuracy: 0.1471 - hazard_output_loss: 4.3363 - loss: 11.1491 - product_output_accuracy: 0.0172 - product_output_loss: 6.8128 - val_hazard_output_accuracy: 0.3442 - val_hazard_output_loss: 3.0231 - val_loss: 9.2982 - val_product_output_accuracy: 0.0586 - val_product_output_loss: 6.2964
Epoch 2/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 61ms/step - hazard_output_accuracy: 0.3850 - hazard_output_loss: 2.7127 - loss: 8.5161 - product_output_accuracy: 0.0666 - product_output_loss: 5.8035 - val_hazard_output_accuracy: 0.4774 - val_hazard_output_loss: 2.5423 - val_loss: 8.6468 - val_product_output_accuracy: 0.0938 - val_product_output_loss: 6.1292
Epoch 3/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 61ms/step - hazard_output_accuracy: 0.5501 - hazard_output_loss: 2.0661 - loss: 7.1829 - product_output_accuracy: 0.1292 - product_output_los

## Section 1.2: Print individual class confusion metrics and overall accuracy

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# The model has two heads, so predict() yields one probability array per head.
y_hazard_pred, y_product_pred = model.predict(X_test)

# Most probable class id for every test sample.
y_hazard_pred_classes = y_hazard_pred.argmax(axis=1)
y_product_pred_classes = y_product_pred.argmax(axis=1)

# The true labels are one-hot rows; argmax recovers their integer class ids.
y_hazard_test_encoded = y_hazard_test.argmax(axis=1)
y_product_test_encoded = y_product_test.argmax(axis=1)

# Rows are true classes, columns are predicted classes.
confusion_hazard = confusion_matrix(
    y_true=y_hazard_test_encoded,
    y_pred=y_hazard_pred_classes,
)
confusion_product = confusion_matrix(
    y_true=y_product_test_encoded,
    y_pred=y_product_pred_classes,
)

# Print confusion matrices and counts for each class
def print_confusion_matrix(confusion, label):
    """Print a confusion matrix followed by one-vs-rest counts per class.

    Args:
        confusion: Square 2-D NumPy array where ``confusion[t, p]`` counts the
            samples of true class ``t`` that were predicted as class ``p``.
        label: Human-readable name used in the printed heading.

    For class ``i`` the counts are:
        TP = confusion[i, i]
        FP = column ``i`` total minus TP (predicted ``i``, actually another class)
        FN = row ``i`` total minus TP (actually ``i``, predicted another class)
        TN = every remaining sample, i.e. total - TP - FP - FN
    """
    print(f"\nConfusion Matrix for {label}:")
    print(confusion)

    total = confusion.sum()
    for i in range(confusion.shape[0]):
        tp = confusion[i, i]
        fp = confusion[:, i].sum() - tp
        fn = confusion[i, :].sum() - tp
        # True negatives are the samples that involve class i neither as
        # the true label nor as the prediction.
        tn = total - tp - fp - fn

        print(f"\nClass {i}:")
        print(f"Total True Positives: {tp}")
        print(f"Total False Positives: {fp}")
        print(f"Total True Negatives: {tn}")
        print(f"Total False Negatives: {fn}")

print_confusion_matrix(confusion_hazard, 'Hazard Category')
print_confusion_matrix(confusion_product, 'Product Category')

# Print classification reports
# zero_division=0 scores undefined precision/recall/F1 (classes without
# predicted or true samples) as 0, the same value the default uses, but
# without emitting one UndefinedMetricWarning per affected class.
print("\nHazard Classification Report:")
print(classification_report(y_hazard_test_encoded, y_hazard_pred_classes, zero_division=0))

print("Product Classification Report:")
print(classification_report(y_product_test_encoded, y_product_pred_classes, zero_division=0))


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step

Confusion Matrix for Hazard Category:
[[2 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 4 0]
 [0 0 0 ... 0 0 0]]

Class 0:
Total True Positives: 2
Total False Positives: 0
Total True Negatives: 3
Total False Negatives: 3

Class 1:
Total True Positives: 0
Total False Positives: 0
Total True Negatives: 2
Total False Negatives: 2

Class 2:
Total True Positives: 0
Total False Positives: 0
Total True Negatives: 1
Total False Negatives: 1

Class 3:
Total True Positives: 4
Total False Positives: 4
Total True Negatives: 2
Total False Negatives: 2

Class 4:
Total True Positives: 0
Total False Positives: 7
Total True Negatives: 5
Total False Negatives: 5

Class 5:
Total True Positives: 6
Total False Positives: 9
Total True Negatives: 4
Total False Negatives: 4

Class 6:
Total True Positives: 0
Total False Positives: 0
Total True Negatives: 1
Total False Negatives: 1

Class 7:
T

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Section 1.3: Print overall F1 scores

In [None]:
from sklearn.metrics import f1_score

def compute_score(hazards_true, products_true, hazards_pred, products_pred):
    """Return ``(overall_f1, f1_hazards, f1_products)`` as macro-averaged F1 scores.

    The hazard score covers every sample. The product score is computed only
    on the samples whose hazard prediction equals the true hazard. The overall
    score is the plain mean of the two.

    The label arguments are indexed with a boolean mask, so they are expected
    to be NumPy arrays of equal length.
    """
    f1_hazards = f1_score(hazards_true, hazards_pred, average='macro')

    # Samples where the hazard head was right; only these count for products.
    hazard_is_correct = hazards_pred == hazards_true
    f1_products = f1_score(
        products_true[hazard_is_correct],
        products_pred[hazard_is_correct],
        average='macro',
    )

    overall_f1 = (f1_hazards + f1_products) / 2
    return overall_f1, f1_hazards, f1_products

# Score the held-out split using the integer class ids computed in Section 1.2.
hazards_true = y_hazard_test_encoded
products_true = y_product_test_encoded
hazards_pred = y_hazard_pred_classes
products_pred = y_product_pred_classes

overall_f1, f1_hazards, f1_products = compute_score(
    hazards_true=hazards_true,
    products_true=products_true,
    hazards_pred=hazards_pred,
    products_pred=products_pred,
)

print(f'Overall F1 Score: {overall_f1:.2f}')
print(f'Hazard F1 Score: {f1_hazards:.2f}')
print(f'Product F1 Score: {f1_products:.2f}')


Overall F1 Score: 0.19
Hazard F1 Score: 0.25
Product F1 Score: 0.13


# *End of Section 1*

# Section 2: Run classification on Title only

## Section 2.1: Train one ANN with separate output heads for hazard-category and product-category

In [None]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input, Dense
from keras.utils import to_categorical
from keras.initializers import RandomNormal
from sklearn.metrics import confusion_matrix, classification_report
from keras.optimizers import Adam


# Load the training data from the CSV file
df = pd.read_csv('/content/final_cleaned_train.csv')

# Section 2 classifies on the 'title' column only; 'text' is not used here.
df['combined_text'] =  df['title']

# Clean the text
def clean_text(text):
    """Lowercase ``text`` and remove every ASCII punctuation character.

    Args:
        text: The raw document. ``None`` and float NaN (how pandas represents
            an empty cell) are treated as an empty document; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The lowercased string with all characters of ``string.punctuation``
        removed.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    # A translation table removes each punctuation character literally.
    # The previous regex character class left backslashes in place because
    # the unescaped "\]" inside it was parsed as an escaped bracket.
    return text.lower().translate(str.maketrans("", "", string.punctuation))

# Normalize every document with clean_text before vectorizing.
df['cleaned_text'] = df['combined_text'].apply(clean_text)

# Count Vectorization
# The vocabulary is fitted on the full dataset, i.e. before the train/test
# split below, so words that only occur in test rows are still features.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_text']).toarray()  # Convert to dense array
y_hazard = df['hazard'].values  # Use hazard for labels
y_product = df['product'].values  # Use product for labels

# Encode labels
# String labels -> integer class ids -> one-hot rows (one column per class).
label_encoder_hazard = LabelEncoder()
y_hazard_encoded = label_encoder_hazard.fit_transform(y_hazard)
y_hazard_categorical = to_categorical(y_hazard_encoded)

label_encoder_product = LabelEncoder()
y_product_encoded = label_encoder_product.fit_transform(y_product)
y_product_categorical = to_categorical(y_product_encoded)

# Split the dataset into training and testing sets
# One call splits the features and both label arrays together so their rows
# stay aligned; 20% is held out and random_state fixes the shuffle.
X_train, X_test, y_hazard_train, y_hazard_test, y_product_train, y_product_test = train_test_split(
    X, y_hazard_categorical, y_product_categorical, test_size=0.2, random_state=42
)

# Shared trunk: the bag-of-words vector feeds two ReLU layers (64 -> 32 units).
input_layer = Input(shape=(X_train.shape[1],))
hidden_layer = input_layer
for units in (64, 32):
    hidden_layer = Dense(
        units,
        activation='relu',
        # A fresh initializer instance is created for every layer.
        kernel_initializer=RandomNormal(mean=0.0, stddev=0.05),
    )(hidden_layer)

# Two softmax heads on top of the shared trunk, one unit per class.
n_hazard_classes = y_hazard_categorical.shape[1]
n_product_classes = y_product_categorical.shape[1]

hazard_output = Dense(
    n_hazard_classes,
    activation='softmax',
    kernel_initializer=RandomNormal(mean=0.0, stddev=0.05),
    name='hazard_output',
)(hidden_layer)
product_output = Dense(
    n_product_classes,
    activation='softmax',
    kernel_initializer=RandomNormal(mean=0.0, stddev=0.05),
    name='product_output',
)(hidden_layer)

# One model, one input, two outputs (hazard first, product second).
model = Model(inputs=input_layer, outputs=[hazard_output, product_output])


# Learning rate for Adam. The string alias 'adam' would use Keras' default
# rate instead, so the optimizer *instance* must be handed to compile().
# NOTE(review): an earlier comment targeted 0.0001 while the code sets 0.01;
# the coded value is kept -- confirm which one is intended.
custom_learning_rate = 0.01
optimizer = Adam(learning_rate=custom_learning_rate)

# Compile the model
# The single loss string is applied to both output heads.
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy', 'accuracy'])  # Specify accuracy for both outputs

# Train the model
# validation_data is an (inputs, targets) tuple. Note that the same held-out
# split is used for per-epoch validation here and for evaluate() below.
history = model.fit(X_train, [y_hazard_train, y_product_train],
                    epochs=10,
                    batch_size=32,
                    validation_data=(X_test, [y_hazard_test, y_product_test]))

# Evaluate the model
results = model.evaluate(X_test, [y_hazard_test, y_product_test])



Epoch 1/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 45ms/step - hazard_output_accuracy: 0.1026 - hazard_output_loss: 4.5962 - loss: 11.4067 - product_output_accuracy: 0.0245 - product_output_loss: 6.8104 - val_hazard_output_accuracy: 0.1650 - val_hazard_output_loss: 3.5722 - val_loss: 9.8177 - val_product_output_accuracy: 0.0310 - val_product_output_loss: 6.2651
Epoch 2/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 27ms/step - hazard_output_accuracy: 0.2202 - hazard_output_loss: 3.4364 - loss: 9.4999 - product_output_accuracy: 0.0342 - product_output_loss: 6.0632 - val_hazard_output_accuracy: 0.2420 - val_hazard_output_loss: 3.2593 - val_loss: 9.4843 - val_product_output_accuracy: 0.0436 - val_product_output_loss: 6.2470
Epoch 3/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - hazard_output_accuracy: 0.2772 - hazard_output_loss: 2.9616 - loss: 8.8092 - product_output_accuracy: 0.0464 - product_output_loss:

## Section 2.2: Print individual class confusion metrics and overall accuracy

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# The model has two heads, so predict() yields one probability array per head.
y_hazard_pred, y_product_pred = model.predict(X_test)

# Most probable class id for every test sample.
y_hazard_pred_classes = y_hazard_pred.argmax(axis=1)
y_product_pred_classes = y_product_pred.argmax(axis=1)

# The true labels are one-hot rows; argmax recovers their integer class ids.
y_hazard_test_encoded = y_hazard_test.argmax(axis=1)
y_product_test_encoded = y_product_test.argmax(axis=1)

# Rows are true classes, columns are predicted classes.
confusion_hazard = confusion_matrix(
    y_true=y_hazard_test_encoded,
    y_pred=y_hazard_pred_classes,
)
confusion_product = confusion_matrix(
    y_true=y_product_test_encoded,
    y_pred=y_product_pred_classes,
)

# Print confusion matrices and counts for each class
def print_confusion_matrix(confusion, label):
    """Print a confusion matrix followed by one-vs-rest counts per class.

    Args:
        confusion: Square 2-D NumPy array where ``confusion[t, p]`` counts the
            samples of true class ``t`` that were predicted as class ``p``.
        label: Human-readable name used in the printed heading.

    For class ``i`` the counts are:
        TP = confusion[i, i]
        FP = column ``i`` total minus TP (predicted ``i``, actually another class)
        FN = row ``i`` total minus TP (actually ``i``, predicted another class)
        TN = every remaining sample, i.e. total - TP - FP - FN
    """
    print(f"\nConfusion Matrix for {label}:")
    print(confusion)

    total = confusion.sum()
    for i in range(confusion.shape[0]):
        tp = confusion[i, i]
        fp = confusion[:, i].sum() - tp
        fn = confusion[i, :].sum() - tp
        # True negatives are the samples that involve class i neither as
        # the true label nor as the prediction.
        tn = total - tp - fp - fn

        print(f"\nClass {i}:")
        print(f"Total True Positives: {tp}")
        print(f"Total False Positives: {fp}")
        print(f"Total True Negatives: {tn}")
        print(f"Total False Negatives: {fn}")

print_confusion_matrix(confusion_hazard, 'Hazard Category')
print_confusion_matrix(confusion_product, 'Product Category')

# Print classification reports
# zero_division=0 scores undefined precision/recall/F1 (classes without
# predicted or true samples) as 0, the same value the default uses, but
# without emitting one UndefinedMetricWarning per affected class.
print("\nHazard Classification Report:")
print(classification_report(y_hazard_test_encoded, y_hazard_pred_classes, zero_division=0))

print("Product Classification Report:")
print(classification_report(y_product_test_encoded, y_product_pred_classes, zero_division=0))


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step

Confusion Matrix for Hazard Category:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Class 0:
Total True Positives: 0
Total False Positives: 1
Total True Negatives: 5
Total False Negatives: 5

Class 1:
Total True Positives: 0
Total False Positives: 0
Total True Negatives: 2
Total False Negatives: 2

Class 2:
Total True Positives: 0
Total False Positives: 0
Total True Negatives: 1
Total False Negatives: 1

Class 3:
Total True Positives: 1
Total False Positives: 0
Total True Negatives: 5
Total False Negatives: 5

Class 4:
Total True Positives: 0
Total False Positives: 0
Total True Negatives: 5
Total False Negatives: 5

Class 5:
Total True Positives: 4
Total False Positives: 7
Total True Negatives: 6
Total False Negatives: 6

Class 6:
Total True Positives: 0
Total False Positives: 0
Total True Negatives: 1
Total False Negatives: 1

Class 7:
To

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Section 2.3: Print overall F1 scores

In [None]:
from sklearn.metrics import f1_score

def compute_score(hazards_true, products_true, hazards_pred, products_pred):
    """Return ``(overall_f1, f1_hazards, f1_products)`` as macro-averaged F1 scores.

    The hazard score covers every sample. The product score is computed only
    on the samples whose hazard prediction equals the true hazard. The overall
    score is the plain mean of the two.

    The label arguments are indexed with a boolean mask, so they are expected
    to be NumPy arrays of equal length.
    """
    f1_hazards = f1_score(hazards_true, hazards_pred, average='macro')

    # Samples where the hazard head was right; only these count for products.
    hazard_is_correct = hazards_pred == hazards_true
    f1_products = f1_score(
        products_true[hazard_is_correct],
        products_pred[hazard_is_correct],
        average='macro',
    )

    overall_f1 = (f1_hazards + f1_products) / 2
    return overall_f1, f1_hazards, f1_products

# Score the held-out split using the integer class ids computed in Section 2.2.
hazards_true = y_hazard_test_encoded
products_true = y_product_test_encoded
hazards_pred = y_hazard_pred_classes
products_pred = y_product_pred_classes

overall_f1, f1_hazards, f1_products = compute_score(
    hazards_true=hazards_true,
    products_true=products_true,
    hazards_pred=hazards_pred,
    products_pred=products_pred,
)

print(f'Overall F1 Score: {overall_f1:.2f}')
print(f'Hazard F1 Score: {f1_hazards:.2f}')
print(f'Product F1 Score: {f1_products:.2f}')


Overall F1 Score: 0.13
Hazard F1 Score: 0.14
Product F1 Score: 0.11


# *End of Section 2*

In [None]:
# Run the current `model` over every row of the feature matrix X.
# NOTE(review): X is built from the same CSV that supplied the training
# rows, and `model` is whichever network was fitted last in this kernel --
# confirm that this is the intended input for a submission file.
hazard_predictions, product_predictions = model.predict(X)

# Most probable class id per row, mapped back to the original label strings.
hazard_class_ids = np.argmax(hazard_predictions, axis=1)
product_class_ids = np.argmax(product_predictions, axis=1)
hazard_predicted_labels = label_encoder_hazard.inverse_transform(hazard_class_ids)
product_predicted_labels = label_encoder_product.inverse_transform(product_class_ids)

# One row per input document, sharing the index of the original DataFrame.
results_df = pd.DataFrame(
    data={
        'hazard': hazard_predicted_labels,
        'product': product_predicted_labels,
    },
    index=df.index,
)

# Write the index as the first column, with an empty header cell.
results_df.to_csv('submission.csv', index_label='', index=True)


[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
