In [1]:
# Upload the test set (incidents.csv) from the local machine into the Colab runtime;
# a later cell reads it from /content/incidents.csv.
from google.colab import files
uploaded = files.upload()

Saving incidents.csv to incidents.csv


In [2]:
# Upload the training set (incidents_train.csv) from the local machine into the Colab
# runtime; a later cell reads it from /content/incidents_train.csv.
from google.colab import files
uploaded = files.upload()

Saving incidents_train.csv to incidents_train.csv


In [3]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from xgboost import XGBClassifier
from gensim.models import Word2Vec
import re
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

nlp = spacy.load("en_core_web_sm")


# Data Cleaning

def clean_text(text):
    """Normalise one document: lemmatise, lower-case, drop stop words and punctuation.

    Parameters
    ----------
    text : str or missing value
        Raw text of a single cell. Missing values (None / NaN / pd.NA) are
        accepted because empty CSV cells are loaded as NaN by pandas.

    Returns
    -------
    str
        Space-joined, lower-cased lemmas of the kept tokens; an empty string
        for missing input.
    """
    # spaCy only accepts strings; a NaN from an empty CSV cell would raise here.
    if text is None or (not isinstance(text, str) and pd.isna(text)):
        return ""

    # Coerce non-string scalars (e.g. a title that was parsed as a number).
    doc = nlp(str(text))
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Read the training and test sets uploaded earlier in the notebook.
train_data = pd.read_csv("/content/incidents_train.csv")
test_data = pd.read_csv("/content/incidents.csv")

# Normalise both free-text columns of both frames with clean_text
# (train first, then test; 'title' before 'text').
for frame in (train_data, test_data):
    for column in ('title', 'text'):
        frame[column] = frame[column].apply(clean_text)

ST2


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
import zipfile
import gc

# Compressing CSV Files
def compress_csv_files(folder, output_zip):
    """Bundle every ``.csv`` file found under ``folder`` into one compressed zip.

    Parameters
    ----------
    folder : str
        Directory that is walked recursively for files ending in ``.csv``.
    output_zip : str
        Path of the zip archive to create (overwritten if it exists).

    Notes
    -----
    Files are stored under their bare file name (no sub-directory), so two
    CSVs with the same name in different sub-folders end up as duplicate
    entries in the archive.
    """
    # ZipFile defaults to ZIP_STORED (no compression); request deflate explicitly.
    with zipfile.ZipFile(output_zip, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
        for root, _, filenames in os.walk(folder):
            for filename in filenames:
                if filename.endswith(".csv"):
                    zipf.write(os.path.join(root, filename), arcname=filename)
    print(f"All predictions saved to {output_zip}")

# Handling Rare Classes
def handle_rare_classes(data, target, threshold=5):
    """Collapse infrequent labels of ``data[target]`` into a single ``'other'`` class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the label column. It is not modified.
    target : hashable
        Name of the label column.
    threshold : int, default 5
        Labels occurring fewer than ``threshold`` times are replaced.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which every rare label in ``target`` has been
        replaced by the string ``'other'``.
    """
    class_counts = data[target].value_counts()
    rare_classes = class_counts[class_counts < threshold].index

    # Work on a copy so the caller's frame keeps its original labels and
    # re-running a notebook cell always starts from the same data.
    result = data.copy()
    result[target] = result[target].mask(result[target].isin(rare_classes), 'other')
    return result

from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Training Model for ST2
def train_model_st2(X_train, X_valid, y_train, y_valid, model_type):
    """Train one ST2 classifier on TF-IDF features and score it on the validation split.

    Parameters
    ----------
    X_train, X_valid : iterable of str
        Training / validation documents fed to the TF-IDF vectorizer.
    y_train, y_valid : pandas.Series
        Class labels aligned with ``X_train`` / ``X_valid``.
    model_type : {'logistic_regression', 'xgboost'}
        Which classifier to fit.

    Returns
    -------
    tuple
        ``(model, f1, vectorizer, label_mapping)`` where ``f1`` is the macro
        F1 on the validation split and ``label_mapping`` maps each original
        label to the integer id the model was trained on.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values.
    """
    # Fail fast on a bad model name, before any expensive vectorisation.
    supported_models = ('logistic_regression', 'xgboost')
    if model_type not in supported_models:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_models}"
        )

    # Build the label <-> id mapping from both splits so validation-only
    # labels still receive an id.
    # NOTE(review): XGBClassifier appears to expect the training labels to
    # cover 0..n_classes-1; a label present only in y_valid would leave a gap
    # in y_train_mapped — confirm this cannot happen with the current split.
    all_labels = pd.concat([y_train, y_valid], axis=0).unique()
    label_mapping = {label: idx for idx, label in enumerate(sorted(all_labels))}

    y_train_mapped = y_train.map(label_mapping)
    y_valid_mapped = y_valid.map(label_mapping)

    # TF-IDF Vectorization (fitted on the training split only)
    vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')
    X_train_vec = vectorizer.fit_transform(X_train)
    X_valid_vec = vectorizer.transform(X_valid)

    # Model Selection
    if model_type == 'logistic_regression':
        model = LogisticRegression(max_iter=300, class_weight='balanced', n_jobs=-1)
    else:  # 'xgboost'
        model = XGBClassifier(
            n_estimators=50,
            learning_rate=0.1,
            max_depth=4,
            verbosity=1,
            n_jobs=-1
        )

    # Handling class imbalance using SMOTE.
    # SMOTE interpolates between a sample and its neighbours within the same
    # class, so a class needs at least 2 samples (k_neighbors >= 1). Classes
    # with a single training sample are left as they are instead of crashing
    # the whole run with k_neighbors == 0.
    class_counts = y_train_mapped.value_counts()
    majority_size = int(class_counts.max())
    resamplable = class_counts[(class_counts >= 2) & (class_counts < majority_size)]

    if resamplable.empty:
        # Nothing SMOTE can legally oversample: train on the data as is.
        X_train_vec_res, y_train_res = X_train_vec, y_train_mapped
    else:
        # Keep k_neighbors <= (smallest resampled class size - 1), capped at 2.
        k_neighbors = min(2, int(resamplable.min()) - 1)
        # Oversample every eligible class up to the majority class size.
        sampling_strategy = {int(label): majority_size for label in resamplable.index}
        smote = SMOTE(
            random_state=42,
            k_neighbors=k_neighbors,
            sampling_strategy=sampling_strategy,
        )
        X_train_vec_res, y_train_res = smote.fit_resample(X_train_vec, y_train_mapped)

    # Training the model
    model.fit(X_train_vec_res, y_train_res)

    # Predicting and evaluating
    y_pred = model.predict(X_valid_vec)
    f1 = f1_score(y_valid_mapped, y_pred, average='macro', zero_division=0)

    return model, f1, vectorizer, label_mapping



# Predicting Test Data for ST2
def predict_test_data_st2(test_data, feature, model, vectorizer, label_mapping):
    """Predict original labels for ``test_data[feature]`` with a fitted model.

    The column is transformed with ``vectorizer``, classified with ``model``,
    and the predicted integer ids are translated back to labels by inverting
    ``label_mapping``. Returns a pandas Series of labels in row order.
    """
    id_to_label = dict(zip(label_mapping.values(), label_mapping.keys()))
    predicted_ids = model.predict(vectorizer.transform(test_data[feature]))
    return pd.Series(predicted_ids).map(id_to_label)

# Running ST2
def run_st2(train_data, test_data):
    """Run every feature x model combination for ST2 and save the test predictions.

    For each feature ('title', 'text') and model ('logistic_regression',
    'xgboost') a classifier is trained per target ('hazard', 'product') on a
    stratified 80/20 split, its validation macro-F1 is printed, and the
    test-set predictions for both targets are written to
    ``predictions_ST2/ST2_{feature}_{model_type}_submission.csv``. Finally all
    CSVs are bundled into ``predictions_ST2.zip``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame with the feature and target columns. Not modified.
    test_data : pandas.DataFrame
        Test frame with the feature columns.
    """
    features = ['title', 'text']
    tasks = ['hazard', 'product']
    models = ['logistic_regression', 'xgboost']

    # Ensuring predictions folder exists
    os.makedirs("predictions_ST2", exist_ok=True)

    # Collapse rare classes once, up front: the result does not depend on the
    # feature or the model, so repeating it inside the loops is wasted work.
    # A copy is used so the caller's frame keeps its original labels.
    prepared = train_data.copy()
    for target in tasks:
        prepared = handle_rare_classes(prepared, target, threshold=2)

    for feature in features:
        for model_type in models:
            print(f"Running ST2 using {feature} with {model_type}")
            predictions = pd.DataFrame()

            for target in tasks:
                print(f"  Processing target: {target}")

                # Stratified splitting (fixed seed => same split for every model)
                splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
                train_idx, valid_idx = next(splitter.split(prepared[feature], prepared[target]))
                X_train, X_valid = prepared[feature].iloc[train_idx], prepared[feature].iloc[valid_idx]
                y_train, y_valid = prepared[target].iloc[train_idx], prepared[target].iloc[valid_idx]

                # Training and evaluating the model
                model, f1, vectorizer, label_mapping = train_model_st2(X_train, X_valid, y_train, y_valid, model_type)
                print(f"    F1 Score for {target}: {f1:.4f}")

                # Predicting on test data
                test_predictions = predict_test_data_st2(test_data, feature, model, vectorizer, label_mapping)
                predictions[target] = test_predictions

            # Saving predictions
            predictions.to_csv(f"predictions_ST2/ST2_{feature}_{model_type}_submission.csv", index=False)

    # Compressing results
    compress_csv_files("predictions_ST2", "predictions_ST2.zip")

# Running the ST2 task: trains every feature/model/target combination, prints the
# validation macro-F1 for each, and writes the test predictions to predictions_ST2/
# and predictions_ST2.zip.
run_st2(train_data, test_data)


Running ST2 using title with logistic_regression
  Processing target: hazard
    F1 Score for hazard: 0.3245
  Processing target: product
    F1 Score for product: 0.3181
Running ST2 using title with xgboost
  Processing target: hazard
    F1 Score for hazard: 0.2910
  Processing target: product
    F1 Score for product: 0.2726
Running ST2 using text with logistic_regression
  Processing target: hazard
    F1 Score for hazard: 0.3614
  Processing target: product
    F1 Score for product: 0.2034
Running ST2 using text with xgboost
  Processing target: hazard
    F1 Score for hazard: 0.3018
  Processing target: product


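Based on the macro-F1 scores above, the best-performing system overall is logistic regression on the `title` feature (hazard 0.3245, product 0.3181). Logistic regression on `text` scores higher on hazard (0.3614) but much lower on product (0.2034), and the product score for XGBoost on `text` is not shown in the output above.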

In [2]:
# Upload the chosen ST2 predictions (ST2_title_logistic_regression_submission_1.csv);
# the merge cell below reads them from /content/.
from google.colab import files
uploaded = files.upload()

Saving ST2_title_logistic_regression_submission_1.csv to ST2_title_logistic_regression_submission_1.csv


In [3]:
# Upload the ST1 predictions (submission.csv); the merge cell below reads them
# from /content/submission.csv.
from google.colab import files
uploaded = files.upload()

Saving submission.csv to submission.csv


In [9]:
import os
import pandas as pd
import zipfile

In [10]:
# Merge the ST1 (category) and ST2 (hazard / product) predictions into a single
# submission.csv and zip it.

# File paths for the ST1 and ST2 outputs
st1_file = "/content/submission.csv"
st2_file = "/content/ST2_title_logistic_regression_submission_1.csv"

# Fail early with a clear message if either upload is missing.
for required_file in (st1_file, st2_file):
    if not os.path.exists(required_file):
        raise FileNotFoundError(f"{required_file} not found.")

# Loading the CSV files into DataFrames
st1_df = pd.read_csv(st1_file)
st2_df = pd.read_csv(st2_file)

# Keep only the prediction columns from each file
st1_df = st1_df[['hazard-category', 'product-category']]
st2_df = st2_df[['hazard', 'product']]

# Both files must describe the same test rows: a column-wise concat of frames
# with different lengths would silently pad the shorter one with NaN.
if len(st1_df) != len(st2_df):
    raise ValueError(
        f"Row count mismatch: {st1_file} has {len(st1_df)} rows, "
        f"{st2_file} has {len(st2_df)} rows."
    )
merged_df = pd.concat([st1_df, st2_df], axis=1)

# Adding a sequential column (1-based row number, header is a single space)
merged_df.insert(0, ' ', range(1, len(merged_df) + 1))

# Saving the merged DataFrame
# NOTE(review): with the working directory at /content this overwrites the ST1
# input read above — confirm that is intended. Re-running still works because
# the ST1 columns are selected by name.
submission_file = "submission.csv"
merged_df.to_csv(submission_file, index=False)

# Zipping the submission file (deflate; ZipFile's default stores uncompressed)
zip_file_name = "submission.zip"
with zipfile.ZipFile(zip_file_name, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(submission_file)

print(f"Merged submission saved as {submission_file}")
print(f"Zipped submission saved as {zip_file_name}")

Merged submission saved as submission.csv
Zipped submission saved as submission.zip
