# Fetch the Data

In [None]:
import pandas as pd

def _as_xlsx_export_url(sheet_url):
    """Rewrite a Google Sheets ``/edit?gid=`` link into its ``xlsx`` export link."""
    return sheet_url.replace('/edit?gid=', '/export?format=xlsx&gid=')


# Training sheet: share link -> export link -> DataFrame.
train_sheet_url = 'https://docs.google.com/spreadsheets/d/1_R60tCWgxvUYEAueo0GDAGXBj_hZxehQnofH3vd9D_c/edit?gid=689418515'
train_url = _as_xlsx_export_url(train_sheet_url)
train_df = pd.read_excel(train_url)

# Test sheet, fetched the same way.
test_sheet_url = 'https://docs.google.com/spreadsheets/d/1rPOSIC66IoDTl0DWyvbhcx4g8uswuA4hgJBZBa14TDA/edit?gid=1695626307'
test_url = _as_xlsx_export_url(test_sheet_url)
test_df = pd.read_excel(test_url)

In [None]:
# Stack the train and test sheets into one frame. ignore_index=True gives every
# row a unique 0..n-1 label; without it each sheet keeps its own row labels and
# the combined index contains duplicates.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Snapshot taken before incomplete rows are removed; the prediction cell later
# selects the rows with a missing sub_category from it.
df_test = df.copy()

# Keep only complete rows and renumber them, so that row i of `df` corresponds
# to row i of any frame built positionally from it (e.g. the embedding frames).
df = df.dropna().reset_index(drop=True)
df.to_csv("base_data.csv", index=False)

# Create Embeddings

### Define LaBSE Model and Embedding Generation Function

In [None]:
from sentence_transformers import SentenceTransformer
# Held under its own name: the MPNet cell also assigns a module-level model,
# and a function looks its globals up at call time, so sharing one variable
# would make this function encode with whichever model was assigned last.
labse_model = SentenceTransformer('sentence-transformers/LaBSE')


def generate_embeddings_labse(texts):
    """
    Encode texts with the LaBSE sentence-transformer.

    Parameters:
    - texts: the texts to embed; passed unchanged to ``encode``

    Returns:
    - The result of ``labse_model.encode`` for ``texts`` (batch size 64, progress bar shown)
    """
    return labse_model.encode(texts, batch_size=64, show_progress_bar=True)

### Define MPNet Model and Embedding Generation Function

In [None]:
from sentence_transformers import SentenceTransformer
# Held under its own name: a function looks its globals up at call time, so
# assigning this model to a variable that another embedding function also reads
# would make both functions encode with whichever model was assigned last.
mpnet_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')


def generate_embeddings_mpnet(texts):
    """
    Encode texts with the multi-qa-mpnet-base-dot-v1 sentence-transformer.

    Parameters:
    - texts: the texts to embed; passed unchanged to ``encode``

    Returns:
    - The result of ``mpnet_model.encode`` for ``texts`` (batch size 128, progress bar shown)
    """
    return mpnet_model.encode(texts, batch_size=128, show_progress_bar=True)

### Generate and Save Embeddings with Categories

In [None]:
# The texts to embed, built once and shared by both models.
crime_texts = list(df['crimeaditionalinfo'].astype(str))

# Labels are attached with .to_numpy(), i.e. by position. Assigning the Series
# itself would align on row labels: the embedding frame has a fresh 0..n-1
# index while `df` keeps whatever labels survived concat/dropna, so rows would
# be mismatched (or the assignment would fail on duplicate labels).
embeddings_labse = generate_embeddings_labse(crime_texts)
embeddings_labse_df = pd.DataFrame(embeddings_labse)
embeddings_labse_df['category'] = df['category'].to_numpy()
embeddings_labse_df['sub_category'] = df['sub_category'].to_numpy()
embeddings_labse_df.to_csv('LaBSE.csv', index=False)


embeddings_mpnet = generate_embeddings_mpnet(crime_texts)
embeddings_mpnet_df = pd.DataFrame(embeddings_mpnet)
embeddings_mpnet_df['category'] = df['category'].to_numpy()
embeddings_mpnet_df['sub_category'] = df['sub_category'].to_numpy()
embeddings_mpnet_df.to_csv('mpnet.csv', index=False)

In [None]:
import pandas as pd
# Label columns stored next to the embedding dimensions in the cached CSVs.
label_columns = ['category', 'sub_category']

# Reload each cached embedding table, keep only the embedding dimensions and
# discard any row that contains a missing value.
embeddings_labse = pd.read_csv('LaBSE.csv').drop(columns=label_columns).dropna()
embeddings_mpnet = pd.read_csv('mpnet.csv').drop(columns=label_columns).dropna()

# Define the Models

In [None]:
from cuml.linear_model import LogisticRegression as cuLogisticRegression
from cuml.ensemble import RandomForestClassifier as cuRandomForestClassifier
from cuml.neighbors import KNeighborsClassifier as cuKNeighborsClassifier
from cuml.ensemble import RandomForestClassifier as cuExtraTreesClassifier  # Use RandomForest for ExtraTrees
from cuml.naive_bayes import GaussianNB as cuGaussianNB
from cuml.svm import SVC
from cuml.naive_bayes import BernoulliNB
from sklearn.ensemble import VotingClassifier
from cuml.multiclass import MulticlassClassifier
from cuml.naive_bayes import MultinomialNB


def _build_estimators(prefix):
    """
    Create a fresh list of (name, model) pairs for one embedding/target combination.

    Parameters:
    - prefix: string prepended to every estimator name, e.g. 'labse_category'

    Returns:
    - List of six (name, unfitted model) tuples
    """
    return [
        (f'{prefix}_log_reg', cuLogisticRegression(C=1.0, solver='qn', max_iter=1000)),
        (f'{prefix}_random_forest', cuRandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)),
        (f'{prefix}_knn', cuKNeighborsClassifier(n_neighbors=5)),
        # cuExtraTreesClassifier is this notebook's import alias for cuML's RandomForestClassifier.
        (f'{prefix}_extra_trees', cuExtraTreesClassifier(n_estimators=200, max_depth=20, random_state=42)),
        (f'{prefix}_bernoulli_nb', BernoulliNB(alpha=1.0)),
        (f'{prefix}_multi_nb', MultinomialNB()),
    ]


# One independent set of estimators per (embedding model, target column) pair.
estimators_labse_category = _build_estimators('labse_category')

estimators_mpnet_category = _build_estimators('mpnet_category')

estimators_labse_sub_category = _build_estimators('labse_sub_category')

estimators_mpnet_sub_category = _build_estimators('mpnet_sub_category')

### Model training and inference functions 

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

def train_classifiers(estimators_1, estimators_2, df1, df2, y):
    """
    Train two groups of base classifiers and a neural network stacked on top of them.

    Each group is fitted on its own feature table. Every fitted estimator then
    contributes two columns (its predicted class index and the probability of
    that class), and the network is trained on both feature tables plus those
    columns.

    Note: the base estimators' outputs are computed on the same rows they were
    fitted on, and the network is evaluated on its own training data, so the
    reported accuracy is a training accuracy.

    Parameters:
    - estimators_1: List of tuples (name, model), where each model is trained on `df1`
    - estimators_2: List of tuples (name, model), where each model is trained on `df2`
    - df1: Features for the first set of estimators (DataFrame or 2-D array)
    - df2: Features for the second set of estimators (DataFrame or 2-D array)
    - y: Integer class labels, one per row of `df1` / `df2`

    Returns:
    - Trained estimators_1 and estimators_2, and the final neural network model
    """
    def _fit_then_stack(estimators, features, tag):
        # Fit the whole group first; each estimator gets its own copy of the features.
        for name, model in estimators:
            print(f'Training {name} on {tag} -> {features.shape}')
            model.fit(features.copy(), y)

        # One predict_proba call per estimator supplies both the label and the
        # confidence column.
        labels, confidences = [], []
        for _, model in estimators:
            proba = model.predict_proba(features)
            labels.append(np.argmax(proba, axis=1))
            confidences.append(np.max(proba, axis=1))

        names = [name for name, _ in estimators]
        label_frame = pd.DataFrame(np.array(labels).T, columns=names)
        confidence_frame = pd.DataFrame(np.array(confidences).T, columns=[name + '_prob' for name in names])
        return pd.concat([label_frame, confidence_frame], axis=1)

    df_x1 = _fit_then_stack(estimators_1, df1, 'df1')
    df_x2 = _fit_then_stack(estimators_2, df2, 'df2')

    # Concatenate by position: concat(axis=1) aligns on row labels, so features
    # carrying a non-default index would otherwise be padded with extra rows
    # that fillna(0) silently turns into zeros.
    df_total = pd.concat(
        [
            pd.DataFrame(df1).reset_index(drop=True),
            pd.DataFrame(df2).reset_index(drop=True),
            df_x1,
            df_x2,
        ],
        axis=1,
    ).fillna(0)

    # Define and compile the final neural network model
    num_classes = len(np.unique(y))
    final_model = models.Sequential([
        layers.Dense(64, input_dim=df_total.shape[1], activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(num_classes, activation='softmax')  # one output per class
    ])
    final_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the network, then report its loss/accuracy on the training data
    y_one_hot = to_categorical(y, num_classes=num_classes)

    final_model.fit(df_total, y_one_hot, epochs=10, batch_size=32, verbose=1)
    final_model.evaluate(df_total, y_one_hot)

    return estimators_1, estimators_2, final_model




def make_prediction(estimators_1, estimators_2, final_model, df1, df2):
    """
    Make predictions using trained classifiers and the final neural network model.

    The model input is assembled in this column order: `df1`, `df2`, then for
    each estimator group its predicted class indices followed by the
    probabilities of those classes.

    Parameters:
    - estimators_1: List of tuples (name, model), where each model makes predictions on `df1`
    - estimators_2: List of tuples (name, model), where each model makes predictions on `df2`
    - final_model: Trained neural network model for final predictions
    - df1: Features for the first set of estimators (DataFrame or 2-D array)
    - df2: Features for the second set of estimators (DataFrame or 2-D array)

    Returns:
    - Predictions from the final model
    """
    def _stack_outputs(estimators, features):
        # One predict_proba call per estimator supplies both the label and the
        # confidence column.
        labels, confidences = [], []
        for _, model in estimators:
            proba = model.predict_proba(features)
            labels.append(np.argmax(proba, axis=1))
            confidences.append(np.max(proba, axis=1))

        names = [name for name, _ in estimators]
        label_frame = pd.DataFrame(np.array(labels).T, columns=names)
        confidence_frame = pd.DataFrame(np.array(confidences).T, columns=[name + '_prob' for name in names])
        return pd.concat([label_frame, confidence_frame], axis=1)

    # pd.concat only accepts pandas objects and aligns on row labels, so the
    # raw features are wrapped as DataFrames and renumbered 0..n-1. That keeps
    # NumPy inputs working and stops a non-default index from adding padded
    # rows that fillna(0) would silently zero out.
    df_total = pd.concat(
        [
            pd.DataFrame(df1).reset_index(drop=True),
            pd.DataFrame(df2).reset_index(drop=True),
            _stack_outputs(estimators_1, df1),
            _stack_outputs(estimators_2, df2),
        ],
        axis=1,
    ).fillna(0)

    # Use the final neural network model to make predictions
    final_predictions = final_model.predict(df_total)

    return final_predictions

def convert_number_to_category(label_encoder, predictions):
    """
    Map per-class scores back to the original category names.
    Parameters:
    - label_encoder: Fitted encoder exposing `inverse_transform`
    - predictions: Score matrix with one row per sample and one column per class
    Returns:
    - The encoder's labels for the highest-scoring class of each row
    """
    # Index of the best-scoring class in every row.
    winning_class_ids = predictions.argmax(axis=1)
    decoded_labels = label_encoder.inverse_transform(winning_class_ids)
    return decoded_labels

def pre_process_dataframe_for_making_prediction(df):
    """
    Preprocess input data by generating embeddings for prediction.
    Parameters:
    - df: Input DataFrame with a 'crimeaditionalinfo' column
    Returns:
    - df_1: DataFrame of embeddings generated by LaBSE
    - df_2: DataFrame of embeddings generated by MPNet
    """
    # Build the text list once; both models embed exactly the same strings.
    texts = list(df['crimeaditionalinfo'].astype(str))

    # The embedding functions return the raw `encode` output, so wrap it: the
    # downstream code concatenates these with other DataFrames via pd.concat,
    # which does not accept plain arrays.
    df_1 = pd.DataFrame(generate_embeddings_labse(texts))
    df_2 = pd.DataFrame(generate_embeddings_mpnet(texts))

    # The training features in this notebook are reloaded from CSV, which makes
    # their column labels strings ('0', '1', ...); use the same labels here.
    df_1.columns = df_1.columns.astype(str)
    df_2.columns = df_2.columns.astype(str)
    return df_1, df_2

# Train the model

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder per target column, so each can later map predictions back to names
label_encoder_category = LabelEncoder()
label_encoder_sub_category = LabelEncoder()

# Learn the set of labels in each column and convert the column to integer
# class ids in a single step
y_category = label_encoder_category.fit_transform(df['category'])
y_sub_category = label_encoder_sub_category.fit_transform(df['sub_category'])


In [None]:
# Stacked ensemble for the 'category' target
(
    estimators_labse_category_trained,
    estimators_mpnet_category_trained,
    nn_category_trained,
) = train_classifiers(
    estimators_1=estimators_labse_category,
    estimators_2=estimators_mpnet_category,
    df1=embeddings_labse,
    df2=embeddings_mpnet,
    y=y_category,
)

# Stacked ensemble for the 'sub_category' target (same features, different labels)
(
    estimators_labse_sub_category_trained,
    estimators_mpnet_sub_category_trained,
    nn_sub_category_trained,
) = train_classifiers(
    estimators_1=estimators_labse_sub_category,
    estimators_2=estimators_mpnet_sub_category,
    df1=embeddings_labse,
    df2=embeddings_mpnet,
    y=y_sub_category,
)


### Predict the sub_category for rows where it is missing

In [None]:
# Rows that have complaint text but no sub_category label.
X_test_sub_category = df_test[(df_test['sub_category'].isna()) & (df_test['crimeaditionalinfo'].notna())]

# Embed those rows with both models. The argument must be the frame selected
# above (`X_test_sub_category`); the previous spelling referred to a name that
# is never defined.
X_test_sub_category_labse, X_test_sub_category_mnpt = pre_process_dataframe_for_making_prediction(X_test_sub_category)

# Per-class scores from the stacked sub_category model.
y_prediction_labeled = make_prediction(
    estimators_labse_sub_category_trained,
    estimators_mpnet_sub_category_trained,
    nn_sub_category_trained,
    X_test_sub_category_labse,
    X_test_sub_category_mnpt,
)

# Highest-scoring class per row, mapped back to the sub_category names.
y_prediction = convert_number_to_category(label_encoder_sub_category, y_prediction_labeled)