Step 1: load dataset and extract keywords

In [81]:
import os
import mlflow
import requests
import json

In [15]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [55]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [56]:
# pre-processing: keywords extraction
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist

In [57]:
import math

In [58]:
import numpy as np

In [59]:
import pandas as pd

In [60]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [61]:
raw_dataset = pd.read_csv("gs://novhack_2023_training_data/X_train.csv", index_col=0)

# remove nan rows
raw_dataset.dropna(subset=['samples'], inplace=True)

In [62]:
def get_keyword(text, condi, number):
    # Tokenize the text
  tokens = word_tokenize(text)

  # Convert to lowercase
  tokens = [token.lower() for token in tokens]

  # Remove stopwords and punctuations
  stop_words = set(stopwords.words('english'))
  tokens = [token for token in tokens if token not in stop_words and token.isalpha()]

  # Lemmatize tokens
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Create a frequency distribution of the tokens
  freq_dist = FreqDist(tokens)

  # Calculate the total number of documents and the document frequency of each token
  num_docs = 1
  doc_freq = {}
  for token in set(tokens):
      doc_freq[token] = 1

  # Calculate the TF-IDF scores
  tfidf_scores = {}
  for token in set(tokens):
      tf = freq_dist[token] / len(tokens)
      idf = math.log(num_docs / doc_freq[token])
      tfidf_scores[token] = tf * idf
  
  # Select the top n keywords in proportion with the text length
  # for different conditions
  if condi == 'fixed':
    # if there are the text length is lower than the set words, just take the whole text
    n = min([number,len(text)])

  elif condi == 'prop':
    # use logarithm transformation to reduce # keywords differences
    n = round(len(math.log(text, 2)) * number)
    
  top_n = sorted(tfidf_scores, key=tfidf_scores.get, reverse=True)[:n]
  # convert the result into a string   
  # possible problem: discrete 
  string = ' '.join(top_n)
  return string

In [24]:
# add the extracted keywords as an additional column
keyword_lst = []
log_lst= []
for text in raw_dataset['samples'].tolist():
  try:
    keywords = get_keyword(text, 'fixed', 10)
    keyword_lst.append(keywords)
  except:
    log_lst.append(text)
raw_dataset['keywords'] = keyword_lst

In [64]:
# check first few lines
raw_dataset.head(5)

Unnamed: 0,samples,keywords
0,"After turning the ignition switch off, waiting...",proceeding see read rear airbag battery split ...
1,"With the vehicle stationary, the yaw rate sens...",previously value airbag read improperly proper...
2,Rack and pinion power steering gear assembly m...,value proceed ground power wait disconnect rec...
3,"Clear the DTCs (even if no DTCs are stored, pe...",value read trouble le power wait amount discon...
4,"After turning the ignition switch off, waiting...",value ready read operated proceed dc power act...


Step2: convert raw texts into word embeddings

In [65]:
import spacy

# Load the GloVe model
nlp = spacy.load('en_core_web_md')


In [66]:
# get sent embeddings
def glove_embedding(X_lst):
  vec_lst = []
  for sent in X_lst:

    # Get the sentence embedding by averaging the word embeddings
    doc = nlp(sent)
    embedding = doc.vector
    vec_lst.append(embedding)
  return np.array(vec_lst)

In [27]:
# convert keywords into glove embeddings
X_train = glove_embedding(keyword_lst)

In [70]:
X_train.shape

(11321, 300)

Step3: Employ different search strategies

In [71]:
from sklearn.metrics.pairwise import cosine_distances

In [29]:
from scipy.stats import entropy

In [74]:
def uncertainty_search(X_unlabeled,k,clf):
  probas = clf.predict_proba(X_unlabeled)

  # Calculate the uncertainty score for each sample
  uncertainty_scores = np.max(probas, axis=1)

  # Sort the samples by their uncertainty scores in descending order
  sorted_indices = np.argsort(uncertainty_scores)[::-1]

  # remove the previous labeled ones
  
  # Select the top k most uncertain samples for labeling
  selected_indices = sorted_indices[:k]

  return sorted_indices, selected_indices

In [75]:
def diversity_search(X_unlabeled,k,clf): 
  
  # Calculate the pairwise distances between each pair of samples
  distances = cosine_distances(X_unlabeled)

  # Calculate the diversity score for each sample
  diversity_scores = np.min(distances, axis=1)

  # Sort the samples by their diversity scores in descending order
  sorted_indices = np.argsort(diversity_scores)[::-1]

  # Select the top k most diverse samples for labeling
  selected_indices = sorted_indices[:k]

  return sorted_indices, selected_indices

In [76]:
# not use this for the time being
def committe_search(X_unlabeled,k,models): 
  probas_list = [model.predict_proba(X_unlabeled) for model in models]

  # Calculate the disagreement score for each sample
  disagreement_scores = np.var(probas_list, axis=0)

  # Calculate the mean disagreement score for each sample
  mean_disagreement_scores = np.mean(disagreement_scores, axis=1)

  # Sort the samples by their mean disagreement scores in descending order
  sorted_indices = np.argsort(mean_disagreement_scores)[::-1]

  # Select the top k most disagreed samples for labeling
  selected_indices = sorted_indices[:k]
  return sorted_indices, selected_indices

In [77]:
# Assuming that you have a pre-trained text classification model called `clf`
def entropy_search(X_unlabeled,k,clf): 
  # Calculate the predicted probabilities for each unlabeled sample in the dataset
  probas = clf.predict_proba(X_unlabeled)

  # Calculate the entropy of the predicted probabilities for each sample
  entropy_scores = entropy(probas.T)

  # Calculate the expected information gain for each sample
  expected_information_gain = entropy_scores - np.sum(probas * np.log2(probas), axis=1)

  # Sort the samples by their expected information gain in descending order
  sorted_indices = np.argsort(expected_information_gain)[::-1]

  # Select the top k samples with the highest expected information gain for labeling
  selected_indices = sorted_indices[:k]
  return sorted_indices, selected_indices

In [78]:
# majority voting of the selected indices
from collections import defaultdict

def get_number(my_dict):
  # Create a list of tuples containing the key-value pairs of the dictionary
  my_list = list(my_dict.items())

  # Define a function to extract the value of the second item in a tuple
  def get_value(item):
      return item[1]

  # Sort the list of tuples based on the values
  my_list.sort(key=get_value)

  # Extract the sorted keys from the sorted list of tuples
  sorted_keys = [item[0] for item in my_list]

  return sorted_keys

def rank_nested_lists(nested_lists):
    """
    Given a list of nested lists, output a ranked list of elements based on the number
    of overlapping sublists.

    Args:
    nested_lists (list): a list of nested lists

    Returns:
    list: a ranked list of elements based on the number of overlapping sublists
    """
    flattened = [item for sublist in nested_lists for item in sublist]  # flatten the nested lists
    element_count = {}  # dictionary to store the count of each element
    for element in flattened:
        count = 0
        for sublist in nested_lists:
            if element in sublist:
                count += 1
        element_count[element] = count
    ranked_lst = get_number(element_count)
    ranked_lst.reverse()
    return ranked_lst

def vote_index(X_train,n,clf_rf):
  uncer_sorted, uncer_instance = uncertainty_search(X_train, n ,clf_rf)
  entro_sorted, entro_instance = entropy_search(X_train,n,clf_rf)
  div_sorted, div_instance = diversity_search(X_train,n,clf_rf)
  # get the intersection of the highest indices
  ranked_lst = rank_nested_lists([uncer_sorted,entro_sorted,div_sorted])
  indice_lst = ranked_lst[:n]
  return indice_lst

In [83]:
# request labels from the platform

def get_label(index):
    index_str = str(index)
    parameters = {
    "team_id": "team-11", # example: team-1
    "context": "prod",  # 'dev' or 'prod' context for which you are requesting labels, only prod context gives you data for the challenge, dev is to test the labelling platform
    "data_id": index_str  # id of the datapoint for which you are requesting the label
    }
    
    headers = {
    "authorization": os.environ["JUPYTER_TOKEN"]
    }
    # Do the request
    resp = requests.get("http://labelling.novhack.euranova.eu/label", params=parameters, headers=headers)
    resp.status_code
    label = resp.text
    return label

In [98]:
def previous_label():
    parameters = {
        "team_id": "team-11",
    }
    headers = {
        "authorization": os.environ["JUPYTER_TOKEN"]
    }

    resp = requests.get("http://labelling.novhack.euranova.eu/previous", params=parameters, headers=headers)
    resp.status_code
    # Get the count
    labels = resp.text
    return labels

In [91]:
label = previous_label()
print(label)

{"5":"Power Source / Network"}


In [85]:
# k-means
from sklearn.ensemble import RandomForestClassifier

In [86]:
# Create a Random Forest Classifier with 100 trees
clf_rf = RandomForestClassifier(n_estimators=100)

In [87]:
from sklearn.cluster import KMeans

my_array = np.array(X_train)
kmeans = KMeans(n_clusters=11)

# fit the k-means object to the data
kmeans.fit(my_array)
centers = kmeans.cluster_centers_

closest_points = [np.argpartition(np.linalg.norm(my_array - center, axis=1), 1)[:1] for center in centers]

indice_lst = []

for i, indices in enumerate(closest_points):
    indice_lst.append(indices[0])



In [92]:
init_train = X_train[indice_lst]

In [93]:
indice_lst

[3243, 7761, 2178, 6975, 8794, 8218, 10519, 3126, 5624, 5445, 1843]

In [95]:
init_train.shape

(11, 300)

In [96]:
# request label
label_lst = []
for index in indice_lst:
    label = get_label(index)
    # update the dataset
    label_lst.append(label) 

In [97]:
#Kmeans not successful! 
label_lst

['"Vehicle Interior"',
 '"Engine / Hybrid System"',
 '"Drivetrain"',
 '"Vehicle Exterior"',
 '"Engine / Hybrid System"',
 '"Vehicle Interior"',
 '"Vehicle Interior"',
 '"Vehicle Interior"',
 '"Steering"',
 '"Engine / Hybrid System"',
 '"Power Source / Network"']

In [138]:
# update the known labels of the dataset
result = previous_label()

In [139]:
print(result)

{"5":"Power Source / Network","1843":"Power Source / Network","2178":"Drivetrain","3126":"Vehicle Interior","3243":"Vehicle Interior","5445":"Engine / Hybrid System","5624":"Steering","6975":"Vehicle Exterior","7761":"Engine / Hybrid System","8218":"Vehicle Interior","8794":"Engine / Hybrid System","10519":"Vehicle Interior"}


In [118]:
# clean the labels
requested_index_lst = []
requested_label_lst = []
pair_lst = result.split(',')
for i in pair_lst:
    index = int(i.split(':')[0].split('"')[1])
    label = i.split(':')[1].split('"')[1]
    requested_index_lst.append(index)
    requested_label_lst.append(label)

In [119]:
print(requested_index_lst)
print(requested_label_lst)

[5, 1843, 2178, 3126, 3243, 5445, 5624, 6975, 7761, 8218, 8794, 10519]
['Power Source / Network', 'Power Source / Network', 'Drivetrain', 'Vehicle Interior', 'Vehicle Interior', 'Engine / Hybrid System', 'Steering', 'Vehicle Exterior', 'Engine / Hybrid System', 'Vehicle Interior', 'Engine / Hybrid System', 'Vehicle Interior']


In [141]:
y_updated = requested_label_lst

In [115]:
'''
# map string label list with numbers
num_label_dict = {'Engine / Hybrid System': 0, 'Audio / Visual / Telematics': 1, 'Brake':2,
                       'Vehicle Interior': 3, 'Suspension': 4, 'Vehicle Exterior': 5, 'Drivetrain':6,
                       'Power Source / Network':7, 'Steering':8, 'General':9, 'ADAS / AD':10}

num_list = [num_label_dict[x] for x in requested_label_lst]
y_updated = np.array(num_list)
'''

In [120]:
X_updated = X_train[requested_index_lst]

In [122]:
import mlflow.sklearn

In [124]:
class RandomForest(mlflow.pyfunc.PythonModel):
    def __init__(self, n_estimators=100, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)

    def fit(self, X_updated, y_updated):
        with mlflow.start_run():
            self.rf.fit(X_updated, y_updated)
            mlflow.sklearn.log_model(self.rf, "random-forest-model")
            
    def predict(self, context, model_input):
        return self.choices(self.labels, k=len(model_input))

    def load_context(self, context):
        self.labels = ['Engine / Hybrid System', 'Audio / Visual / Telematics', 'Brake',
                       'Vehicle Interior', 'Suspension', 'Vehicle Exterior', 'Drivetrain',
                       'Power Source / Network', 'Steering', 'General', 'ADAS / AD']
        from random import choices 
        self.choices = choices
        pass

In [125]:
with mlflow.start_run(run_name="Test Custom model") as run:
    model = RandomForest()

    mlflow.log_param("param_1", 3)
    mlflow.log_metrics({"metric_1": 2, "metric_2": 2})

    # Log the sklearn model and register as version 1
    mlflow.pyfunc.log_model(
        artifact_path="custom-model",
        python_model=model,
        code_path = ["."],
        registered_model_name="custom-model"
    )

Registered model 'custom-model' already exists. Creating a new version of this model...
2023/03/12 02:25:38 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: custom-model, version 4
Created version '4' of model 'custom-model'.


In [126]:
client = mlflow.MlflowClient()
client.transition_model_version_stage(
    name="custom-model",
    version=1,
    stage="Production"
)

<ModelVersion: creation_timestamp=1678542118711, current_stage='Production', description='', last_updated_timestamp=1678587973471, name='custom-model', run_id='bcfdc5da3b004163a18a44dc441ff7a3', run_link='', source='gs://novhack2023_mlflow_bucket_team-11/0/bcfdc5da3b004163a18a44dc441ff7a3/artifacts/custom-model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [140]:
# Fit the model to the training data
clf_rf.fit(X_updated,y_updated)

ValueError: Found input variables with inconsistent numbers of samples: [11321, 2]

In [None]:
# submit the model

In [40]:
'''
# random initialization
import random

# Generate 20 random integers between 0 and 99
n = 20
rand_ints = random.sample(range(401,800), n)
X_init_train_temp = X_all[rand_ints]
X_init_train = glove_embedding(X_init_train_temp.tolist())

y_init_train = y_all[rand_ints]

# Fit the model to the training data
clf_rf.fit(X_init_train,y_init_train)
'''

In [128]:
from sklearn.metrics import accuracy_score

In [131]:
# request labels
def update(X_train,indice_lst):
    label_lst = []
    for index in indice_lst:
        label = get_label(index)
        # update the dataset
        label_lst.append(label) 
        
    # update current label repo
    result = previous_label()
    # clean the labels
    requested_index_lst = []
    requested_label_lst = []
    pair_lst = result.split(',')
    for i in pair_lst:
        index = int(i.split(':')[0].split('"')[1])
        label = i.split(':')[1].split('"')[1]
        requested_index_lst.append(index)
        requested_label_lst.append(label)
        
    X_updated = X_train[requested_index_lst]
    y_updated = requested_label_lst
    
    return X_updated, y_updated

In [None]:
X_updated

In [133]:
# backup code for training
num_queries = 2
n = 2
for i in range(num_queries):
    # ...where each iteration consists of labelling 20 samples
    # ensure the requested instances do not occur with the previously requested ones

    # change this into majority voting
    #sorted, q_indices = uncertainty_search(X_train,n,clf_rf)
    q_indices = vote_index(X_train,n,clf_rf)
    
    X_updated, y_updated = (X_train,q_indices)
    
    # retrain the labels for the current query to the active learner.
    clf_rf.fit(X_updated, y_updated)

  expected_information_gain = entropy_scores - np.sum(probas * np.log2(probas), axis=1)
  expected_information_gain = entropy_scores - np.sum(probas * np.log2(probas), axis=1)


ValueError: Found input variables with inconsistent numbers of samples: [11321, 2]

In [137]:
q_indices 

[5406, 7143]

In [136]:
X_updated.shape

(11321, 300)

In [46]:
'''
# backup code for training
num_queries = 5
n = 2
for i in range(num_queries):
    # ...where each iteration consists of labelling 20 samples
    # ensure the requested instances do not occur with the previously requested ones

    # change this into majority voting
    #sorted, q_indices = uncertainty_search(X_train,n,clf_rf)
    q_indices = vote_index(X_train,n,clf_rf)
    
    # request labels
    X_labeled_train = X_train[q_indices]
    y_labeled_train = y_train[q_indices]

    if i > 0:
      X_labeled_train_all = np.concatenate((X_labeled_train_all,X_labeled_train))
      y_labeled_train_all = np.concatenate((y_labeled_train_all,y_labeled_train))
    
    else:
      X_labeled_train_all = X_train[q_indices]
      y_labeled_train_all = y_train[q_indices]

    # retrain the labels for the current query to the active learner.
    clf_rf.fit(X_labeled_train_all, y_labeled_train_all)
'''

---------------
0.24
---------------
0.24
---------------
0.24
---------------
0.24
---------------
0.24
