In [None]:
!pip install tensorflow_text
!pip install pandas
!pip install --upgrade tensorflow-hub

In [None]:
# need to reload the custom layers so they can be passed to our model on load
# finetuning functions
from keras import backend as K

from keras.callbacks import ModelCheckpoint

def balanced_recall(y_true, y_pred):
    """Macro-averaged recall across the class columns of ``y_pred``.

    For every column the per-class recall TP / (TP + FN) is computed from
    clipped, rounded values; the per-class results are then averaged with
    equal weight.
    """
    n_classes = y_pred.shape[1]
    per_class_recall = []
    for col in range(n_classes):
        truth = y_true[:, col]
        guess = y_pred[:, col]
        # a hit is counted wherever the rounded product of truth and guess is 1
        hits = K.sum(K.round(K.clip(truth * guess, 0, 1)))
        actual_positives = K.sum(K.round(K.clip(truth, 0, 1)))
        # epsilon keeps the division defined when a class has no positives
        per_class_recall.append(hits / (actual_positives + K.epsilon()))
    return sum(per_class_recall) / n_classes

def balanced_precision(y_true, y_pred):
    """Macro-averaged precision across the class columns of ``y_pred``.

    For every column the per-class precision TP / (TP + FP) is computed from
    clipped, rounded values; the per-class results are then averaged with
    equal weight.
    """
    n_classes = y_pred.shape[1]
    per_class_precision = []
    for col in range(n_classes):
        truth = y_true[:, col]
        guess = y_pred[:, col]
        # a hit is counted wherever the rounded product of truth and guess is 1
        hits = K.sum(K.round(K.clip(truth * guess, 0, 1)))
        flagged_positive = K.sum(K.round(K.clip(guess, 0, 1)))
        # epsilon keeps the division defined when nothing was predicted positive
        per_class_precision.append(hits / (flagged_positive + K.epsilon()))
    # equal-weight mean of the per-class values
    return sum(per_class_precision) / n_classes

def balanced_f1_score(y_true, y_pred):
    """Harmonic mean of ``balanced_precision`` and ``balanced_recall``."""
    p = balanced_precision(y_true, y_pred)
    r = balanced_recall(y_true, y_pred)
    numerator = p * r
    # epsilon guards against 0 / 0 when both precision and recall are zero
    denominator = p + r + K.epsilon()
    return 2 * (numerator / denominator)

In [None]:
# imports and function/variable setting
from tensorflow import keras
import tensorflow_hub as hub
import tensorflow_text as text

import re
import pandas as pd
import numpy as np
import nltk
from string import punctuation

# Fetch the NLTK resources used below: the stopword corpus (read through
# stopwords.words) and the 'punkt' tokenizer data (presumably required by
# word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')

# Turn the punctuation string into a list of single characters so that
# multi-character tokens can be appended to it.
punctuation = list(punctuation)

# these characters are remnants of typos
# NOTE(review): NLTK's tokenizer also appears to emit `` and '' in place of
# double quotes, which may be the real source of these tokens - confirm.
punctuation.extend(["''", '``'])

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import FreqDist

# Display every column and the full content of each cell when showing DataFrames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

def clean_desc(desc):
    """Tokenize a description and drop stopwords and punctuation.

    Parameters
    ----------
    desc : object
        Raw description text. Any ``str`` instance (subclasses included) is
        processed; every other value is rejected.

    Returns
    -------
    list of str or None
        The lower-cased tokens produced by ``word_tokenize``, in their
        original order, without English stopwords and without entries of the
        module-level ``punctuation`` list. ``None`` when ``desc`` is not a
        string (for example a missing value).
    """
    if not isinstance(desc, str):
        return None

    # Build one set so each token costs a single hash lookup instead of a
    # linear scan over the stopword list and the punctuation list.
    excluded = set(stopwords.words('english'))
    excluded.update(punctuation)

    return [token for token in word_tokenize(desc.lower()) if token not in excluded]

# Category label -> class index. The labels are listed in index order, so
# list(class_keys)[k] is the label for class index k.
class_keys = {
    label: index
    for index, label in enumerate((
        '1_facilities_and_construction',
        '2_professional_services',
        '3_information_technology',
        '4_medical',
        '5_transportation_and_logistics',
        '6_industrial_products_and_services',
        '7_travel',
        '8_security_and_protection',
        '9_human_capital',
        '10_office_management',
        '11_defence',
    ))
}

# Read the rule-based contracts dataset: every column is loaded as a string
# and the first CSV column becomes the index.
og_data = pd.read_csv(
    '../input/datagovclassifier/rule_based_contracts_v1.csv',
    dtype='str',
    index_col=0,
)


# Restore the saved model. The hub layer and the custom metric functions are
# handed over through custom_objects so they can be passed to the model on load.
model = keras.models.load_model(
    "../input/datagovclassifier/BEST-n1-weights-improvement-22-0.92.hdf5",
    custom_objects={
        'KerasLayer': hub.KerasLayer,
        'balanced_recall': balanced_recall,
        'balanced_precision': balanced_precision,
        'balanced_f1_score': balanced_f1_score,
    },
)

# Keywords for Low Confidence Score

In [None]:
# Collect the distinct category labels, skipping values whose string form is
# 'nan' (missing) or is at most one character long.
unique_classifiers = []
for label in og_data['category'].unique():
    label_text = str(label)
    if label_text == 'nan' or len(label_text) <= 1:
        continue
    unique_classifiers.append(label)

unique_classifiers

In [None]:
# For every labelled category, collect its most frequent alphabetic,
# non-stopword tokens across all of that category's descriptions.
KEYWORDS_PER_CATEGORY = 30

cate_keywords = {}
for c in unique_classifiers:
    class_df = og_data.loc[og_data['category'] == c]

    # Drop missing descriptions before joining: str(NaN) is 'nan', which is
    # alphabetic and would otherwise be counted as a real word.
    class_ls = class_df['description_en'].dropna().tolist()

    class_text = ' '.join(str(w) for w in class_ls)

    # clean_desc lower-cases, tokenizes and strips stopwords/punctuation;
    # isalpha() then discards numbers and mixed tokens.
    class_tokens = [t for t in clean_desc(class_text) if t.isalpha()]

    dist = FreqDist(class_tokens)

    # most_common yields (word, count) pairs; only the words are kept
    word_freq = [word for word, _count in dist.most_common(KEYWORDS_PER_CATEGORY)]

    cate_keywords[c] = word_freq
    print(c)
    print(cate_keywords[c])

In [None]:
# Find keywords that occur in more than one category list so they can be
# removed afterwards. `distinct` ends up holding every keyword seen and
# `duplicate` those found in at least two lists.
distinct = set()
duplicate = set()

for keyword_list in cate_keywords.values():
    words = set(keyword_list)
    # anything already recorded from an earlier category is a duplicate
    duplicate |= words & distinct
    distinct |= words

print(distinct)
print(duplicate)

In [None]:
# Strip the shared keywords from each category list, printing the list before
# and after so the effect can be inspected.
for category in list(cate_keywords):
    original_words = cate_keywords[category]
    print(category)
    print(original_words)
    category_only = set(original_words) - duplicate
    print(category_only)
    print('------------------')
    cate_keywords[category] = list(category_only)

# Apply to Dataset

In [None]:
# Rows that have no category yet but do have an English description;
# copied so later edits do not touch og_data.
needs_category = og_data['category'].isna()
has_description = og_data['description_en'].notna()
unclassified = og_data.loc[needs_category & has_description].copy()
unclassified

In [None]:
# One single-element record per description: [description]
predictions = [[description] for description in unclassified['description_en']]

In [None]:
# Attach the model output to every record, giving the layout
#   [description, class probabilities, top class, runner-up class, confidence %]
class_names = list(class_keys)  # built once; label order follows the class indices

for i, pred in enumerate(model.predict(unclassified['description_en'])):
    # Ascending ranking of the class scores. Taking both positions from the
    # same ranking guarantees the runner-up differs from the top class even
    # when two scores tie.
    ranked = np.argsort(pred, axis=0)
    top_idx, runner_up_idx = ranked[-1], ranked[-2]

    # Slice assignment (not extend) so re-running this cell replaces the
    # earlier model output instead of appending a second copy.
    predictions[i][1:] = [pred,
                          class_names[top_idx],
                          class_names[runner_up_idx],
                          np.max(pred) * 100]

In [None]:
# Inspect the first record: [description, class probabilities, top class,
# runner-up class, confidence in percent]
predictions[0]

In [None]:
# Additional catch to compensate for low confidence scores: when the model's
# confidence is at or below the threshold, compare how many category keywords
# the description shares with the top class versus the runner-up class and
# switch to the runner-up if it matches more.
LOW_CONFIDENCE_PCT = 70

verified_predictions = []
for pred in predictions:
    top_class, alt_class, confidence = pred[2], pred[3], pred[4]

    if confidence <= LOW_CONFIDENCE_PCT:
        # clean_desc yields None for non-string input; treat that as no tokens
        tokens = set(clean_desc(pred[0]) or ())

        # .get with an empty default: a class without any labelled rows has
        # no keyword list, which should count as zero matches, not a KeyError.
        curr_matches = tokens & set(cate_keywords.get(top_class, ()))
        alt_matches = tokens & set(cate_keywords.get(alt_class, ()))

        # const stands for construction --> consistently misclassified as IT
        if 'const' in tokens:
            verified_predictions.append('1_facilities_and_construction')
        # ignoring defence due to wide scope
        elif len(curr_matches) < len(alt_matches) and alt_class != '11_defence':
            verified_predictions.append(alt_class)
        else:
            verified_predictions.append(top_class)
    else:
        verified_predictions.append(top_class)

In [None]:
# Write the verified labels back. verified_predictions holds one entry per
# record in `predictions`, which was built from unclassified['description_en']
# in row order, so the positions line up.
unclassified['category'] = verified_predictions

In [None]:
# Merge the newly labelled rows into a temporary frame rather than straight
# into og_data: the original author observed that combine_first rearranges
# the columns into alphabetical order.
# NOTE(review): combine_first is expected to keep og_data's non-null values and
# fill its nulls from `unclassified` - confirm against the pandas docs.
temp_df = og_data.combine_first(unclassified)
temp_df.head()

In [None]:
# Copy only the merged category column back, so og_data keeps its own column order.
og_data['category'] = temp_df['category']

# Spot-check a few of the rows that were just classified. Selecting by the
# index of `unclassified` replaces a hard-coded row position, which fails on
# a smaller dataset and is not guaranteed to be an auto-classified row.
og_data.loc[unclassified.index].head()

In [None]:
# Index labels of every row that was classified by the model. Read straight
# from the index instead of iterating rows, which builds a Series per row.
autocat_index = unclassified.index.tolist()

In [None]:
# Label the rows that were classified by the model.
# The default is the boolean False (not the string 'False', which is truthy),
# so the column holds one consistent type.
if 'auto_classified' in og_data.columns:
    # Cell re-run: insert() would raise because the column already exists.
    og_data['auto_classified'] = False
else:
    og_data.insert(7, 'auto_classified', False)
og_data.loc[autocat_index, 'auto_classified'] = True
    

In [None]:
# Save the dataset, including the model-assigned categories and the
# auto_classified flag, to a CSV file in the working directory.
og_data.to_csv("classified_contracts.csv")