In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import roc_curve                # Calculate the ROC curve
from sklearn.metrics import precision_recall_curve   # Calculate the Precision-Recall curve
from sklearn.metrics import f1_score                 # Calculate the F-score
import re
from collections import Counter
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
import os 
import time
from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPool1D, Dropout,Flatten,Input,concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
import tensorflow as tf
import kerastuner as kt
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
import shutil
import tensorflow.keras.backend as K
from tensorflow.keras.utils import plot_model
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


# Import Data

In [None]:




# --- Configuration ----------------------------------------------------------
# Minimum summed count a component column needs in order to be kept as a target.
THRESHOLD = 15
# Input CSV locations (blank placeholders: fill these in before running).
LABEL_DATA= r''
REG_DATA= r''
COST_DATA = r''

# --- Load -------------------------------------------------------------------
df = pd.read_csv(LABEL_DATA)
df_regr = pd.read_csv(REG_DATA)

# Drop rows without a description *before* shuffling and re-indexing, so the
# resulting index is a gap-free 0..n-1 range (later steps pair rows by position).
df = (
    df.dropna(subset=["Description"])
      .sample(frac=1)
      .reset_index(drop=True)
)

# Short descriptions are cleaned and tokenised later; both steps need strings,
# so missing values become empty strings instead of float NaN.
df_short_desc = df['Short_Description'].fillna('').astype(str)

# Convert NaN to 0 in the regression table and discard its own description
# column (the one from the label table is used instead).
df_regr = df_regr.fillna(0).drop(columns=['Description'])

# Left-merge on InvtID so the regression rows follow the (shuffled) row order
# of the label table.
df_regr = pd.merge(df[['InvtID','Description']],
                 df_regr,
                 on='InvtID',
                 how='left')

# Columns not needed for this task. The second fillna covers label rows that
# had no match in the regression table: the left merge gives them NaN, which
# would otherwise propagate into the scaled targets and the loss.
df_regr = df_regr.drop(columns=['ID','InvtID','Date', 'Code','Job_Number','Year']).fillna(0)
df = df.drop(columns=['InvtID'])




#

# NOTE: this is an alias of `df`, not a copy. Every in-place change below is
# therefore visible through `df` as well, and the scaling cell depends on that
# when it selects `df.columns[1:]`.
description_category = df
description_category['Description'] = description_category['Description'].astype(str)

# Per-column totals for every column after the first one, most frequent first.
freq_plot = pd.DataFrame({
    'cat': description_category.columns[1:],
    'count': description_category.iloc[:, 1:].sum().values,
})
freq_plot = freq_plot.sort_values(['count'], ascending=False).reset_index(drop=True)

print(freq_plot.shape)
# Components at or below the threshold are rarely used, so they are removed.
main_categories = freq_plot[freq_plot['count'] > THRESHOLD]
print(main_categories.shape)
# Names of the components above the threshold (in descending-count order).
categories = main_categories['cat'].values

# Every remaining column that did not make the cut.
not_category = [
    column for column in description_category.columns[1:]
    if column not in categories
]

# Deliberately in place so that `df` (the same object) loses the columns too.
description_category.drop(not_category, axis=1, inplace=True)







# Frequency table of the components that survived the threshold filter,
# sorted from most to least frequent.
most_common_cat = pd.DataFrame({
    'cat': description_category.columns[1:],
    'count': description_category.iloc[:, 1:].sum().values,
})
most_common_cat = most_common_cat.sort_values(['count'], ascending=False).reset_index(drop=True)
print(most_common_cat.tail(50))

plt.figure(figsize=(24,8))
sns.set(font_scale = 1.5)
sns.set_style('whitegrid')

# One colour per bar, picked by the rank of its count so that the colour
# gradient follows bar height rather than bar position.
pal = sns.color_palette("flare", len(most_common_cat))
rank = most_common_cat['count'].argsort().argsort()
reversed_palette = pal[::-1]
bar_colors = [reversed_palette[position] for position in rank]

# x/y must be passed by keyword: recent seaborn releases made them
# keyword-only, and the positional form no longer works there.
sns.barplot(x=most_common_cat['cat'], y=most_common_cat['count'], palette=bar_colors)
plt.axhline(THRESHOLD, ls='--', c='red')
plt.title("Component frequency", fontsize=24)
plt.ylabel('Number of jobs', fontsize=18)
plt.xlabel('components', fontsize=18)
plt.xticks(rotation='vertical',size = 5)

plt.show()



# Data manipulation and cleaning

In [None]:

# Scale regression values.
# Keep only the component columns that remain in `df` after the frequency
# filter (everything after its first column), in `df`'s column order.
target_columns = df.columns[1:]
df_regr = df_regr[target_columns]
print(df_regr.describe())

# Standardise every target column; the fitted scaler stays available under
# `standard_scaler`.
standard_scaler = preprocessing.StandardScaler()
x = df_regr.values  # plain numpy array of the targets
x_scaled = standard_scaler.fit_transform(x)
df_regr = pd.DataFrame(x_scaled, columns=df_regr.columns)

print(df_regr.describe())
print(df_regr.head(20))






# Characters that are deleted outright ('|' is listed explicitly: the previous
# pattern removed it as a side effect of using it as a separator).
_PUNC_REMOVE_RE = re.compile(r'[?|!\'"#_*{}`~@:]')
# Characters that separate words and are turned into a space. The backslash is
# written as `\\` so it is really matched; in the previous pattern the sequence
# `\|` escaped the pipe and backslashes slipped through untouched.
_PUNC_TO_SPACE_RE = re.compile(r'[.,)(\\/\n\t\r]')
_MULTI_SPACE_RE = re.compile(r' +')


def cleanPunc(sentence):
    """Strip punctuation and special characters from ``sentence``.

    The characters ``? | ! ' " # _ * { } ` ~ @ :`` are removed, while
    ``. , ) ( \\ /`` plus newline, tab and carriage return are replaced by a
    space. Runs of spaces are then collapsed to one and the result is trimmed.

    Parameters
    ----------
    sentence : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    cleaned = _PUNC_REMOVE_RE.sub('', sentence)
    cleaned = _PUNC_TO_SPACE_RE.sub(' ', cleaned)
    cleaned = _MULTI_SPACE_RE.sub(' ', cleaned)
    return cleaned.strip()



# Run the same punctuation clean-up over both text fields, element by element.
description_category['Description'] = description_category['Description'].map(cleanPunc)
df_short_desc = df_short_desc.map(cleanPunc)





# TF-IDF and Tokenization models

In [None]:

def tfidf_features(X_train, max_features=4995, ngram_range=(1, 3)):
    """Fit a word-level TF-IDF vectorizer on ``X_train`` and transform it.

    English stop words are removed and IDF weighting is enabled.

    Parameters
    ----------
    X_train : iterable of str
        Documents to fit the vectorizer on and to transform.
    max_features : int, optional
        Upper bound on the vocabulary size handed to the vectorizer.
        Defaults to 4995, the value that used to be hard-coded.
    ngram_range : tuple of (int, int), optional
        Smallest and largest n-gram length to extract. Defaults to (1, 3),
        the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(features, vocabulary, vectorizer)``: the dense array produced by
        ``.toarray()`` on the TF-IDF matrix, the fitted ``vocabulary_``
        mapping, and the fitted vectorizer itself.
    """
    tfidf_vectorizer = TfidfVectorizer(
        use_idf=True,
        ngram_range=ngram_range,
        analyzer='word',
        stop_words='english',
        max_features=max_features,
    )

    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    # Densified because the result is used as a plain array downstream.
    return X_train_tfidf.toarray(), tfidf_vectorizer.vocabulary_, tfidf_vectorizer



# Every short description is padded/truncated to this many token ids.
max_len_short_description = 1200

# Word-level tokenizer fitted on the short descriptions.
tokenizer = Tokenizer(num_words=2855, lower=True)
tokenizer.fit_on_texts(df_short_desc)
tokenized_short_description = tokenizer.texts_to_sequences(df_short_desc)

# Model inputs: `x` holds the raw long descriptions (vectorised with TF-IDF
# later), `x2` the padded token-id sequences of the short descriptions.
long_description = description_category['Description'].to_list()
x = long_description
x2 = pad_sequences(tokenized_short_description, maxlen=max_len_short_description)

seeds = [1, 42, 79, 121, 172]



In [None]:
#Splitting functions

In [None]:

# Working names used from here on: long descriptions, padded short-description
# sequences, and the scaled regression targets.
X_train, X2_train, y_train = x, x2, df_regr



def _rows_to_matrix(rows, width):
    """Flatten an iterable of array rows and reshape it to ``(-1, width)``."""
    flattened = np.concatenate([row.tolist() for row in rows], axis=0)
    return flattened.reshape(-1, width)


def format_and_reshape(X2_train,X2_test,max_len_short_description):
    """Turn two collections of sequences into 2-D arrays of a fixed width.

    Each element of ``X2_train`` / ``X2_test`` must offer ``.tolist()``
    (e.g. a numpy array). The elements are concatenated end to end and then
    reshaped so that each row has ``max_len_short_description`` entries.

    Returns
    -------
    tuple
        ``(X2_train, X2_test)`` as reshaped numpy arrays.
    """
    train_matrix = _rows_to_matrix(X2_train, max_len_short_description)
    test_matrix = _rows_to_matrix(X2_test, max_len_short_description)
    return train_matrix, test_matrix


# Replace the raw long descriptions with their dense TF-IDF representation.
tfidf_result = tfidf_features(X_train)
X_train, tfidf_vocab, tfidf_transformer = tfidf_result

# Column index -> term, i.e. the inverse of the fitted vocabulary mapping.
tfidf_reversed_vocab = dict(zip(tfidf_vocab.values(), tfidf_vocab.keys()))

# Sizes used when building the model.
max_words_long_desc = len(tfidf_vocab)
max_words_short_desc = len(tokenizer.word_index) + 1
num_classes = y_train.shape[1]




# Model

In [None]:
#determine parameters for regression





# Default training callbacks: learning-rate reduction on plateau, early
# stopping after 10 epochs without improvement, and a best-only checkpoint.
checkpoint_path = r'{}-model-simple.h5'.format("reg_opt")
lr_on_plateau = ReduceLROnPlateau()
early_stopping = EarlyStopping(patience=10)
best_checkpoint = ModelCheckpoint(filepath=checkpoint_path, save_best_only=True)

callbacks = [lr_on_plateau, early_stopping, best_checkpoint]


def custom_mse(class_weights):
    """Build a weighted loss function for Keras.

    Despite the name, the returned function takes the mean of the *absolute*
    weighted differences along the last axis, i.e. a weighted mean absolute
    error rather than a squared error.

    Parameters
    ----------
    class_weights
        Per-output weights, multiplied element-wise with the prediction error.

    Returns
    -------
    callable
        ``cost_loss(y_true, y_pred)``.
    """
    def cost_loss(y_true, y_pred):
        residual = y_pred - y_true
        weighted_residual = residual * class_weights
        return K.mean(K.abs(weighted_residual), axis=-1)

    return cost_loss



def custom_metric(class_weights):
    """Build a cost-weighted relative-error metric for Keras.

    The returned function weights the absolute prediction error by
    ``class_weights``, sums it along the last axis, and divides each sum by
    the total weighted absolute true value of the whole ``y_true`` tensor.
    A zero denominator yields 0 (``divide_no_nan``).

    The inner function is intentionally named ``cost_loss``: Keras reports a
    function metric under the function's name, and the rest of the notebook
    looks it up through ``target_metric = 'cost_loss'``.

    Parameters
    ----------
    class_weights
        Per-output weights, multiplied element-wise with errors and targets.

    Returns
    -------
    callable
        ``cost_loss(y_true, y_pred)``.
    """
    def cost_loss(y_true, y_pred):
        abs_error = K.abs(tf.math.subtract(y_pred, y_true))
        weighted_error = tf.math.multiply(abs_error, class_weights)

        # Scalar: total weighted true value over every element of y_true.
        # NOTE(review): the numerator below is summed per row while this
        # denominator is a total over the whole tensor, so averaging the
        # returned values scales with the number of rows; confirm intended.
        true_cost = K.sum(K.abs(tf.math.multiply(y_true, class_weights)))

        error_per_row = K.sum(weighted_error, axis=-1)
        return tf.math.divide_no_nan(error_per_row, true_cost)

    return cost_loss



def _scale_costs(costs, feature_range):
    """Min-max scale all cost values (as one column) into ``feature_range``.

    Returns the fitted scaler and the scaled ``(n, 1)`` array.
    """
    scaler = MinMaxScaler(feature_range=feature_range)
    scaled = scaler.fit_transform(costs.to_numpy().reshape(-1, 1))
    return scaler, scaled


# Read the cost table before anything uses it (it was previously referenced
# ahead of the read, which fails on a fresh kernel).
df_costs = pd.read_csv(COST_DATA)

# Order the costs like the regression targets. The loss and the metric multiply
# weight j with output column j, so the weights must follow `y_train`'s column
# order; `categories` holds the same names but sorted by frequency.
# assumes COST_DATA holds a single row of per-component costs; TODO confirm
df_costs = df_costs[list(y_train.columns)]

# Weights for the loss: costs squeezed into the range [0, 10].
cost_scaler, cost_arr = _scale_costs(df_costs, (0, 10))
print(cost_arr.shape)

list1 = cost_arr.flatten().tolist()
print(list1)
class_weights_1 = K.variable(list1)

# Weights for the metric: costs scaled into the [min, max] range of the first
# cost row.
first_cost_row = df_costs.values.tolist()[0]
minimum = min(first_cost_row)
maximum = max(first_cost_row)
cost_scaler, cost_arr = _scale_costs(df_costs, (minimum, maximum))
print(cost_arr.shape)

list1 = cost_arr.flatten().tolist()
print(list1)
class_weights_2 = K.variable(list1)




# Key under which the custom metric shows up in Keras histories; it matches the
# name of the inner function returned by custom_metric().
target_metric = 'cost_loss'
def build_model_regression_only(hp):
    """Build and compile the two-input regression model for KerasTuner.

    The model has one Dense branch per input ('short_desc' and 'long_desc'),
    concatenates them, applies a tunable stack of regularised Dense + Dropout
    layers and ends in a linear layer with ``num_classes`` outputs. It is
    compiled with Adam, ``custom_mse(class_weights_1)`` as loss and
    ``custom_metric(class_weights_2)`` as metric.

    Parameters
    ----------
    hp
        KerasTuner hyperparameter container used to declare the search space.

    Returns
    -------
    tensorflow.keras.Model
        The compiled model.
    """
    # --- Search space (names are what best_hps.get(...) looks up) ---
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-1,1e-2,1e-3, 1e-4,1e-5,1e-6])
    hp_l1 = hp.Choice('l1', values=[1e-3, 1e-4, 1e-5])
    hp_l2 = hp.Choice('l2', values=[1e-3, 1e-4, 1e-5])
    dropout = hp.Choice('dr', values=[0.1,0.2])
    dropout_2 = hp.Choice('dr_2', values=[0.0,0.1,0.2])
    number_of_layers = hp.Int('num_layers',min_value= 1, max_value=10)
    hp_units = hp.Int('units', min_value=1024, max_value=4096, step=64)
    hp_units_2 = hp.Int('units_2', min_value=184, max_value=512, step=64)

    # --- Inputs ---
    short_desc_input = Input(shape=(max_len_short_description,), name='short_desc')
    long_desc_input = Input(shape=(max_words_long_desc,), name='long_desc')

    # --- Long-description branch: three Dense layers without activation ---
    long_width = int(max_words_long_desc)
    long_branch = Dense(long_width,
                        input_shape=(max_words_long_desc,),
                        kernel_initializer='he_normal')(long_desc_input)
    for _ in range(2):
        long_branch = Dense(long_width, kernel_initializer='he_normal')(long_branch)

    # --- Short-description branch: three ReLU Dense layers ---
    # NOTE(review): this input is fed straight into Dense layers, with no
    # Embedding layer in between; confirm that is intended.
    short_branch = short_desc_input
    for _ in range(3):
        short_branch = Dense(hp_units_2,
                             activation='relu',
                             kernel_initializer='he_normal')(short_branch)

    # --- Shared trunk ---
    trunk = concatenate([short_branch, long_branch])
    for layer_index in range(number_of_layers):
        trunk = Dense(hp_units,
                      activation='relu',
                      kernel_initializer='he_normal',
                      kernel_regularizer=tf.keras.regularizers.L1L2(l1=hp_l1, l2=hp_l2),
                      name="Dense_{}".format(layer_index + 100))(trunk)
        trunk = Dropout(dropout_2)(trunk)
    trunk = Dropout(dropout)(trunk)

    outputs = Dense(num_classes, activation='linear', name="Dense_linear_{}".format(300))(trunk)
    model = Model(inputs=[short_desc_input, long_desc_input], outputs=outputs)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
        loss=custom_mse(class_weights_1),
        metrics=[custom_metric(class_weights_2)],
    )
    return model






# The search holds out 20% of the data (validation_split below), so trials are
# ranked and stopped early on the validation copy of the metric. Ranking on the
# training metric would favour configurations that merely fit the training set.
tuning_objective = 'val_' + target_metric

# NOTE: KerasTuner reuses an existing my_dir_3/intro_to_kt_3 directory; remove
# it (or pass overwrite=True) to start a fresh search with this objective.
tuner_reg = kt.Hyperband(build_model_regression_only,
                     objective=kt.Objective(tuning_objective, direction="min"),
                     max_epochs=10,
                     directory='my_dir_3',
                     project_name='intro_to_kt_3')

stop_early = tf.keras.callbacks.EarlyStopping(monitor=tuning_objective, patience=5)
# Inputs are passed in the order the model declares them: short-description
# sequences first, TF-IDF features second.
tuner_reg.search([X2_train,X_train], y_train,
             epochs=50,
             validation_split=0.2,
             callbacks=[stop_early]
                )


best_hps=tuner_reg.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')} 
l1 {best_hps.get('l1')} 
l2 {best_hps.get('l2')} 
dr {best_hps.get('dr')}
dr_2 {best_hps.get('dr_2')}
units_2 {best_hps.get('units_2')}
num_layers {best_hps.get('num_layers')}

.
""")




In [None]:
def Average(lst):
    """Return the arithmetic mean of the values in ``lst``."""
    total = sum(lst)
    count = len(lst)
    return total / count








# CV evaluation: for every fold, (1) train once for 50 epochs to find the epoch
# with the lowest training metric, (2) retrain a fresh model for that many
# epochs, (3) score it on the held-out fold.
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True)

print(len(X_train))
y_all = y_train.to_numpy()

fold_no = 1
average_score_reg_only=[]
# KFold only needs the number of samples to generate the index splits.
for train, test in kfold.split(X_train):
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    X2_train_cv = X2_train[train]
    X_train_cv = X_train[train]
    Y_train_cv = y_all[train]

    # Pass 1: locate the best epoch on the training metric (no validation
    # data is given to fit, so only training values are recorded).
    model_reg_only = tuner_reg.hypermodel.build(best_hps)
    plot_model(model_reg_only, to_file='model.png')
    history_reg_only = model_reg_only.fit([X2_train_cv,X_train_cv],Y_train_cv, epochs=50,verbose=0)
    metric_per_epoch = history_reg_only.history[target_metric]
    best_epoch = metric_per_epoch.index(min(metric_per_epoch)) + 1
    print('Best epoch: %d' % (best_epoch,))

    # Pass 2: retrain from scratch for exactly `best_epoch` epochs.
    callbacks = [
    ReduceLROnPlateau(monitor=target_metric),
    EarlyStopping(patience=10,monitor=target_metric),
    ModelCheckpoint(filepath=r'{}-model-simple.h5'.format("sig_opt_final"),monitor=target_metric, save_best_only=True)
    ]
    hypermodel = tuner_reg.hypermodel.build(best_hps)
    history = hypermodel.fit([X2_train_cv,X_train_cv], Y_train_cv, epochs=best_epoch, callbacks=callbacks,verbose=0)

    # Generate generalization metrics on the held-out fold
    scores = hypermodel.evaluate([X2_train[test],X_train[test]], y_all[test], verbose=0)
    print(f'Score for fold {fold_no}: {model_reg_only.metrics_names[0]} of {scores[0]}; {model_reg_only.metrics_names[1]} of {scores[1]*100}%')

    # Increase fold number
    fold_no = fold_no + 1
    average_score_reg_only.append(scores[1]*100)

    # One figure per fold. The figure is created *before* plotting so the
    # curve lands on it and no empty figure is left over after the last fold.
    acc = history.history[target_metric]
    epochs = range(len(acc))

    plt.figure()
    plt.plot(epochs, acc, 'b', label='Training MAE')
    plt.title('Training WMAE')


print(Average(average_score_reg_only))