In [1]:
## GLOBAL OPR NLP MODEL ##

In [2]:
%reset -f

import pandas as pd, numpy as np, re
import collections # for the Counter function
import sklearn.metrics as skm

randomseed = 1 # the value for the random state used at various points in the pipeline
pd.options.display.max_rows = 10 # specify if you want the full output in cells rather the truncated list
pd.options.display.max_columns = 200

# to display multiple outputs in a cell without usin print/display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# ignore warnings (only if you are the kind that would code when the world is burning)
import warnings
warnings.filterwarnings('ignore')

# display wd files
import os as os
print('folder files: ', os.listdir('../input/'), '\n')
print('envir variables: ')
%who

folder files:  ['BP', 'career_velocity', 'COMP', 'Engagement', 'MOVEMENT', 'Navigate', 'OPR', 'org_chart_feats', 'PDI', 'Rule_based_module_output', 'SCRIPTS', 'TARGET'] 

envir variables: 



In [3]:
# HELPER FUNCTIONS CLASS #

# global function to flatten columns after a grouped operation and aggregation
# outside all classes since it is added as an attribute to pandas DataFrames
def __my_flatten_cols(self, how="_".join, reset_index=True):
    how = (lambda iter: list(iter)[-1]) if how == "last" else how
    self.columns = [how(filter(None, map(str, levels))) for levels in self.columns.values] \
    if isinstance(self.columns, pd.MultiIndex) else self.columns
    return self.reset_index(drop=True) if reset_index else self
pd.DataFrame.my_flatten_cols = __my_flatten_cols

class helper_funcs():

    def __init__(self):
        """ list down the various functions defined here """
    
    def csv_read(self, file_path, cols_to_keep=None, dtype=None, drop_dup=None):
        self.cols_to_keep = cols_to_keep
        if dtype is None:
            x=pd.read_csv(file_path, na_values=['No Data', ' ', 'UNKNOWN', '', 'Not Rated', 'Not Applicable'], encoding='latin-1', low_memory=False)
        else:
            x=pd.read_csv(file_path, na_values=['No Data', ' ', 'UNKNOWN', '', 'Not Rated', 'Not Applicable'], encoding='latin-1', low_memory=False, dtype=dtype)
        chars_to_remove = [' ', '.', '(', ')', '__', '-', '/', '\'', ':']
        for i in chars_to_remove:
            x.columns = x.columns.str.strip().str.lower().str.replace(i, '_')
        if cols_to_keep is not None: x = x[cols_to_keep]
        if drop_dup is not None: x.drop_duplicates(inplace=True)
        print(x.shape)
        return x
    
    def txt_read(self, file_path, cols_to_keep=None, sep='|', skiprows=1, dtype=None, drop_dup=None):
        # currently only supports salary files with the default values (need to implement dynamic programming for any generic txt)
        self.cols_to_keep = cols_to_keep
        if dtype is None:
            x=pd.read_table(file_path, sep=sep, skiprows=skiprows, na_values=['No Data', ' ', 'UNKNOWN', '', 'Not Rated', 'Not Applicable'])
        else:
            x=pd.read_table(file_path, sep=sep, skiprows=skiprows, na_values=['No Data', ' ', 'UNKNOWN', '', 'Not Rated', 'Not Applicable'], dtype=dtype)
        chars_to_remove = [' ', '.', '(', ')', '__', '-', '/', '\'', ':']
        for i in chars_to_remove:
            x.columns = x.columns.str.strip().str.lower().str.replace(i, '_')
        if cols_to_keep is not None: x = x[cols_to_keep]
        if drop_dup is not None: x.drop_duplicates(inplace=True)
        print(x.shape)
        return x

    def xlsx_read(self, file_path, cols_to_keep=None, sheet_name=0, dtype=None, drop_dup=None):
        self.cols_to_keep = cols_to_keep
        if dtype is None:
          x=pd.read_excel(file_path, na_values=['No Data', ' ', 'UNKNOWN', '', 'Not Rated', 'Not Applicable'], sheet_name=sheet_name)
        else:
          x=pd.read_excel(file_path, na_values=['No Data', ' ', 'UNKNOWN', '', 'Not Rated', 'Not Applicable'], sheet_name=sheet_name, dtype=dtype)
        chars_to_remove = [' ', '.', '(', ')', '__', '-', '/', '\'', ':']
        for i in chars_to_remove:
            x.columns = x.columns.str.strip().str.lower().str.replace(i, '_')
        if cols_to_keep is not None: x = x[cols_to_keep]
        if drop_dup is not None: x.drop_duplicates(inplace=True)
        print(x.shape)
        return x
    
    def process_columns(self, df, cols=None):
        if cols is None:
            df = df.apply(lambda x: x.str.lower() if (x.dtype == 'object') else x)
            df = df.apply(lambda x: x.str.strip() if (x.dtype == 'object') else x)
            df = df.apply(lambda x: x.str.replace('\s+|\s', '_', regex=True) if (x.dtype == 'object') else x)
            df = df.apply(lambda x: x.str.replace('[^\w+\s+]', '_', regex=True) if (x.dtype == 'object') else x)
            df = df.apply(lambda x: x.str.replace('\_+', '_', regex=True) if (x.dtype == 'object') else x)
        else:
            df = df.apply(lambda x: x.str.lower() if x.name in cols else x)
            df = df.apply(lambda x: x.str.strip() if x.name in cols else x)
            df = df.apply(lambda x: x.str.replace('\s+|\s', '_', regex=True) if x.name in cols else x)
            df = df.apply(lambda x: x.str.replace('[^\w+\s+]', '_', regex=True) if x.name in cols else x)
            df = df.apply(lambda x: x.str.replace('\_+', '_', regex=True) if x.name in cols else x)
        return df
  
    def nlp_process_columns(self, df, nlp_cols):
        df = df.apply(lambda x: x.str.replace('_', ' ') if x.name in nlp_cols else x)
        df = df.apply(lambda x: x.str.replace('\s+', ' ', regex=True) if x.name in nlp_cols else x)
        df = df.apply(lambda x: x.str.replace('crft', 'craft') if x.name in nlp_cols else x)
        return df
    
    def retrieve_name(var):
        """
        Gets the name of var. Does it from the out most frame inner-wards.
        :param var: variable to get name from.
        :return: string
        """
        for fi in reversed(inspect.stack()):
            names = [var_name for var_name, var_val in fi.frame.f_locals.items() if var_val is var]
            if len(names) > 0:
                return names[0]

helpers = helper_funcs()

In [4]:
# ## read in the BP files

# bp_nlp_cols = ['company_name', 'contract_text', 'cost_center_description', 'functional_area_name', 'global_job_description', 
#                'inbev_description', 'inbev_entity_l2_desc', 'inbev_entity_l3_desc', 'inbev_entity_l4_desc', 'global_id',
#                'job_family_description', 'macro_entity_desc', 'macro_entity_l2_desc', 'macro_entity_l3_desc', 
#                'macro_entity_l4_desc', 'macro_entity_l5_desc', 'macro_entity_l6_desc', 'org_unit_description', 'year',
#                'pay_scale_area_text', 'pers_subarea_text', 'personnel_area_text', 'position_type_text', 'position_title', 
#                'position_title', 'direct_manager_position_desc', 'org_unit_description', 'global_job_description',
#               'payroll_area_text', 'local_entity_description', 'local_entity_l1_desc', 'local_entity_l2_desc', 
#                'local_entity_l3_desc', 'local_entity_4_desc', 'local_entity_l5_desc', 'local_entity_l6_desc']

# import pickle
# # load backup
# bp_files = open('../working/bp_backup.pkl', 'rb')
# bp_2016 = pickle.load(bp_files)
# bp_2017 = pickle.load(bp_files)
# bp_2018 = pickle.load(bp_files)
# bp_files.close()

# bp_2016 = bp_2016[bp_2016['employment_status']=='Active']
# bp_2017 = bp_2017[bp_2017['employment_status']=='Active']
# bp_2018 = bp_2018[bp_2018['employment_status']=='Active']

# # # merge the files and do some pre-processing on the columns
# bp_full = bp_2016.append(bp_2017, ignore_index=True)
# bp_full = bp_full.append(bp_2018, ignore_index=True)

# bp_full.drop_duplicates(subset=['global_id', 'year'], keep='last', inplace=True)
# bp_full = bp_full[bp_nlp_cols]

In [5]:
# # OPR

# # global dictionaries and lists
# dep_dict = {'4A': 5, '4B': 4, '3A': 3, '3B': 2, '1A': 1, '1B': 0}

# ## read the input opr files
# opr_2015 = helpers.csv_read(file_path='../input/OPR/global_opr_2015.csv', drop_dup='yes')
# opr_2016 = helpers.csv_read(file_path='../input/OPR/global_opr_2016.csv', drop_dup='yes')
# opr_2017 = helpers.csv_read(file_path='../input/OPR/global_opr_2017.csv', drop_dup='yes')
# opr_2018 = helpers.csv_read(file_path='../input/OPR/global_opr_2018.csv', drop_dup='yes')
# #opr_2018.head(2)

# ## pre-processing the input files and appending them. not dynamic since cadence/structure can change
# required_cols = ['employee_global_id', 'year', 'opr_rating_scale']
# opr_2015 = opr_2015[required_cols]
# opr_2016 = opr_2016[required_cols]
# opr_2017 = opr_2017[required_cols]
# opr_2018 = opr_2018[required_cols]

# ## create the full set
# opr_full = opr_2015.append(opr_2016, ignore_index=True)
# opr_full = opr_full.append(opr_2017, ignore_index=True)
# opr_full = opr_full.append(opr_2018, ignore_index=True)
# opr_full.columns = ['global_id', 'year', 'opr']
# opr_full.drop_duplicates(inplace=True, subset=['global_id', 'year'])
# opr_full.dropna(how='any', inplace=True)
# opr_full = opr_full[opr_full['year'] > 2014]
# opr_full.reset_index(inplace=True, drop=True)
# opr_full = opr_full[opr_full['opr']!='2']
# opr_full['opr'] = opr_full['opr'].map(dep_dict)
# opr_full.to_csv('../working/opr_full.csv', index=False)

# ## reshaping and creating the pivot version
# opr_reshaped = opr_full.pivot(index='global_id', columns='year', values=['opr']).reset_index().my_flatten_cols()
# opr_reshaped.columns.name = None
# #opr_reshaped.head(5)
# opr_reshaped[['opr_2015', 'opr_2016', 'opr_2017', 'opr_2018']] = opr_reshaped[['opr_2015', 'opr_2016', 'opr_2017', 'opr_2018']].apply(pd.to_numeric, errors='coerce')
# opr_reshaped = helpers.process_columns(df=opr_reshaped)

# ## split into train and valid
# opr_train = opr_reshaped.filter(regex='id|15|16|17')
# opr_train['year'] = 2017
# opr_train.columns = ['global_id', 'opr_prev_prev', 'opr_prev', 'response', 'year']
# opr_valid = opr_reshaped.filter(regex='id|16|17|18')
# opr_valid['year'] = 2018
# opr_valid.columns = ['global_id', 'opr_prev_prev', 'opr_prev', 'response', 'year']
# opr_train.dropna(subset=['global_id', 'response'], inplace=True)
# opr_valid.dropna(subset=['global_id', 'response'], inplace=True)
# opr_train.reset_index(drop=True, inplace=True)
# opr_valid.reset_index(drop=True, inplace=True)
# 'opr train shape is: ', opr_train.shape
# 'opr valid shape is: ', opr_valid.shape
# opr_train.head(2)

In [6]:
# nlp_opr_train = opr_train[['global_id', 'response', 'year']]
# nlp_opr_valid = opr_valid[['global_id', 'response', 'year']]
# nlp_opr_train.shape, nlp_opr_valid.shape

# nlp_opr_train = pd.merge(nlp_opr_train.reset_index(drop=True), bp_full, on=['global_id', 'year'], how='left')
# nlp_opr_valid = pd.merge(nlp_opr_valid.reset_index(drop=True), bp_full, on=['global_id', 'year'], how='left')
# nlp_opr_train.shape, nlp_opr_valid.shape

In [7]:
# # PDP data

# # load backup
# pdpbackup = open('../working/pdpbackup.pkl', 'rb')
# pdp16 = pickle.load(pdpbackup)
# pdp17 = pickle.load(pdpbackup)
# pdp18 = pickle.load(pdpbackup)
# pdpbackup.close()

# pdp17 = pdp17[['employee_global_id', 'organization_&_hierarchy', 'action', 'development_objective_title',
#               'employee_progress_comments', 'manager_progress_comments']]
# pdp18 = pdp18[['employee_global_id', 'organization_&_hierarchy', 'action', 'development_objective_title',
#               'employee_progress_comments', 'manager_progress_comments']]

In [8]:
# def combine_Columns_Into_New_Column(DF, columns_To_Combine, new_Column_Name):
#     DF[new_Column_Name] = ''
#     for Col in columns_To_Combine:
#         DF[new_Column_Name] += DF[Col].map(str) + ' '
#     DF = DF.drop(columns_To_Combine, axis=1)
#     DF = DF.groupby(by=['employee_global_id']).sum()
#     DF.reset_index(drop=False, inplace=True)
#     return DF

# def pdp_process(df):
#     dfagg = df[['employee_global_id', 'organization_&_hierarchy']].copy()
#     dfagg.drop_duplicates(inplace=True)
#     dfagg.fillna(value='', inplace=True)

#     dfagg2 = df[['employee_global_id', 'action', 'development_objective_title', 'employee_progress_comments', 'manager_progress_comments']].copy()
#     dfagg2.fillna(value='', inplace=True)
#     dfagg2 = combine_Columns_Into_New_Column(dfagg2, ['action', 'development_objective_title', 'employee_progress_comments', 'manager_progress_comments'], 
#                                                  'all_strings')
#     df = dfagg.merge(dfagg2, on='employee_global_id', how='outer')
#     df.columns = ['global_id', 'orghierarchy', 'allstrings']
#     return df

In [9]:
# pdp17full = pdp_process(pdp17)
# pdp18full = pdp_process(pdp18)

In [10]:
# nlp_opr_train = nlp_opr_train.merge(pdp17full, on='global_id', how='left')
# nlp_opr_valid = nlp_opr_valid.merge(pdp18full, on='global_id', how='left')

# nlp_opr_train_ids = nlp_opr_train.global_id
# nlp_opr_valid_ids = nlp_opr_valid.global_id

# nlp_opr_train.drop(columns=['global_id', 'year'], inplace=True)
# nlp_opr_valid.drop(columns=['global_id', 'year'], inplace=True)

# text_cols = nlp_opr_train.columns
# text_cols = text_cols.delete(0)

In [11]:
# # input/ouput files

# nlp_opr_train['string_all'] = nlp_opr_train[text_cols].apply(lambda x: ' '.join(x.dropna()), axis=1)
# nlp_opr_valid['string_all'] = nlp_opr_valid[text_cols].apply(lambda x: ' '.join(x.dropna()), axis=1)
# y_train = nlp_opr_train[['response']]
# y_test = nlp_opr_valid[['response']]
# y_train = y_train.astype(int)
# y_test = y_test.astype(int)
# train_text = nlp_opr_train[['string_all']]
# test_text = nlp_opr_valid[['string_all']]
# train_text = train_text.replace('\n','', regex=True)
# test_text = test_text.replace('\n','', regex=True)
# train_text = helpers.nlp_process_columns(train_text, nlp_cols=['string_all'])
# test_text = helpers.nlp_process_columns(test_text, nlp_cols=['string_all'])

# train_final = pd.concat([y_train.reset_index(drop=True), train_text.reset_index(drop=True)], axis=1)
# test_final = pd.concat([y_test.reset_index(drop=True), test_text.reset_index(drop=True)], axis=1)

In [12]:
# # save backup
# # nlp_pickle = open('../working/nlppickle.pkl','wb')
# # pickle.dump(train_final, nlp_pickle)
# # pickle.dump(test_final, nlp_pickle)
# # nlp_pickle.close()

# # # load backup
# nlp_pickle = open('../working/nlppickle.pkl', 'rb')
# train_final = pickle.load(nlp_pickle)
# test_final = pickle.load(nlp_pickle)
# nlp_pickle.close()

In [13]:
# import nltk
# #nltk.download('stopwords')

# def clean_text(string):
#     string = re.sub(r'['+string.punctuation+']', ' ', string)
#     string = re.sub(r'[0-9]+', ' ', string)
#     string = re.sub('\s+', ' ', string).strip()
#     string = string.lower()
    
# from nltk.corpus import stopwords
# stop = stopwords.words('english')

# train_final['stopwords'] = train_final['string_all'].apply(lambda x: len([x for x in x.split() if x in stop]))
# train_final[['string_all','stopwords']].head()

In [36]:
## comp comments

train = helpers.csv_read('../input/Navigate/comp_appraisal_comments/train_filtered.csv')
valid = helpers.csv_read('../input/Navigate/comp_appraisal_comments/valid_filtered.csv')

ytrain = np.array(train.response)
yvalid = np.array(valid.response)

train.head(2)

(3549, 3)
(3908, 3)


Unnamed: 0,global_id,comments,response
0,1001929,communication across the departments to elimin...,0
1,1002651,hoj great owner of the ABI culture hoj high le...,2


In [37]:
train['string'] = '__label__' + train['response'].astype('str') + ' ' + train['comments']
valid['string'] = '__label__' + valid['response'].astype('str') + ' ' + valid['comments']

train.drop(['global_id', 'response', 'comments'], inplace=True, axis=1)
valid.drop(['global_id', 'response', 'comments'], inplace=True, axis=1)

In [38]:
train.to_csv('train.txt', encoding='utf-8', index=False, header=False, sep='\t')
valid.to_csv('test.txt', encoding='utf-8', index=False, header=False, sep='\t')

In [39]:
train.head(2)

Unnamed: 0,string
0,__label__0 communication across the department...
1,__label__2 hoj great owner of the ABI culture ...


In [24]:
from fastText import train_supervised

def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

In [123]:
model = train_supervised(input = 'train.txt', epoch=100, lr=1, wordNgrams=5, 
                         verbose=10, minCount=10, loss='softmax', minn=5, dim=50, ws=10)

In [124]:
print_results(*model.test('test.txt', k=6))

N	3908
P@1	0.167
R@1	1.000


In [125]:
test_len = valid.shape[0]
predict_list = {}
for i in range(test_len):
    x=model.predict(valid.iloc[i][0], k=6)
    y=pd.DataFrame(list(x))
    y.columns=y.iloc[0]
    y=y.reindex(y.index.drop(0))
    predict_list[i] = y

In [126]:
pred = pd.concat(predict_list.values(), ignore_index=True)

In [127]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [128]:
pred = pred.apply(pd.to_numeric) # convert all columns of DataFrame
#minmax = StandardScaler()
#pred=pd.DataFrame(minmax.fit_transform(pred))
pred['label'] = pred.idxmax(axis=1)
pred = pred.replace('\_\_label\_\_','', regex=True)
pred['label'] = pred['label'].astype(int)
pred.head(10)

Unnamed: 0,__label__0,__label__1,__label__2,__label__3,__label__4,__label__5,label
0,0.000601,0.094422,0.803385,0.087291,0.013991,0.00037,2
1,0.002893,0.035064,0.827874,0.100877,0.030609,0.002743,2
2,2.9e-05,0.013256,0.221778,0.763705,0.00069,0.000602,3
3,0.000418,0.076031,0.92219,0.001369,3.8e-05,1.4e-05,2
4,0.015996,0.084058,0.51928,0.193349,0.169685,0.017692,2
5,0.00354,0.297013,0.677778,0.019855,0.001496,0.000377,2
6,0.00252,0.027596,0.401129,0.461924,0.095287,0.011604,3
7,0.000453,0.051684,0.943123,0.004557,0.000216,2.8e-05,2
8,8.4e-05,0.003114,0.991873,0.004719,0.000257,1.2e-05,2
9,2.8e-05,0.013631,0.957588,0.028489,0.000309,1.5e-05,2


In [129]:
yvalid[0:10]

array([1, 2, 2, 2, 0, 1, 2, 2, 2, 1], dtype=int64)

In [132]:
pred['actuals'] = yvalid
pred.to_csv('nlppred.csv')

In [130]:
skm.accuracy_score(y_pred=pred.label, y_true=yvalid)

0.44779938587512796

In [131]:
skm.confusion_matrix(y_pred=pred.label, y_true=yvalid)

array([[   0,    0,  101,   26,    4,    0],
       [   0,    0,  187,   48,    0,    0],
       [   1,    5, 1324,  366,   26,    1],
       [   0,    5,  671,  410,   34,    1],
       [   0,    1,  287,  226,   16,    0],
       [   0,    2,   86,   76,    4,    0]], dtype=int64)

In [133]:
pred.head(2)

Unnamed: 0,__label__0,__label__1,__label__2,__label__3,__label__4,__label__5,label,actuals
0,0.000601,0.094422,0.803385,0.087291,0.013991,0.00037,2,1
1,0.002893,0.035064,0.827874,0.100877,0.030609,0.002743,2,2


In [147]:
probs = pred.iloc[:, 0:6]
predictions = pred.label
yvalid = pred.actuals

In [156]:
## shape module

import functools

class opr_shape():
    """ precedence => 4A -> 4B -> 1B -> 1A -> 3A -> 3B (5 > 4 > 0 > 1 > 3 > 2) """
    
    def __init__(self, probs, ytrain):
        self.probs = probs
        self.ytrain = ytrain
        self.shapevec()
        self.main()
        
    def shapevec(self):
        shape_vec = pd.DataFrame(self.ytrain.value_counts()/self.ytrain.shape[0])
        shape_vec['class'] = shape_vec.index
        shape_vec.sort_values('class', inplace=True)
        #shape_vec['response'] = shape_vec.response.cumsum()
        #print('the shape vector is: ', shape_vec)
        self.shape_vec = shape_vec
        return None

    def main(self):
        self.probs_df = {}
        for i in range(self.probs.shape[1]):
            self.probs_df[i] = pd.DataFrame({str('prob_'+str(i)): self.probs.iloc[:, i]})
            self.probs_df[i]['true_index'] = self.probs_df[i].index
            self.probs_df[i].sort_values(by=[str('prob_'+str(i))], inplace=True, kind='mergesort', ascending=False)
            self.probs_df[i].reset_index(inplace=True, drop=True)
            self.probs_df[i][str(str(i)+'_new_index')] = self.probs_df[i].index
            self.probs_df[i][str(str(i)+'_index_perc')] = self.probs_df[i][str(str(i)+'_new_index')].rank(pct=True)
            self.probs_df[i][str(str(i)+'_flag')] = np.where(self.probs_df[i][str(str(i)+'_index_perc')]<self.shape_vec.iloc[i, 0], 1, 0)
            
        merge = functools.partial(pd.merge, left_index=False, right_index=False, how='inner', on='true_index')
        finaldf = functools.reduce(merge, self.probs_df.values())
        
        finaldf['class'] = np.where(finaldf['5_flag']==1, 5, np.where(finaldf['4_flag']==1, 4, np.where(finaldf['0_flag']==1, 0, 
                            np.where(finaldf['1_flag']==1, 1, np.where(finaldf['2_flag']==1, 2, 3)))))
        self.finaldf = finaldf
        return None

oprs = opr_shape(probs=probs, ytrain=yvalid)

In [160]:
shaped_df = oprs.finaldf[['true_index', 'class']]
shaped_df.set_index('true_index', inplace=True)
shaped_df.sort_index(inplace=True)
shaped_df.reset_index(drop=True, inplace=True)
skm.accuracy_score(y_true=yvalid, y_pred=shaped_df['class'])
skm.confusion_matrix(y_true=yvalid, y_pred=shaped_df['class'])
shaped_df.columns = ['shape_class']

0.38792221084953943

array([[  7,  13,  57,  32,  13,   9],
       [  7,  22, 112,  69,  19,   6],
       [ 32,  86, 872, 523, 161,  49],
       [ 13,  20, 364, 517, 145,  62],
       [  8,  16, 141, 242,  90,  33],
       [  2,   7,  42,  82,  27,   8]], dtype=int64)

In [161]:


pred_df1 = pd.DataFrame(probs)
pred_df1['score'] = (pred_df1[0]*1)+(pred_df1[1]*2)+(pred_df1[2]*3)+(pred_df1[3]*4)+(pred_df1[4]*5)+(pred_df1[5]*6)
pred_df1['true_index'] = pred_df1.index
pred_df1.sort_values('score', inplace=True)
pred_df1.reset_index(inplace=True, drop=True)
pred_df1=pred_df1[['score', 'true_index']]
pred_df1['new_index'] = (pred_df1.index+1)/pred_df1.shape[0]
pred_df1['class'] = np.where(pred_df1['new_index']<0.031877, 0, 
                             (np.where(pred_df1['new_index']<0.100751, 1, 
                                       (np.where(pred_df1['new_index']<0.582359, 2, 
                                                 (np.where(pred_df1['new_index']<0.872194, 3, 
                                                           (np.where(pred_df1['new_index']<0.970513, 4, 5)))))))))
dummy_df = pred_df1.copy()
dummy_df = dummy_df[['true_index', 'class']]
dummy_df.set_index(keys='true_index', inplace=True)
dummy_df.sort_index(inplace=True)
dummy_df.reset_index(inplace=True, drop=True)

x1=dummy_df['class']
sklearn.metrics.accuracy_score(y_true=yvalid, y_pred=x1)
sklearn.metrics.confusion_matrix(y_pred=x1, y_true=yvalid)

NameError: name 'pred_probs1' is not defined