### Prediction using the final stacked model (XGB + LGBM)

In [1]:
import warnings
import pickle
import os
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import time
from datetime import datetime
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import  roc_auc_score ,roc_curve,auc
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV
import joblib

In [11]:
def final_fun_1(X):
    """Score raw quote rows with the stacked XGBoost + LightGBM model.

    The training file is reloaded and run through the same preprocessing as
    ``X`` because several decisions are derived from the training frame: which
    categorical columns are Y/N flags, which numeric columns are standardised,
    which columns get a ``*_NAN`` indicator, and which columns are one-hot
    encoded. Training ROC-AUC scores are printed along the way.

    Files read from the current working directory: ``dropped_columns``,
    ``train.csv``, ``sample_submission.csv``, ``standardize.pkl``,
    ``OHE_fit.pkl``, ``xgb_fit_final.pkl``, ``lgbm_fit_final.pkl`` and
    ``LR_stack.pkl``.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature rows. Presumably the columns of ``train.csv`` without
        ``QuoteConversion_Flag`` -- TODO confirm against the training notebook.
        ``X`` itself is not modified.

    Returns
    -------
    tuple
        ``(probabilities, Final_prediction)`` where ``probabilities`` is the
        second column of the stacker's ``predict_proba`` output for the rows
        of ``X`` (in row order) and ``Final_prediction`` is a DataFrame based
        on ``sample_submission.csv`` whose ``QuoteConversion_Flag`` column
        holds the stacker's predicted labels.
    """
    # NOTE(review): pickle.load can execute arbitrary code; every pickle read
    # in this function must come from a trusted source.
    with open('dropped_columns', 'rb') as fp:
        drop_columns = pickle.load(fp)

    # loading train data
    train_set5 = pd.read_csv("train.csv")
    y_train_final = train_set5['QuoteConversion_Flag']
    train_set5 = train_set5.drop(drop_columns, axis=1)
    train_set5 = train_set5.drop(['QuoteConversion_Flag'], axis=1)

    # Reset the index so that a slice such as df[100:200] lines up with the
    # helper frames that are concatenated column-wise further down.
    test_set5 = X.copy().reset_index(drop=True)
    test_set5 = test_set5.drop(drop_columns, axis=1)

    Final_prediction = pd.read_csv("sample_submission.csv", nrows=test_set5.shape[0], index_col=False)

    def converting_numeric(df, columns=None):
        """Map 'Y'/'N' to 1/0 in the two-valued categorical columns of ``df``.

        When ``columns`` is None the columns are detected from ``df``: an
        object column qualifies if it has exactly two distinct values or if
        its distinct values, ignoring the blank ' ', are exactly {'N', 'Y'}.
        When ``columns`` is given, that list is used as-is so another frame
        can be converted consistently. Values other than 'Y'/'N' are left
        unchanged. Returns ``(converted_frame, column_list)``.
        """
        cat_features = [i for i in df.columns if df.dtypes[i] == 'object']

        if columns is None:
            filtered_cat = []
            for col in cat_features:
                distinct = dict(df[col].value_counts())  # distinct non-null values
                if len(distinct) == 2 or set(v for v in distinct.keys() if v != " ") == set(('N', 'Y')):
                    filtered_cat.append(col)
        else:
            filtered_cat = list(columns)

        print("{} categorical varibales are converted into numerical features out of {} categorical variables".format(len(filtered_cat), len(cat_features)))

        df_1 = pd.DataFrame([])
        for col in filtered_cat:
            df_1[col] = df[col].map(lambda x: 0 if x == 'N' else (1 if x == 'Y' else x))
        return df_1, filtered_cat

    #https://stackoverflow.com/questions/39461328/in-pandas-what-does-the-na-action-parameter-to-series-map-do
    def removal_of_empty_if(df_1):
        """Turn blank ' ' entries of an object column into missing values.

        Non-blank entries are converted with ``int(float(x))``; a value that
        cannot be parsed raises ``ValueError``. If the result is numeric it is
        returned; otherwise the input series is returned unchanged so callers
        can always inspect ``.dtype`` on the result. Non-object input is
        returned as-is.
        """
        if df_1.dtype == 'O':
            df_trial = df_1.map(
                lambda x: None if (x == ' ' or pd.isna(x)) else int(float(x)),
                na_action=None,
            )
            if df_trial.dtype == 'int64' or df_trial.dtype == 'float64':
                print("Empty fields to None converted columns are: {} ".format(df_trial.name))
                return df_trial
            print("\nThe given column has other categories than empty value")
            return df_1
        print("The given column '{}' is already in integer datatype".format(df_1.name))
        return df_1

    def blank_to_missing(frame, name):
        """Replace column ``name`` of ``frame`` with its blank-free numeric form."""
        converted = removal_of_empty_if(frame[name])
        if converted.dtype == 'int64' or converted.dtype == 'float64':
            frame = frame.drop([name], axis=1)  # removing the string column
            frame[name] = converted
        else:
            print("\n undrop the column")
        return frame

    def comma_removal(column_comma):
        """Convert an object column holding numbers with thousands commas to int."""
        print("\nThe current datatype of column {}:".format(column_comma.name), column_comma.dtype)
        if column_comma.dtype == 'O':
            column_comma = column_comma.apply(lambda x: int(str(x).replace(',', "")))
            print("The datatype after conversion of column {}:".format(column_comma.name), column_comma.dtype)
        return column_comma

    def extract_date_feat(df_1):
        """Add date, weekday_name, month and year columns from Original_Quote_Date.

        The column must already hold datetime values; ``df_1`` is modified in
        place and also returned.
        """
        df_1["date"] = df_1['Original_Quote_Date'].map(lambda i: int(i.strftime("%d")))
        df_1["weekday_name"] = df_1['Original_Quote_Date'].map(lambda i: i.strftime("%A"))
        df_1["month"] = df_1['Original_Quote_Date'].map(lambda i: int(i.strftime("%m")))
        df_1["year"] = df_1['Original_Quote_Date'].map(lambda i: int(i.strftime("%Y")))
        return df_1

    def capture_nan(df_2):
        """Build one 0/1 ``<column>_NAN`` indicator per column of ``df_2``."""
        return pd.DataFrame(
            {col + '_NAN': np.where(df_2[col].isnull(), 1, 0) for col in df_2.columns},
            index=df_2.index,
        )

    def only_transform(dict_1, df_test):
        """One-hot encode the columns of ``df_test`` that have an encoder in ``dict_1``.

        Output columns are named ``<column>_<k>`` in encoder output order.
        The encoders are expected to return sparse matrices (``.toarray()``
        is called on the result).
        """
        df_test = pd.DataFrame(df_test)
        pieces = []
        for col in df_test.columns:
            if col in dict_1.keys():
                encoded = dict_1[col].transform(df_test[col].values.reshape(-1, 1)).toarray()
                pieces.append(pd.DataFrame(encoded, columns=[col + "_" + str(m) for m in range(encoded.shape[1])]))
        if not pieces:
            return pd.DataFrame([])
        return pd.concat(pieces, axis=1)

    # Y/N flags -> 1/0. The column list is detected on the training frame and
    # reused for the input so that a small input slice (where a flag column
    # may show only one value) is converted exactly like the training data.
    df_numeric, filtered_cat = converting_numeric(train_set5)
    train_set5 = train_set5.drop(filtered_cat, axis=1)
    train_set5 = pd.concat([train_set5, df_numeric], axis=1)

    df_numeric, _ = converting_numeric(test_set5, columns=filtered_cat)
    test_set5 = test_set5.drop(filtered_cat, axis=1)
    test_set5 = pd.concat([test_set5, df_numeric], axis=1)

    cat_features = [i for i in train_set5.columns if train_set5.dtypes[i] == 'object']
    cat_features_test = [i for i in test_set5.columns if test_set5.dtypes[i] == 'object']

    # columns that are still strings in the input but numeric in train
    null_int_test_col = set(cat_features_test).difference(cat_features)

    print("\nThe columns which has null in test data but not null in train data is present as integer column", null_int_test_col, "\n")

    train_set5 = blank_to_missing(train_set5, 'GeographicField63')
    test_set5 = blank_to_missing(test_set5, 'GeographicField63')
    for col in null_int_test_col:
        test_set5 = blank_to_missing(test_set5, col)

    #Updating the "Field10" column by removing the comma at 1000's places and converting into integer
    train_set5["Field10"] = comma_removal(train_set5["Field10"])
    test_set5["Field10"] = comma_removal(test_set5["Field10"])

    cat_features = [i for i in train_set5.columns if train_set5.dtypes[i] == 'object']
    numerical_features = list(set(train_set5.columns) - set(cat_features))

    #https://stackoverflow.com/questions/34148815/check-if-a-pandas-series-has-at-least-one-item-greater-than-a-value
    # numeric columns with any training value above the threshold are standardised
    std_threshold = 25
    std_columns = [col for col in numerical_features if (train_set5[col] > std_threshold).any()]

    with open('standardize.pkl', 'rb') as std:
        scaler = pickle.load(std)

    # std_columns comes from a set, so its order is not stable between
    # interpreter sessions. If the scaler recorded the names it was fitted
    # with, use that order so each column meets its own mean/scale.
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    if fitted_names is not None and set(fitted_names) == set(std_columns):
        std_columns = list(fitted_names)
    print("\nThe features which are standardized", std_columns)

    train_set5[std_columns] = scaler.transform(train_set5[std_columns])
    test_set5[std_columns] = scaler.transform(test_set5[std_columns])  # only transform is used for test data

    # date features
    train_set5['Original_Quote_Date'] = pd.to_datetime(train_set5['Original_Quote_Date'])
    train_set5 = extract_date_feat(train_set5)
    test_set5['Original_Quote_Date'] = pd.to_datetime(test_set5['Original_Quote_Date'])
    test_set5 = extract_date_feat(test_set5)

    train_set5 = train_set5.drop(['Original_Quote_Date'], axis=1)
    test_set5 = test_set5.drop(['Original_Quote_Date'], axis=1)

    # null count per training column that contains at least one missing value
    train_null = dict(train_set5[train_set5.columns[train_set5.isnull().any()]].isna().sum())
    nan_columns = list(train_null)

    # the indicator columns are driven by the training columns for both frames
    df_train = train_set5[nan_columns]
    df_test = test_set5[nan_columns]

    print("\nThe columns which undergoes for NaN imputation are:", train_null.keys())

    nan_result = capture_nan(df_train)
    nan_result_test = capture_nan(df_test)

    #Model-SET-2 NAN new column addition method
    X_train_set5 = pd.concat([train_set5, nan_result], axis=1)
    X_test_set5 = pd.concat([test_set5, nan_result_test], axis=1)

    cat_features = [i for i in X_train_set5.columns if X_train_set5.dtypes[i] == 'object']
    numerical_features = list(set(X_train_set5.columns) - set(cat_features))

    #train OHE dictionary
    with open('OHE_fit.pkl', 'rb') as ohe:
        encoded_dict_1 = pickle.load(ohe)

    train_data_cat_1 = only_transform(encoded_dict_1, X_train_set5[cat_features])
    test_data_cat_1 = only_transform(encoded_dict_1, X_test_set5[cat_features])  # transform

    X_train_set5 = pd.concat([X_train_set5[numerical_features].reset_index(drop=True), train_data_cat_1.reset_index(drop=True)], axis=1)
    X_test_set5 = pd.concat([X_test_set5[numerical_features].reset_index(drop=True), test_data_cat_1.reset_index(drop=True)], axis=1)

    print("\nEmpty fields are filled with -99 value")
    X_train_set5 = X_train_set5.fillna(-99)
    X_test_set5 = X_test_set5.fillna(-99)

    print("\nDimension of train data after OHE:", X_train_set5.shape)
    print("Dimension of test data after OHE:", X_test_set5.shape)

    #XGB
    with open('xgb_fit_final.pkl', 'rb') as xgb_fit1:
        xgb_fit = pickle.load(xgb_fit1)

    # feature names in the order the XGB model was trained with
    column_names_xgb = xgb_fit.best_estimator_.get_booster().feature_names

    predict_y_train_final_xgb = xgb_fit.predict_proba(X_train_set5[column_names_xgb])
    print("\nTraining data roc_auc score from XGB model is: ", roc_auc_score(y_train_final, predict_y_train_final_xgb[:, 1]))
    predict_y_test_xgb = xgb_fit.predict_proba(X_test_set5[column_names_xgb])

    #LGBM
    with open('lgbm_fit_final.pkl', 'rb') as lgbm_fit1:
        lgbm_fit = pickle.load(lgbm_fit1)

    # feature names in the order the LGBM model was trained with
    column_order_lgbm = lgbm_fit.booster_.feature_name()

    predict_y_train_final_lgbm = lgbm_fit.predict_proba(X_train_set5[column_order_lgbm])
    print("\nTraining data roc_auc score from LGBM model is: ", roc_auc_score(y_train_final, predict_y_train_final_lgbm[:, 1]))
    predict_y_test_lgbm = lgbm_fit.predict_proba(X_test_set5[column_order_lgbm])

    # stacker input: one column per base model (XGB first, LGBM second)
    final_train_array = np.column_stack([predict_y_train_final_xgb[:, 1], predict_y_train_final_lgbm[:, 1]])
    final_test_array = np.column_stack([predict_y_test_xgb[:, 1], predict_y_test_lgbm[:, 1]])

    #LR on stacked model result
    with open('LR_stack.pkl', 'rb') as stacklr:
        logisticR = pickle.load(stacklr)
    predict_prob_val_train_xg_lg55 = logisticR.predict_proba(final_train_array)  # XGB+LGBM only
    predict_prob_val_test_xg_lg55 = logisticR.predict_proba(final_test_array)
    predicted_labels = logisticR.predict(final_test_array)
    print("\nTraining data roc_auc score from Stacked model is: ", roc_auc_score(y_train_final, predict_prob_val_train_xg_lg55[:, 1]))
    print("Test results are predicted")

    # The template's QuoteNumber values belong to the rows of the submission
    # file; when the caller's frame carries its own ids, report those so the
    # labels are attached to the rows that were actually scored.
    if 'QuoteNumber' in X.columns and 'QuoteNumber' in Final_prediction.columns:
        Final_prediction['QuoteNumber'] = X['QuoteNumber'].to_numpy()
    Final_prediction['QuoteConversion_Flag'] = predicted_labels

    return predict_prob_val_test_xg_lg55[:, 1], Final_prediction

In [12]:
# Score the full test file. The results are bound to names so the expensive
# inference can be reused by later cells instead of being lost after display.
X_test_data = pd.read_csv("test.csv")
test_probabilities, test_submission = final_fun_1(X_test_data)
test_probabilities, test_submission

11 categorical varibales are converted into numerical features out of 20 categorical variables
11 categorical varibales are converted into numerical features out of 20 categorical variables

The columns which has null in test data but not null in train data is present as integer column {'PropertyField37'} 

Empty fields to None converted columns are: GeographicField63 
Empty fields to None converted columns are: GeographicField63 
Empty fields to None converted columns are: PropertyField37 

The current datatype of column Field10: object
The datatype after conversion of column Field10: int64

The current datatype of column Field10: object
The datatype after conversion of column Field10: int64

The features which are standardized ['Field10', 'SalesField12', 'SalesField11', 'Field7', 'PersonalField14']

The columns which undergoes for NaN imputation are: dict_keys(['PersonalField84', 'PersonalField7', 'PropertyField3', 'PropertyField4', 'PropertyField32', 'PropertyField34', 'PropertyFiel

(array([0.01323976, 0.01737577, 0.01757762, ..., 0.99427084, 0.01322205,
        0.09275605]),
         QuoteNumber  QuoteConversion_Flag
 0                 3                     0
 1                 5                     0
 2                 7                     0
 3                 9                     0
 4                10                     0
 ...             ...                   ...
 173831       434570                     0
 173832       434573                     0
 173833       434574                     1
 173834       434575                     0
 173835       434589                     0
 
 [173836 rows x 2 columns])

In [13]:
def final_fun_2(X,Y):
    """Run ``final_fun_1`` on ``X`` and return the ROC-AUC of its probabilities against ``Y``.

    ``X`` is passed to ``final_fun_1`` unchanged; ``Y`` holds the true labels
    for the same rows. A header line is printed before the score is returned.
    """
    probabilities, _submission = final_fun_1(X)
    print("\nThe test ROC_AUC score is:")
    score = roc_auc_score(Y, probabilities)
    return score

In [14]:
# Load the training file and split the target column off the feature frame.
X_train_input = pd.read_csv("train.csv")
y = X_train_input.pop('QuoteConversion_Flag')

In [15]:
# Check the ROC-AUC score for the first 100 rows of the training data.
# NOTE(review): these rows come from train.csv, so the models have presumably
# seen them during fitting -- this is a smoke test, not a held-out estimate.
final_fun_2(X_train_input.iloc[:100], y.iloc[:100])

11 categorical varibales are converted into numerical features out of 20 categorical variables
11 categorical varibales are converted into numerical features out of 20 categorical variables

The columns which has null in test data but not null in train data is present as integer column set() 

Empty fields to None converted columns are: GeographicField63 
The given column 'GeographicField63' is already in integer datatype

The current datatype of column Field10: object
The datatype after conversion of column Field10: int64

The current datatype of column Field10: object
The datatype after conversion of column Field10: int64

The features which are standardized ['Field10', 'SalesField12', 'SalesField11', 'Field7', 'PersonalField14']

The columns which undergoes for NaN imputation are: dict_keys(['PersonalField84', 'PersonalField7', 'PropertyField3', 'PropertyField4', 'PropertyField32', 'PropertyField34', 'PropertyField36', 'PropertyField38', 'GeographicField63'])
OHE dict {'CoverageFiel

0.9906759906759907

#### References

* https://github.com/dmlc/xgboost/issues/5275
* https://stackoverflow.com/questions/34952651/only-integers-slices-ellipsis-numpy-newaxis-none-and-intege
* https://stackoverflow.com/questions/42338972/valueerror-feature-names-mismatch-in-xgboost-in-the-predict-function
* https://github.com/dmlc/xgboost/issues/2334