IMPORT

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config
from sklearn.inspection import permutation_importance
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.metrics import roc_auc_score, roc_curve, make_scorer, f1_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import os
import mlflow 
import random
from matplotlib_venn import venn2
from sklearn.decomposition import TruncatedSVD,NMF,FactorAnalysis,KernelPCA,FastICA
import gc
sns.set_theme(style = 'white', palette = 'viridis')
pal = sns.color_palette('viridis')
import subprocess
cmd = "git rev-parse --short HEAD"
hash = subprocess.check_output(cmd.split()).strip().decode('utf-8')
_cmd = "git rev-parse --abbrev-ref HEAD"
branch =subprocess.check_output(_cmd.split()).strip().decode('utf-8')
branch = "-".join(branch.split("/"))
pd.set_option('display.max_rows', 100)
set_config(transform_output = 'pandas')
pd.options.mode.chained_assignment = None

In [2]:
class CFG:
    folds=5
    seed=1
    train=False
    ver=f'{branch}_{hash}'
    kfold='sk'
    model='lgb'
    n_components=10
    drop_col=[]
    cluster_num=4
    num_iterations=100000
def fix_seed(seed=CFG.seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
fix_seed()

In [3]:
OUTPUT_DIR = "./outputs/"
INPUT_DIR = "../data/"

In [4]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv('../data/test.csv')

In [5]:
def describe_satatistics(input_df):
    desc = pd.DataFrame(index=list(input_df))
    desc["nunique"] = train.nunique()
    desc["unique%"] = train.nunique() / len(input_df)
    desc["null"] = train.isnull().sum()
    desc["type"] = train.dtypes
    desc = pd.concat([desc,input_df.describe().T],axis=1)
    return desc

# Preprocessing

In [6]:
train_df = train.copy()
test_df = test.copy()

In [7]:
train["train_data"]=1
test["train_data"]=0
eda_df = pd.concat([train,test])
eda_df.reset_index(inplace=True,drop=True)

In [8]:
dataframe_list = [eda_df,train_df,test_df]

In [9]:
pd.set_option('display.max_columns', None)
eda_df.head()

Unnamed: 0.1,Unnamed: 0,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,RevLineCr,LowDoc,DisbursementDate,MIS_Status,Sector,ApprovalDate,ApprovalFY,City,State,BankState,DisbursementGross,GrAppv,SBA_Appv,UrbanRural,train_data
0,0,163,21,1.0,0,0,1,N,N,31-Jan-98,1.0,0,22-Sep-06,2006,PHOENIX,AZ,SD,"$80,000.00","$80,000.00","$68,000.00",0,1
1,1,84,6,1.0,4,0,0,0,N,31-Oct-93,1.0,62,30-Jun-92,1992,MCALESTER,OK,OK,"$287,000.00","$287,000.00","$229,600.00",0,1
2,2,242,45,1.0,4,90,0,N,N,31-Aug-01,1.0,42,18-Apr-01,2001,HAWTHORNE,NJ,NJ,"$31,983.00","$30,000.00","$15,000.00",1,1
3,3,237,4,1.0,0,0,0,N,N,31-Aug-07,1.0,33,6-Oct-03,2004,NASHVILLE,TN,SD,"$229,000.00","$229,000.00","$229,000.00",0,1
4,4,184,0,1.0,0,0,0,N,N,8-Jun-83,1.0,0,17-Dec-99,2000,POMONA,CA,CA,"$525,000.00","$525,000.00","$393,750.00",0,1


In [10]:
eda_df_02 = describe_satatistics(eda_df)

In [11]:
eda_df_02

Unnamed: 0,nunique,unique%,null,type,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,42307,0.499994,0,int64,84615.0,42307.0,24426.390851,0.0,21153.5,42307.0,63460.5,84614.0
Term,228,0.002695,0,int64,84615.0,108.558447,84.731943,0.0,56.0,82.0,168.0,360.0
NoEmp,196,0.002316,0,int64,84615.0,9.69951,17.365667,0.0,2.0,4.0,12.0,208.0
NewExist,2,2.4e-05,0,float64,84615.0,1.209632,0.407048,1.0,1.0,1.0,1.0,2.0
CreateJob,49,0.000579,0,int64,84615.0,2.185144,5.112835,0.0,0.0,0.0,2.0,86.0
RetainedJob,83,0.000981,0,int64,84615.0,3.488897,8.124039,0.0,0.0,0.0,4.0,175.0
FranchiseCode,271,0.003203,0,int64,84615.0,1990.058843,10587.667836,0.0,0.0,0.0,1.0,90709.0
RevLineCr,4,4.7e-05,1079,object,,,,,,,,
LowDoc,6,7.1e-05,531,object,,,,,,,,
DisbursementDate,916,0.010826,150,object,,,,,,,,


In [12]:
def clean_date(input_df,col):
    input_df[col] = pd.to_datetime(input_df[col])
    return input_df

In [13]:
for c in dataframe_list:
    for a in ["DisbursementDate","ApprovalDate"]:
        c = clean_date(c,a)

  input_df[col] = pd.to_datetime(input_df[col])
  input_df[col] = pd.to_datetime(input_df[col])
  input_df[col] = pd.to_datetime(input_df[col])
  input_df[col] = pd.to_datetime(input_df[col])
  input_df[col] = pd.to_datetime(input_df[col])
  input_df[col] = pd.to_datetime(input_df[col])


In [14]:
le = LabelEncoder()
label_columns_list = ["RevLineCr","LowDoc","City","State","BankState"]
def label_encode(input_df):
    for col in label_columns_list:
        le = LabelEncoder()
        le.fit(input_df[col])
        input_df[f"{col}_label"] = le.transform(input_df[col])
    return input_df


In [15]:
def clean_money(input_df):
    money_columns = ["DisbursementGross","GrAppv","SBA_Appv"]
    for col in money_columns:
        input_df[col] = input_df[col].str.replace('$', '').replace(',', '')
        input_df[col] = input_df[col].str.replace(",","")
        input_df[col] = input_df[col].str.extract(r"(\d+)")
        input_df[col] = input_df[col].astype(int)
    return input_df

In [16]:
def create_NoEmpLabel(input_df):
    input_df["NoEmp_label"] = 0
    input_df["NoEmp_label"] = np.where(input_df["NoEmp"]<3,1,input_df["NoEmp_label"])
    input_df["NoEmp_label"] = np.where((input_df["NoEmp"]>=3) & (input_df["NoEmp"]<5),2,input_df["NoEmp_label"])
    input_df["NoEmp_label"] = np.where((input_df["NoEmp"]>=5 )& (input_df["NoEmp"]<13),3,input_df["NoEmp_label"])
    input_df["NoEmp_label"] = np.where(input_df["NoEmp"]>=13,4,input_df["NoEmp_label"])
    print(input_df["NoEmp_label"].value_counts())
    return input_df

In [17]:
for c in dataframe_list:
    c = clean_money(c)
    c = label_encode(c)
    c = create_NoEmpLabel(c)

NoEmp_label
1    25873
2    20864
4    20698
3    17180
Name: count, dtype: int64
NoEmp_label
1    12945
2    10465
4    10332
3     8565
Name: count, dtype: int64
NoEmp_label
1    12928
2    10399
4    10366
3     8615
Name: count, dtype: int64


In [20]:
population_df = pd.read_csv("../data/us_population.csv")

In [21]:
def create_popflag(input_df,pop_df,year):
    pop_df = pop_df.sort_values(f"{year}",ascending=False)
    pop_df[f"{year}_flag"] = pd.qcut(pop_df[f"{year}"],4,labels=False)
    pop_df[f"{year}_flag"] = pop_df[f"{year}_flag"]+1
    input_df = pd.merge(input_df, pop_df[[f"{year}_flag","State"]],how="left",on="State")

    return input_df[f"{year}_flag"]


In [25]:
train_df

Unnamed: 0.1,Unnamed: 0,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,RevLineCr,LowDoc,DisbursementDate,MIS_Status,Sector,ApprovalDate,ApprovalFY,City,State,BankState,DisbursementGross,GrAppv,SBA_Appv,UrbanRural,RevLineCr_label,LowDoc_label,City_label,State_label,BankState_label,NoEmp_label
0,0,163,21,1.0,0,0,1,N,N,1998-01-31,1,0,2006-09-22,2006,PHOENIX,AZ,SD,80000,80000,68000,0,1,3,1857,3,41,4
1,1,84,6,1.0,4,0,0,0,N,1993-10-31,1,62,1992-06-30,1992,MCALESTER,OK,OK,287000,287000,229600,0,0,3,1451,36,36,3
2,2,242,45,1.0,4,90,0,N,N,2001-08-31,1,42,2001-04-18,2001,HAWTHORNE,NJ,NJ,31983,30000,15000,1,1,3,1021,31,31,4
3,3,237,4,1.0,0,0,0,N,N,2007-08-31,1,33,2003-10-06,2004,NASHVILLE,TN,SD,229000,229000,229000,0,1,3,1610,42,41,2
4,4,184,0,1.0,0,0,0,N,N,1983-06-08,1,0,1999-12-17,2000,POMONA,CA,CA,525000,525000,393750,0,1,3,1893,4,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42302,42302,283,14,1.0,0,0,1,N,N,1998-01-31,1,0,1995-03-02,1995,PHILADELPHIA,PA,PA,80000,80000,68000,0,1,3,1856,38,38,4
42303,42303,53,2,1.0,0,0,0,Y,N,1991-04-03,1,42,2007-06-06,2007,LOS ANGELES,CA,SD,5000,5000,4250,1,3,3,1349,4,41,1
42304,42304,59,6,2.0,0,0,1,N,N,2003-02-28,1,42,2003-03-14,2003,COLUMBUS,OH,OH,60000,60000,51000,0,1,3,485,35,35,3
42305,42305,295,18,1.0,0,8,0,N,N,1997-12-10,1,42,1989-08-23,1989,CLOQUET,MN,MN,294000,294000,220500,0,1,3,458,23,23,4


In [23]:
train_df["Country_flag"] = 0
train_df= train_df.mask(train_df["ApprovalFY"]<2000,train_df["ApprovalFY"].apply())

0        2006
1        1992
2        2001
3        2004
4        2000
         ... 
42302    1995
42303    2007
42304    2003
42305    1989
42306    2011
Name: ApprovalFY, Length: 42307, dtype: int64

In [161]:
def create_main_key(input_df,column):
    tmp_df = input_df.copy()
    tmp_df[f"main_key_{column}"] = tmp_df[f"{column}"].astype(str).str.cat(tmp_df["Sector"].astype(str), sep="_")
    count_df =pd.DataFrame(tmp_df[f"main_key_{column}"].value_counts())
    count_df.reset_index(inplace = True)
    top_key_list = count_df[f"main_key_{column}"].head(100)
    input_df[f"main_key_{column}"] = input_df[f"{column}"].astype(str).str.cat(input_df["Sector"].astype(str), sep="_")
    input_df[f"main_key_{column}"] = np.where(input_df[f"main_key_{column}"].isin(top_key_list),input_df[f"main_key_{column}"],"XX")
    input_df[f"main_key_{column}"].value_counts()
    return input_df

In [162]:
main_key_list = []
for i,dataFrame in enumerate(dataframe_list):
    for col in ["NoEmp_label","LowDoc_label","RevLineCr_label","State","City"]:
            dataFrame= create_main_key(dataFrame,col)
            if i == 0:
                main_key_list.append(f"main_key_{col}")
            print(col,"completed")

NoEmp_label completed
LowDoc_label completed
RevLineCr_label completed
State completed
City completed
NoEmp_label completed
LowDoc_label completed
RevLineCr_label completed
State completed
City completed
NoEmp_label completed
LowDoc_label completed
RevLineCr_label completed
State completed
City completed


In [163]:
main_key_list

['main_key_NoEmp_label',
 'main_key_LowDoc_label',
 'main_key_RevLineCr_label',
 'main_key_State',
 'main_key_City']

In [164]:
def create_groupby_features(input_df,target_col,group_feature):
    tmp_df =  input_df.groupby(group_feature)[target_col].agg(["median"])
    tmp_df.reset_index(inplace=True)
    tmp_df.rename(columns = {"median":f"{target_col}_{group_feature}_med"},inplace=True)
    if group_feature=='main_key':
        tmp_df[group_feature]=np.where(tmp_df[group_feature]=='XX','ZZZ',tmp_df[group_feature])
    input_df = pd.merge(input_df,tmp_df,how="left",on=group_feature)
    input_df[f"{target_col}_{group_feature}_diff"] = input_df[target_col] - input_df[f"{target_col}_{group_feature}_med"]
    print("completed")
    return input_df[[f"{target_col}_{group_feature}_med",f"{target_col}_{group_feature}_diff"]]

In [165]:
for c in dataframe_list:
    for key in main_key_list:
        for col in ["DisbursementGross","GrAppv","SBA_Appv","CreateJob","RetainedJob","NoEmp","Term"]:
            c[[f"{col}_{key}_med",f"{col}_{key}_diff"]] = create_groupby_features(c,col,key)


completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed


In [166]:
eda_df

Unnamed: 0.1,Unnamed: 0,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,RevLineCr,LowDoc,DisbursementDate,MIS_Status,Sector,ApprovalDate,ApprovalFY,City,State,BankState,DisbursementGross,GrAppv,SBA_Appv,UrbanRural,train_data,RevLineCr_label,LowDoc_label,City_label,State_label,BankState_label,NoEmp_label,main_key_NoEmp_label,main_key_LowDoc_label,main_key_RevLineCr_label,main_key_State,main_key_City,DisbursementGross_main_key_NoEmp_label_med,DisbursementGross_main_key_NoEmp_label_diff,GrAppv_main_key_NoEmp_label_med,GrAppv_main_key_NoEmp_label_diff,SBA_Appv_main_key_NoEmp_label_med,SBA_Appv_main_key_NoEmp_label_diff,CreateJob_main_key_NoEmp_label_med,CreateJob_main_key_NoEmp_label_diff,RetainedJob_main_key_NoEmp_label_med,RetainedJob_main_key_NoEmp_label_diff,NoEmp_main_key_NoEmp_label_med,NoEmp_main_key_NoEmp_label_diff,Term_main_key_NoEmp_label_med,Term_main_key_NoEmp_label_diff,DisbursementGross_main_key_LowDoc_label_med,DisbursementGross_main_key_LowDoc_label_diff,GrAppv_main_key_LowDoc_label_med,GrAppv_main_key_LowDoc_label_diff,SBA_Appv_main_key_LowDoc_label_med,SBA_Appv_main_key_LowDoc_label_diff,CreateJob_main_key_LowDoc_label_med,CreateJob_main_key_LowDoc_label_diff,RetainedJob_main_key_LowDoc_label_med,RetainedJob_main_key_LowDoc_label_diff,NoEmp_main_key_LowDoc_label_med,NoEmp_main_key_LowDoc_label_diff,Term_main_key_LowDoc_label_med,Term_main_key_LowDoc_label_diff,DisbursementGross_main_key_RevLineCr_label_med,DisbursementGross_main_key_RevLineCr_label_diff,GrAppv_main_key_RevLineCr_label_med,GrAppv_main_key_RevLineCr_label_diff,SBA_Appv_main_key_RevLineCr_label_med,SBA_Appv_main_key_RevLineCr_label_diff,CreateJob_main_key_RevLineCr_label_med,CreateJob_main_key_RevLineCr_label_diff,RetainedJob_main_key_RevLineCr_label_med,RetainedJob_main_key_RevLineCr_label_diff,NoEmp_main_key_RevLineCr_label_med,NoEmp_main_key_RevLineCr_label_diff,Term_main_key_RevLineCr_label_med,Term_main_key_RevLineCr_label_diff,DisbursementGross_main_key_State_med,DisbursementGross_main_key_State_diff,GrAppv_main_key_State_med,GrAppv_main_key_State_diff,SBA_Appv_main_key_State_med,SBA_Appv_main_key_State_diff,CreateJob_main_key_State_med,CreateJob_main_key_State_diff,RetainedJob_main_key_State_med,RetainedJob_main_key_State_diff,NoEmp_main_key_State_med,NoEmp_main_key_State_diff,Term_main_key_State_med,Term_main_key_State_diff,DisbursementGross_main_key_City_med,DisbursementGross_main_key_City_diff,GrAppv_main_key_City_med,GrAppv_main_key_City_diff,SBA_Appv_main_key_City_med,SBA_Appv_main_key_City_diff,CreateJob_main_key_City_med,CreateJob_main_key_City_diff,RetainedJob_main_key_City_med,RetainedJob_main_key_City_diff,NoEmp_main_key_City_med,NoEmp_main_key_City_diff,Term_main_key_City_med,Term_main_key_City_diff
0,0,163,21,1.0,0,0,1,N,N,1998-01-31,1.0,0,2006-09-22,2006,PHOENIX,AZ,SD,80000,80000,68000,0,1,1,3,2208,3,42,4,4_0,3_0,1_0,AZ_0,PHOENIX_0,100000.0,-20000.0,100000.0,-20000.0,80000.0,-12000.0,0.0,0.0,0.0,0.0,19.0,2.0,160.0,3.0,100000.0,-20000.0,100000.0,-20000.0,76500.0,-8500.0,0.0,0.0,0.0,0.0,4.0,17.0,82.0,81.0,100000.0,-20000.0,100000.0,-20000.0,79200.0,-11200.0,0.0,0.0,0.0,0.0,4.0,17.0,83.0,80.0,80000.0,0.0,80000.0,0.0,68000.0,0.0,0.0,0.0,0.0,0.0,6.0,15.0,119.0,44.0,80000.0,0.0,80000.0,0.0,68000.0,0.0,0.0,0.0,0.0,0.0,10.0,11.0,163.0,0.0
1,1,84,6,1.0,4,0,0,0,N,1993-10-31,1.0,62,1992-06-30,1992,MCALESTER,OK,OK,287000,287000,229600,0,1,0,3,1723,36,36,3,3_62,3_62,0_62,XX,XX,120000.0,167000.0,100000.0,187000.0,76500.0,153100.0,0.0,4.0,0.0,0.0,6.0,0.0,83.0,1.0,140000.0,147000.0,129000.0,158000.0,106250.0,123350.0,0.0,4.0,0.0,0.0,5.0,1.0,83.0,1.0,85000.0,202000.0,80000.0,207000.0,63000.0,166600.0,0.0,4.0,0.0,0.0,3.0,3.0,82.0,2.0,99000.0,188000.0,83000.0,204000.0,57600.0,172000.0,0.0,4.0,0.0,0.0,4.0,2.0,82.0,2.0,99000.0,188000.0,84000.0,203000.0,58500.0,171100.0,0.0,4.0,0.0,0.0,4.0,2.0,82.0,2.0
2,2,242,45,1.0,4,90,0,N,N,2001-08-31,1.0,42,2001-04-18,2001,HAWTHORNE,NJ,NJ,31983,30000,15000,1,1,1,3,1214,31,31,4,4_42,3_42,1_42,NJ_42,XX,116664.0,-84681.0,91000.0,-61000.0,58100.0,-43100.0,0.0,4.0,0.0,90.0,18.0,27.0,85.0,157.0,100000.0,-68017.0,82500.0,-52500.0,51000.0,-36000.0,0.0,4.0,0.0,90.0,5.0,40.0,82.0,160.0,113965.5,-81982.5,100000.0,-70000.0,72650.0,-57650.0,0.0,4.0,0.0,90.0,6.0,39.0,83.0,159.0,115000.0,-83017.0,100000.0,-70000.0,60350.0,-45350.0,0.0,4.0,0.0,90.0,13.0,32.0,88.0,154.0,99000.0,-67017.0,84000.0,-54000.0,58500.0,-43500.0,0.0,4.0,0.0,90.0,4.0,41.0,82.0,160.0
3,3,237,4,1.0,0,0,0,N,N,2007-08-31,1.0,33,2003-10-06,2004,NASHVILLE,TN,SD,229000,229000,229000,0,1,1,3,1906,42,42,2,2_33,3_33,1_33,TN_33,NASHVILLE_33,98000.0,131000.0,83000.0,146000.0,55800.0,173200.0,0.0,0.0,1.0,-1.0,4.0,0.0,75.0,162.0,92500.0,136500.0,75000.0,154000.0,50000.0,179000.0,0.0,0.0,1.0,-1.0,5.0,-1.0,82.0,155.0,105596.0,123404.0,95000.0,134000.0,62700.0,166300.0,0.0,0.0,0.0,0.0,6.0,-2.0,83.0,154.0,121000.0,108000.0,121000.0,108000.0,88500.0,140500.0,0.0,0.0,1.0,-1.0,6.0,-2.0,82.0,155.0,154202.5,74797.5,139000.0,90000.0,115500.0,113500.0,0.0,0.0,0.0,0.0,6.0,-2.0,84.0,153.0
4,4,184,0,1.0,0,0,0,N,N,1983-06-08,1.0,0,1999-12-17,2000,POMONA,CA,CA,525000,525000,393750,0,1,1,3,2246,4,4,1,1_0,3_0,1_0,CA_0,POMONA_0,80000.0,445000.0,80000.0,445000.0,68000.0,325750.0,0.0,0.0,0.0,0.0,1.0,-1.0,63.0,121.0,100000.0,425000.0,100000.0,425000.0,76500.0,317250.0,0.0,0.0,0.0,0.0,4.0,-4.0,82.0,102.0,100000.0,425000.0,100000.0,425000.0,79200.0,314550.0,0.0,0.0,0.0,0.0,4.0,-4.0,83.0,101.0,110000.0,415000.0,110000.0,415000.0,90000.0,303750.0,0.0,0.0,0.0,0.0,4.0,-4.0,79.0,105.0,243000.0,282000.0,243000.0,282000.0,183750.0,210000.0,0.0,0.0,0.0,0.0,2.0,-2.0,61.0,123.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84610,84610,243,10,1.0,3,14,0,N,N,2012-12-01,,42,2012-04-23,2012,FT. WRIGHT,KY,OH,390000,150000,127500,0,0,1,3,1022,17,35,3,3_42,3_42,1_42,XX,XX,104840.0,285160.0,92000.0,58000.0,62000.0,65500.0,0.0,3.0,0.0,14.0,6.0,4.0,82.0,161.0,100000.0,290000.0,82500.0,67500.0,51000.0,76500.0,0.0,3.0,0.0,14.0,5.0,5.0,82.0,161.0,113965.5,276034.5,100000.0,50000.0,72650.0,54850.0,0.0,3.0,0.0,14.0,6.0,4.0,83.0,160.0,99000.0,291000.0,83000.0,67000.0,57600.0,69900.0,0.0,3.0,0.0,14.0,4.0,6.0,82.0,161.0,99000.0,291000.0,84000.0,66000.0,58500.0,69000.0,0.0,3.0,0.0,14.0,4.0,6.0,82.0,161.0
84611,84611,178,0,2.0,0,0,1,N,N,2003-11-30,,0,2006-10-27,2007,PHILADELPHIA,PA,RI,100000,100000,90000,0,0,1,3,2207,38,40,1,1_0,3_0,1_0,PA_0,PHILADELPHIA_0,80000.0,20000.0,80000.0,20000.0,68000.0,22000.0,0.0,0.0,0.0,0.0,1.0,-1.0,63.0,115.0,100000.0,0.0,100000.0,0.0,76500.0,13500.0,0.0,0.0,0.0,0.0,4.0,-4.0,82.0,96.0,100000.0,0.0,100000.0,0.0,79200.0,10800.0,0.0,0.0,0.0,0.0,4.0,-4.0,83.0,95.0,80000.0,20000.0,80000.0,20000.0,64000.0,26000.0,0.0,0.0,0.0,0.0,4.0,-4.0,84.0,94.0,80000.0,20000.0,80000.0,20000.0,68000.0,22000.0,0.0,0.0,0.0,0.0,5.0,-5.0,166.0,12.0
84612,84612,42,1,2.0,3,9,0,Y,N,2009-02-28,,33,1989-09-21,1989,ELMHURST,IL,IL,17000,17000,13600,0,0,3,3,858,14,14,1,1_33,3_33,3_33,IL_33,XX,50000.0,-33000.0,50000.0,-33000.0,25000.0,-11400.0,0.0,3.0,1.0,8.0,1.0,0.0,63.0,-21.0,92500.0,-75500.0,75000.0,-58000.0,50000.0,-36400.0,0.0,3.0,1.0,8.0,5.0,-4.0,82.0,-40.0,50000.0,-33000.0,50000.0,-33000.0,25000.0,-11400.0,1.0,2.0,3.0,6.0,2.0,-1.0,64.0,-22.0,105635.0,-88635.0,100000.0,-83000.0,51000.0,-37400.0,0.0,3.0,1.0,8.0,6.0,-5.0,82.0,-40.0,99000.0,-82000.0,84000.0,-67000.0,58500.0,-44900.0,0.0,3.0,0.0,9.0,4.0,-3.0,82.0,-40.0
84613,84613,76,15,1.0,0,0,0,N,N,2008-01-31,,0,2006-04-03,2006,NASHVILLE,TN,TN,7500,7500,6375,0,0,1,3,1906,42,43,4,4_0,3_0,1_0,TN_0,NASHVILLE_0,100000.0,-92500.0,100000.0,-92500.0,80000.0,-73625.0,0.0,0.0,0.0,0.0,19.0,-4.0,160.0,-84.0,100000.0,-92500.0,100000.0,-92500.0,76500.0,-70125.0,0.0,0.0,0.0,0.0,4.0,11.0,82.0,-6.0,100000.0,-92500.0,100000.0,-92500.0,79200.0,-72825.0,0.0,0.0,0.0,0.0,4.0,11.0,83.0,-7.0,125000.0,-117500.0,125000.0,-117500.0,109250.0,-102875.0,0.0,0.0,0.0,0.0,5.0,10.0,166.0,-90.0,275000.0,-267500.0,275000.0,-267500.0,206250.0,-199875.0,0.0,0.0,0.0,0.0,4.0,11.0,120.0,-44.0


In [167]:
train_df

Unnamed: 0.1,Unnamed: 0,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,RevLineCr,LowDoc,DisbursementDate,MIS_Status,Sector,ApprovalDate,ApprovalFY,City,State,BankState,DisbursementGross,GrAppv,SBA_Appv,UrbanRural,RevLineCr_label,LowDoc_label,City_label,State_label,BankState_label,NoEmp_label,main_key_NoEmp_label,main_key_LowDoc_label,main_key_RevLineCr_label,main_key_State,main_key_City,DisbursementGross_main_key_NoEmp_label_med,DisbursementGross_main_key_NoEmp_label_diff,GrAppv_main_key_NoEmp_label_med,GrAppv_main_key_NoEmp_label_diff,SBA_Appv_main_key_NoEmp_label_med,SBA_Appv_main_key_NoEmp_label_diff,CreateJob_main_key_NoEmp_label_med,CreateJob_main_key_NoEmp_label_diff,RetainedJob_main_key_NoEmp_label_med,RetainedJob_main_key_NoEmp_label_diff,NoEmp_main_key_NoEmp_label_med,NoEmp_main_key_NoEmp_label_diff,Term_main_key_NoEmp_label_med,Term_main_key_NoEmp_label_diff,DisbursementGross_main_key_LowDoc_label_med,DisbursementGross_main_key_LowDoc_label_diff,GrAppv_main_key_LowDoc_label_med,GrAppv_main_key_LowDoc_label_diff,SBA_Appv_main_key_LowDoc_label_med,SBA_Appv_main_key_LowDoc_label_diff,CreateJob_main_key_LowDoc_label_med,CreateJob_main_key_LowDoc_label_diff,RetainedJob_main_key_LowDoc_label_med,RetainedJob_main_key_LowDoc_label_diff,NoEmp_main_key_LowDoc_label_med,NoEmp_main_key_LowDoc_label_diff,Term_main_key_LowDoc_label_med,Term_main_key_LowDoc_label_diff,DisbursementGross_main_key_RevLineCr_label_med,DisbursementGross_main_key_RevLineCr_label_diff,GrAppv_main_key_RevLineCr_label_med,GrAppv_main_key_RevLineCr_label_diff,SBA_Appv_main_key_RevLineCr_label_med,SBA_Appv_main_key_RevLineCr_label_diff,CreateJob_main_key_RevLineCr_label_med,CreateJob_main_key_RevLineCr_label_diff,RetainedJob_main_key_RevLineCr_label_med,RetainedJob_main_key_RevLineCr_label_diff,NoEmp_main_key_RevLineCr_label_med,NoEmp_main_key_RevLineCr_label_diff,Term_main_key_RevLineCr_label_med,Term_main_key_RevLineCr_label_diff,DisbursementGross_main_key_State_med,DisbursementGross_main_key_State_diff,GrAppv_main_key_State_med,GrAppv_main_key_State_diff,SBA_Appv_main_key_State_med,SBA_Appv_main_key_State_diff,CreateJob_main_key_State_med,CreateJob_main_key_State_diff,RetainedJob_main_key_State_med,RetainedJob_main_key_State_diff,NoEmp_main_key_State_med,NoEmp_main_key_State_diff,Term_main_key_State_med,Term_main_key_State_diff,DisbursementGross_main_key_City_med,DisbursementGross_main_key_City_diff,GrAppv_main_key_City_med,GrAppv_main_key_City_diff,SBA_Appv_main_key_City_med,SBA_Appv_main_key_City_diff,CreateJob_main_key_City_med,CreateJob_main_key_City_diff,RetainedJob_main_key_City_med,RetainedJob_main_key_City_diff,NoEmp_main_key_City_med,NoEmp_main_key_City_diff,Term_main_key_City_med,Term_main_key_City_diff
0,0,163,21,1.0,0,0,1,N,N,1998-01-31,1,0,2006-09-22,2006,PHOENIX,AZ,SD,80000,80000,68000,0,1,3,1857,3,41,4,4_0,3_0,1_0,AZ_0,PHOENIX_0,100000.0,-20000.0,100000.0,-20000.0,82500.0,-14500.0,0.0,0.0,0.0,0.0,19.0,2.0,163.0,0.0,100000.0,-20000.0,100000.0,-20000.0,75000.0,-7000.0,0.0,0.0,0.0,0.0,4.0,17.0,82.0,81.0,100000.0,-20000.0,100000.0,-20000.0,80000.0,-12000.0,0.0,0.0,0.0,0.0,4.0,17.0,83.0,80.0,80000.0,0.0,80000.0,0.0,68000.0,0.0,0.0,0.0,0.0,0.0,7.0,14.0,158.0,5.0,80000.0,0.0,80000.0,0.0,68000.0,0.0,0.0,0.0,0.0,0.0,10.0,11.0,163.0,0.0
1,1,84,6,1.0,4,0,0,0,N,1993-10-31,1,62,1992-06-30,1992,MCALESTER,OK,OK,287000,287000,229600,0,0,3,1451,36,36,3,3_62,3_62,0_62,XX,XX,121000.0,166000.0,100000.0,187000.0,80000.0,149600.0,0.0,4.0,0.0,0.0,6.0,0.0,83.0,1.0,138300.0,148700.0,129000.0,158000.0,105335.0,124265.0,0.0,4.0,0.0,0.0,5.0,1.0,83.0,1.0,100000.0,187000.0,96033.0,190967.0,64000.0,165600.0,0.0,4.0,0.0,0.0,3.0,3.0,82.0,2.0,99000.0,188000.0,82500.0,204500.0,56270.0,173330.0,0.0,4.0,0.0,0.0,4.0,2.0,82.0,2.0,100000.0,187000.0,85000.0,202000.0,59500.0,170100.0,0.0,4.0,0.0,0.0,4.0,2.0,82.0,2.0
2,2,242,45,1.0,4,90,0,N,N,2001-08-31,1,42,2001-04-18,2001,HAWTHORNE,NJ,NJ,31983,30000,15000,1,1,3,1021,31,31,4,4_42,3_42,1_42,NJ_42,XX,121050.0,-89067.0,100000.0,-70000.0,65600.0,-50600.0,0.0,4.0,0.0,90.0,18.0,27.0,85.5,156.5,100000.0,-68017.0,86750.0,-56750.0,52600.0,-37600.0,0.0,4.0,0.0,90.0,5.0,40.0,82.0,160.0,120000.0,-88017.0,100000.0,-70000.0,75000.0,-60000.0,0.0,4.0,0.0,90.0,6.0,39.0,83.0,159.0,110000.0,-78017.0,100000.0,-70000.0,76500.0,-61500.0,0.0,4.0,0.0,90.0,12.0,33.0,119.0,123.0,100000.0,-68017.0,85000.0,-55000.0,59500.0,-44500.0,0.0,4.0,0.0,90.0,4.0,41.0,82.0,160.0
3,3,237,4,1.0,0,0,0,N,N,2007-08-31,1,33,2003-10-06,2004,NASHVILLE,TN,SD,229000,229000,229000,0,1,3,1610,42,41,2,2_33,3_33,1_33,TN_33,NASHVILLE_33,98000.0,131000.0,82050.0,146950.0,56000.0,173000.0,0.0,0.0,1.0,-1.0,4.0,0.0,73.0,164.0,94203.0,134797.0,75000.0,154000.0,50000.0,179000.0,0.0,0.0,1.0,-1.0,5.0,-1.0,82.0,155.0,107909.0,121091.0,100000.0,129000.0,64000.0,165000.0,0.0,0.0,0.0,0.0,6.0,-2.0,83.0,154.0,118000.0,111000.0,111600.0,117400.0,77925.0,151075.0,0.0,0.0,1.0,-1.0,5.5,-1.5,82.0,155.0,158300.0,70700.0,168300.0,60700.0,132000.0,97000.0,0.0,0.0,0.0,0.0,6.0,-2.0,83.0,154.0
4,4,184,0,1.0,0,0,0,N,N,1983-06-08,1,0,1999-12-17,2000,POMONA,CA,CA,525000,525000,393750,0,1,3,1893,4,4,1,1_0,3_0,1_0,CA_0,POMONA_0,80000.0,445000.0,80000.0,445000.0,68000.0,325750.0,0.0,0.0,0.0,0.0,1.0,-1.0,67.0,117.0,100000.0,425000.0,100000.0,425000.0,75000.0,318750.0,0.0,0.0,0.0,0.0,4.0,-4.0,82.0,102.0,100000.0,425000.0,100000.0,425000.0,80000.0,313750.0,0.0,0.0,0.0,0.0,4.0,-4.0,83.0,101.0,110000.0,415000.0,110000.0,415000.0,90000.0,303750.0,0.0,0.0,0.0,0.0,4.0,-4.0,81.0,103.0,275000.0,250000.0,275000.0,250000.0,243000.0,150750.0,0.0,0.0,0.0,0.0,3.0,-3.0,61.0,123.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42302,42302,283,14,1.0,0,0,1,N,N,1998-01-31,1,0,1995-03-02,1995,PHILADELPHIA,PA,PA,80000,80000,68000,0,1,3,1856,38,38,4,4_0,3_0,1_0,PA_0,PHILADELPHIA_0,100000.0,-20000.0,100000.0,-20000.0,82500.0,-14500.0,0.0,0.0,0.0,0.0,19.0,-5.0,163.0,120.0,100000.0,-20000.0,100000.0,-20000.0,75000.0,-7000.0,0.0,0.0,0.0,0.0,4.0,10.0,82.0,201.0,100000.0,-20000.0,100000.0,-20000.0,80000.0,-12000.0,0.0,0.0,0.0,0.0,4.0,10.0,83.0,200.0,80000.0,0.0,80000.0,0.0,64000.0,4000.0,0.0,0.0,0.0,0.0,4.0,10.0,112.5,170.5,80000.0,0.0,80000.0,0.0,68000.0,0.0,0.0,0.0,0.0,0.0,6.0,8.0,167.0,116.0
42303,42303,53,2,1.0,0,0,0,Y,N,1991-04-03,1,42,2007-06-06,2007,LOS ANGELES,CA,SD,5000,5000,4250,1,3,3,1349,4,41,1,1_42,3_42,3_42,CA_42,LOS ANGELES_42,61937.0,-56937.0,60000.0,-55000.0,50000.0,-45750.0,0.0,0.0,1.0,-1.0,1.0,1.0,81.0,-28.0,100000.0,-95000.0,86750.0,-81750.0,52600.0,-48350.0,0.0,0.0,0.0,0.0,5.0,-3.0,82.0,-29.0,50000.0,-45000.0,50000.0,-45000.0,25000.0,-20750.0,1.0,-1.0,2.0,-2.0,2.0,0.0,81.0,-28.0,80809.5,-75809.5,70000.0,-65000.0,48400.0,-44150.0,0.0,0.0,0.0,0.0,4.0,-2.0,81.0,-28.0,25000.0,-20000.0,25000.0,-20000.0,12500.0,-8250.0,8.0,-8.0,9.0,-9.0,2.0,0.0,80.5,-27.5
42304,42304,59,6,2.0,0,0,1,N,N,2003-02-28,1,42,2003-03-14,2003,COLUMBUS,OH,OH,60000,60000,51000,0,1,3,485,35,35,3,3_42,3_42,1_42,OH_42,XX,111500.0,-51500.0,99000.0,-39000.0,65600.0,-14600.0,0.0,0.0,0.0,0.0,6.0,0.0,82.0,-23.0,100000.0,-40000.0,86750.0,-26750.0,52600.0,-1600.0,0.0,0.0,0.0,0.0,5.0,1.0,82.0,-23.0,120000.0,-60000.0,100000.0,-40000.0,75000.0,-24000.0,0.0,0.0,0.0,0.0,6.0,0.0,83.0,-24.0,145000.0,-85000.0,131000.0,-71000.0,100000.0,-49000.0,0.0,0.0,0.0,0.0,5.0,1.0,83.0,-24.0,100000.0,-40000.0,85000.0,-25000.0,59500.0,-8500.0,0.0,0.0,0.0,0.0,4.0,2.0,82.0,-23.0
42305,42305,295,18,1.0,0,8,0,N,N,1997-12-10,1,42,1989-08-23,1989,CLOQUET,MN,MN,294000,294000,220500,0,1,3,458,23,23,4,4_42,3_42,1_42,MN_42,XX,121050.0,172950.0,100000.0,194000.0,65600.0,154900.0,0.0,0.0,0.0,8.0,18.0,0.0,85.5,209.5,100000.0,194000.0,86750.0,207250.0,52600.0,167900.0,0.0,0.0,0.0,8.0,5.0,13.0,82.0,213.0,120000.0,174000.0,100000.0,194000.0,75000.0,145500.0,0.0,0.0,0.0,8.0,6.0,12.0,83.0,212.0,130500.0,163500.0,105000.0,189000.0,91387.5,129112.5,0.0,0.0,0.0,8.0,5.5,12.5,83.0,212.0,100000.0,194000.0,85000.0,209000.0,59500.0,161000.0,0.0,0.0,0.0,8.0,4.0,14.0,82.0,213.0


In [168]:
test_df

Unnamed: 0.1,Unnamed: 0,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,RevLineCr,LowDoc,DisbursementDate,Sector,ApprovalDate,ApprovalFY,City,State,BankState,DisbursementGross,GrAppv,SBA_Appv,UrbanRural,RevLineCr_label,LowDoc_label,City_label,State_label,BankState_label,NoEmp_label,main_key_NoEmp_label,main_key_LowDoc_label,main_key_RevLineCr_label,main_key_State,main_key_City,DisbursementGross_main_key_NoEmp_label_med,DisbursementGross_main_key_NoEmp_label_diff,GrAppv_main_key_NoEmp_label_med,GrAppv_main_key_NoEmp_label_diff,SBA_Appv_main_key_NoEmp_label_med,SBA_Appv_main_key_NoEmp_label_diff,CreateJob_main_key_NoEmp_label_med,CreateJob_main_key_NoEmp_label_diff,RetainedJob_main_key_NoEmp_label_med,RetainedJob_main_key_NoEmp_label_diff,NoEmp_main_key_NoEmp_label_med,NoEmp_main_key_NoEmp_label_diff,Term_main_key_NoEmp_label_med,Term_main_key_NoEmp_label_diff,DisbursementGross_main_key_LowDoc_label_med,DisbursementGross_main_key_LowDoc_label_diff,GrAppv_main_key_LowDoc_label_med,GrAppv_main_key_LowDoc_label_diff,SBA_Appv_main_key_LowDoc_label_med,SBA_Appv_main_key_LowDoc_label_diff,CreateJob_main_key_LowDoc_label_med,CreateJob_main_key_LowDoc_label_diff,RetainedJob_main_key_LowDoc_label_med,RetainedJob_main_key_LowDoc_label_diff,NoEmp_main_key_LowDoc_label_med,NoEmp_main_key_LowDoc_label_diff,Term_main_key_LowDoc_label_med,Term_main_key_LowDoc_label_diff,DisbursementGross_main_key_RevLineCr_label_med,DisbursementGross_main_key_RevLineCr_label_diff,GrAppv_main_key_RevLineCr_label_med,GrAppv_main_key_RevLineCr_label_diff,SBA_Appv_main_key_RevLineCr_label_med,SBA_Appv_main_key_RevLineCr_label_diff,CreateJob_main_key_RevLineCr_label_med,CreateJob_main_key_RevLineCr_label_diff,RetainedJob_main_key_RevLineCr_label_med,RetainedJob_main_key_RevLineCr_label_diff,NoEmp_main_key_RevLineCr_label_med,NoEmp_main_key_RevLineCr_label_diff,Term_main_key_RevLineCr_label_med,Term_main_key_RevLineCr_label_diff,DisbursementGross_main_key_State_med,DisbursementGross_main_key_State_diff,GrAppv_main_key_State_med,GrAppv_main_key_State_diff,SBA_Appv_main_key_State_med,SBA_Appv_main_key_State_diff,CreateJob_main_key_State_med,CreateJob_main_key_State_diff,RetainedJob_main_key_State_med,RetainedJob_main_key_State_diff,NoEmp_main_key_State_med,NoEmp_main_key_State_diff,Term_main_key_State_med,Term_main_key_State_diff,DisbursementGross_main_key_City_med,DisbursementGross_main_key_City_diff,GrAppv_main_key_City_med,GrAppv_main_key_City_diff,SBA_Appv_main_key_City_med,SBA_Appv_main_key_City_diff,CreateJob_main_key_City_med,CreateJob_main_key_City_diff,RetainedJob_main_key_City_med,RetainedJob_main_key_City_diff,NoEmp_main_key_City_med,NoEmp_main_key_City_diff,Term_main_key_City_med,Term_main_key_City_diff
0,42307,5,2,1.0,1,0,0,T,N,2004-07-31,23,2007-08-06,2007,SUNNYVALE,CA,CA,25000,25000,21250,1,2,3,2308,4,4,1,1_23,3_23,2_23,CA_23,XX,65000.0,-40000.0,60000.0,-35000.0,40705.0,-19455.0,0.0,1.0,1.0,-1.0,1.0,1.0,81.0,-76.0,80000.0,-55000.0,65000.0,-40000.0,48000.0,-26750.0,0.0,1.0,1.0,-1.0,4.0,-2.0,81.0,-76.0,75000.0,-50000.0,75000.0,-50000.0,48000.0,-26750.0,1.0,0.0,1.0,-1.0,2.0,0.0,56.0,-51.0,70000.0,-45000.0,62200.0,-37200.0,43375.0,-22125.0,0.0,1.0,0.0,0.0,3.0,-1.0,81.0,-76.0,98500.0,-73500.0,83000.0,-58000.0,57600.0,-36350.0,0.0,1.0,0.0,0.0,4.0,-2.0,82.0,-77.0
1,42308,235,13,1.0,9,14,77725,Y,N,1995-01-11,44,2004-03-08,2004,PITTSBURGH,PA,PA,15000,15000,7500,0,3,3,1862,38,38,4,4_44,3_44,3_44,PA_44,XX,130150.0,-115150.0,108100.0,-93100.0,84480.0,-76980.0,0.0,9.0,0.0,14.0,18.0,-5.0,92.5,142.5,100000.0,-85000.0,90000.0,-75000.0,51000.0,-43500.0,0.0,9.0,1.0,13.0,4.0,9.0,81.0,154.0,50000.0,-35000.0,50000.0,-35000.0,25000.0,-17500.0,1.0,8.0,3.0,11.0,2.0,11.0,75.5,159.5,120600.0,-105600.0,100900.0,-85900.0,76000.0,-68500.0,0.0,9.0,0.0,14.0,4.0,9.0,70.0,165.0,98500.0,-83500.0,83000.0,-68000.0,57600.0,-50100.0,0.0,9.0,0.0,14.0,4.0,9.0,82.0,153.0
2,42309,31,5,2.0,0,0,0,N,,NaT,56,2007-02-27,2007,LITTLE ROCK,AR,AR,28000,28000,23800,1,1,6,1311,2,2,3,3_56,6_56,1_56,XX,XX,119300.0,-91300.0,100000.0,-72000.0,83678.0,-59878.0,0.0,0.0,0.0,0.0,6.0,-1.0,80.5,-49.5,75000.0,-47000.0,75000.0,-47000.0,56250.0,-32450.0,0.0,0.0,0.0,0.0,1.0,4.0,82.0,-51.0,100000.0,-72000.0,80000.0,-52000.0,54400.0,-30600.0,0.0,0.0,0.0,0.0,5.0,0.0,82.0,-51.0,96200.0,-68200.0,80000.0,-52000.0,56250.0,-32450.0,0.0,0.0,0.0,0.0,4.0,1.0,82.0,-51.0,98500.0,-70500.0,83000.0,-55000.0,57600.0,-33800.0,0.0,0.0,0.0,0.0,4.0,1.0,82.0,-51.0
3,42310,120,4,1.0,0,1,0,Y,N,1999-04-30,62,1997-12-19,1998,LITTLE ROCK,AR,AR,7500,7500,6375,2,3,3,1311,2,2,2,2_62,3_62,3_62,XX,XX,100000.0,-92500.0,98500.0,-91000.0,64000.0,-57625.0,0.0,0.0,0.0,1.0,3.0,1.0,82.0,38.0,143500.0,-136000.0,129000.0,-121500.0,106250.0,-99875.0,0.0,0.0,0.0,1.0,5.0,-1.0,83.0,37.0,120600.0,-113100.0,100000.0,-92500.0,75000.0,-68625.0,0.0,0.0,2.0,-1.0,2.0,2.0,64.0,56.0,96200.0,-88700.0,80000.0,-72500.0,56250.0,-49875.0,0.0,0.0,0.0,1.0,4.0,0.0,82.0,38.0,98500.0,-91000.0,83000.0,-75500.0,57600.0,-51225.0,0.0,0.0,0.0,1.0,4.0,0.0,82.0,38.0
4,42311,63,13,1.0,0,8,1,N,N,2005-12-31,42,2009-07-10,2009,Louisville,KY,DE,91000,93000,93000,0,1,3,1374,17,8,4,4_42,3_42,1_42,XX,XX,111600.0,-20600.0,84500.0,8500.0,54900.0,38100.0,0.0,0.0,0.0,8.0,18.0,-5.0,85.0,-22.0,96952.5,-5952.5,76450.0,16550.0,51000.0,42000.0,0.0,0.0,1.0,7.0,5.0,8.0,82.0,-19.0,110000.0,-19000.0,100000.0,-7000.0,72000.0,21000.0,0.0,0.0,0.0,8.0,6.0,7.0,83.0,-20.0,96200.0,-5200.0,80000.0,13000.0,56250.0,36750.0,0.0,0.0,0.0,8.0,4.0,9.0,82.0,-19.0,98500.0,-7500.0,83000.0,10000.0,57600.0,35400.0,0.0,0.0,0.0,8.0,4.0,9.0,82.0,-19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42303,84610,243,10,1.0,3,14,0,N,N,2012-12-01,42,2012-04-23,2012,FT. WRIGHT,KY,OH,390000,150000,127500,0,1,3,850,17,35,3,3_42,3_42,1_42,XX,XX,100000.0,290000.0,88320.0,61680.0,58320.0,69180.0,0.0,3.0,0.0,14.0,6.0,4.0,82.0,161.0,96952.5,293047.5,76450.0,73550.0,51000.0,76500.0,0.0,3.0,1.0,13.0,5.0,5.0,82.0,161.0,110000.0,280000.0,100000.0,50000.0,72000.0,55500.0,0.0,3.0,0.0,14.0,6.0,4.0,83.0,160.0,96200.0,293800.0,80000.0,70000.0,56250.0,71250.0,0.0,3.0,0.0,14.0,4.0,6.0,82.0,161.0,98500.0,291500.0,83000.0,67000.0,57600.0,69900.0,0.0,3.0,0.0,14.0,4.0,6.0,82.0,161.0
42304,84611,178,0,2.0,0,0,1,N,N,2003-11-30,0,2006-10-27,2007,PHILADELPHIA,PA,RI,100000,100000,90000,0,1,3,1848,38,40,1,1_0,3_0,1_0,PA_0,PHILADELPHIA_0,80000.0,20000.0,80000.0,20000.0,68000.0,22000.0,0.0,0.0,0.0,0.0,1.0,-1.0,62.0,116.0,100000.0,0.0,100000.0,0.0,80000.0,10000.0,0.0,0.0,0.0,0.0,4.0,-4.0,82.0,96.0,100000.0,0.0,100000.0,0.0,76500.0,13500.0,0.0,0.0,0.0,0.0,4.0,-4.0,82.0,96.0,75000.0,25000.0,80000.0,20000.0,64000.0,26000.0,0.0,0.0,0.0,0.0,4.0,-4.0,83.0,95.0,80000.0,20000.0,80000.0,20000.0,68000.0,22000.0,0.0,0.0,0.0,0.0,5.0,-5.0,164.0,14.0
42305,84612,42,1,2.0,3,9,0,Y,N,2009-02-28,33,1989-09-21,1989,ELMHURST,IL,IL,17000,17000,13600,0,3,3,712,14,14,1,1_33,3_33,3_33,IL_33,XX,50000.0,-33000.0,50000.0,-33000.0,25000.0,-11400.0,0.0,3.0,1.0,8.0,1.0,0.0,62.0,-20.0,90962.0,-73962.0,75000.0,-58000.0,50000.0,-36400.0,0.0,3.0,1.0,8.0,5.0,-4.0,82.0,-40.0,50000.0,-33000.0,50000.0,-33000.0,25000.0,-11400.0,1.0,2.0,3.0,6.0,2.0,-1.0,64.0,-22.0,105000.0,-88000.0,98066.0,-81066.0,51000.0,-37400.0,0.0,3.0,1.0,8.0,6.0,-5.0,82.0,-40.0,98500.0,-81500.0,83000.0,-66000.0,57600.0,-44000.0,0.0,3.0,0.0,9.0,4.0,-3.0,82.0,-40.0
42306,84613,76,15,1.0,0,0,0,N,N,2008-01-31,0,2006-04-03,2006,NASHVILLE,TN,TN,7500,7500,6375,0,1,3,1599,42,43,4,4_0,3_0,1_0,TN_0,NASHVILLE_0,100000.0,-92500.0,100000.0,-92500.0,80000.0,-73625.0,0.0,0.0,0.0,0.0,19.0,-4.0,121.0,-45.0,100000.0,-92500.0,100000.0,-92500.0,80000.0,-73625.0,0.0,0.0,0.0,0.0,4.0,11.0,82.0,-6.0,100000.0,-92500.0,100000.0,-92500.0,76500.0,-70125.0,0.0,0.0,0.0,0.0,4.0,11.0,82.0,-6.0,180000.0,-172500.0,180000.0,-172500.0,135000.0,-128625.0,0.0,0.0,0.0,0.0,5.0,10.0,121.0,-45.0,275000.0,-267500.0,275000.0,-267500.0,206250.0,-199875.0,0.0,0.0,0.0,0.0,4.0,11.0,120.0,-44.0


In [169]:
def create_svd(col,train_df,test_df,feature):
    svd=TruncatedSVD(n_components=len(col)//5,random_state=1)
    tmp_train_df=train_df[col]
    tmp_test_df=test_df[col]
#     df=pd.concat([tmp_train_df,tmp_test_df])
    df=tmp_train_df.copy()
    ss=StandardScaler()
    df[col]=ss.fit_transform(df[col])
    tmp_train_df[col]=ss.transform(tmp_train_df[col])
    tmp_test_df[col]=ss.transform(tmp_test_df[col])
    svd.fit(df)
    col_name='svd'
    for c in col:
        c=f'{c}'
        col_name+=c
    tmp_train_df=svd.transform(tmp_train_df)
    tmp_test_df=svd.transform(tmp_test_df)

    tmp_train_df=pd.DataFrame(tmp_train_df)
    tmp_test_df=pd.DataFrame(tmp_test_df)
    tmp_train_df = tmp_train_df.add_prefix(f'svd_{feature}_')
    tmp_test_df = tmp_test_df.add_prefix(f'svd_{feature}_')
    train_df=pd.concat([train_df,tmp_train_df],axis=1)
    test_df=pd.concat([test_df,tmp_test_df],axis=1)
    return train_df,test_df
def create_nmf(col,train_df,test_df,feature):
    nmf=NMF(n_components=len(col)//5,random_state=1)
#     nmf=NMF(n_components=s,random_state=1)
    tmp_train_df=train_df[col]
    tmp_test_df=test_df[col]
#     df=pd.concat([tmp_train_df,tmp_test_df])
    df=tmp_train_df.copy()
    mm=MinMaxScaler(feature_range=(0, 100),clip=True)
    ss=StandardScaler()
    df[col]=ss.fit_transform(df[col])
    tmp_train_df[col]=ss.transform(tmp_train_df[col])
    tmp_test_df[col]=ss.transform(tmp_test_df[col])
    df[col]=mm.fit_transform(df[col])
    tmp_train_df[col]=mm.transform(tmp_train_df[col])
    tmp_test_df[col]=mm.transform(tmp_test_df[col])
    nmf.fit(df)
    col_name='nmf'
    for c in col:
        c=f'{c}'
        col_name+=c
    tmp_train_df=nmf.transform(tmp_train_df)
    tmp_test_df=nmf.transform(tmp_test_df)
    
    tmp_train_df=pd.DataFrame(tmp_train_df)
    tmp_test_df=pd.DataFrame(tmp_test_df)
    tmp_train_df = tmp_train_df.add_prefix(f'nmf_{feature}_')
    tmp_test_df = tmp_test_df.add_prefix(f'nmf_{feature}_')
    train_df=pd.concat([train_df,tmp_train_df],axis=1)
    test_df=pd.concat([test_df,tmp_test_df],axis=1)
    return train_df,test_df

In [217]:
from gensim.models import Word2Vec

In [242]:
from gensim.models import KeyedVectors

In [218]:
# # word 2 vec 次元圧縮
# w2v = Word2Vec.load(os.path.join("../output/", "word2vec.gensim.model"))
# # word 2 vec 類似語検索
# w2v = Word2Vec.load(os.path.join("../output/", "GoogleNews-vectors-negative300.bin.gz"))
# model = KeyedVectors.load_word2vec_format("../output/GoogleNews-vectors-negative300.bin.gz", binary=True)
# model = Word2Vec(train["City"],  window=5, min_count=10, sg=1)
# model.save("../output/city.model")
# model = Word2Vec.load("../output/city.model")
# def word2vec(word):
#     try:
#         return w2v.wv[word]
#     except KeyError:
#         return "XXX"
# w2v_embeddings = train_df["City"].map(word2vec)
# city_vector_df = pd.DataFrame(w2v_embeddings.values.tolist(), index=train_df["City"])
# city_vector_df 

In [170]:
# One hot encoding 次元圧縮

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='infrequent_if_exist',min_frequency=100,max_categories=100,sparse_output=False)
train_df_city=enc.fit_transform(train_df[['City']])
test_df_city=enc.transform(test_df[['City']])
train_df_city=pd.DataFrame(train_df_city).add_prefix('City_col')
test_df_city=pd.DataFrame(test_df_city).add_prefix('City_col')
col_list=train_df_city.columns
train_df=pd.concat([train_df,train_df_city],axis=1)
test_df=pd.concat([test_df,test_df_city],axis=1)

train_df,test_df = create_svd(col_list,train_df,test_df,'City')
train_df,test_df = create_nmf(col_list,train_df,test_df,'City')
train_df.drop(col_list,axis=1,inplace=True)
test_df.drop(col_list,axis=1,inplace=True)

In [171]:
train_df

Unnamed: 0.1,Unnamed: 0,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,RevLineCr,LowDoc,DisbursementDate,MIS_Status,Sector,ApprovalDate,ApprovalFY,City,State,BankState,DisbursementGross,GrAppv,SBA_Appv,UrbanRural,RevLineCr_label,LowDoc_label,City_label,State_label,BankState_label,NoEmp_label,main_key_NoEmp_label,main_key_LowDoc_label,main_key_RevLineCr_label,main_key_State,main_key_City,DisbursementGross_main_key_NoEmp_label_med,DisbursementGross_main_key_NoEmp_label_diff,GrAppv_main_key_NoEmp_label_med,GrAppv_main_key_NoEmp_label_diff,SBA_Appv_main_key_NoEmp_label_med,SBA_Appv_main_key_NoEmp_label_diff,CreateJob_main_key_NoEmp_label_med,CreateJob_main_key_NoEmp_label_diff,RetainedJob_main_key_NoEmp_label_med,RetainedJob_main_key_NoEmp_label_diff,NoEmp_main_key_NoEmp_label_med,NoEmp_main_key_NoEmp_label_diff,Term_main_key_NoEmp_label_med,Term_main_key_NoEmp_label_diff,DisbursementGross_main_key_LowDoc_label_med,DisbursementGross_main_key_LowDoc_label_diff,GrAppv_main_key_LowDoc_label_med,GrAppv_main_key_LowDoc_label_diff,SBA_Appv_main_key_LowDoc_label_med,SBA_Appv_main_key_LowDoc_label_diff,CreateJob_main_key_LowDoc_label_med,CreateJob_main_key_LowDoc_label_diff,RetainedJob_main_key_LowDoc_label_med,RetainedJob_main_key_LowDoc_label_diff,NoEmp_main_key_LowDoc_label_med,NoEmp_main_key_LowDoc_label_diff,Term_main_key_LowDoc_label_med,Term_main_key_LowDoc_label_diff,DisbursementGross_main_key_RevLineCr_label_med,DisbursementGross_main_key_RevLineCr_label_diff,GrAppv_main_key_RevLineCr_label_med,GrAppv_main_key_RevLineCr_label_diff,SBA_Appv_main_key_RevLineCr_label_med,SBA_Appv_main_key_RevLineCr_label_diff,CreateJob_main_key_RevLineCr_label_med,CreateJob_main_key_RevLineCr_label_diff,RetainedJob_main_key_RevLineCr_label_med,RetainedJob_main_key_RevLineCr_label_diff,NoEmp_main_key_RevLineCr_label_med,NoEmp_main_key_RevLineCr_label_diff,Term_main_key_RevLineCr_label_med,Term_main_key_RevLineCr_label_diff,DisbursementGross_main_key_State_med,DisbursementGross_main_key_State_diff,GrAppv_main_key_State_med,GrAppv_main_key_State_diff,SBA_Appv_main_key_State_med,SBA_Appv_main_key_State_diff,CreateJob_main_key_State_med,CreateJob_main_key_State_diff,RetainedJob_main_key_State_med,RetainedJob_main_key_State_diff,NoEmp_main_key_State_med,NoEmp_main_key_State_diff,Term_main_key_State_med,Term_main_key_State_diff,DisbursementGross_main_key_City_med,DisbursementGross_main_key_City_diff,GrAppv_main_key_City_med,GrAppv_main_key_City_diff,SBA_Appv_main_key_City_med,SBA_Appv_main_key_City_diff,CreateJob_main_key_City_med,CreateJob_main_key_City_diff,RetainedJob_main_key_City_med,RetainedJob_main_key_City_diff,NoEmp_main_key_City_med,NoEmp_main_key_City_diff,Term_main_key_City_med,Term_main_key_City_diff,svd_City_truncatedsvd0,svd_City_truncatedsvd1,svd_City_truncatedsvd2,svd_City_truncatedsvd3,svd_City_truncatedsvd4,svd_City_truncatedsvd5,svd_City_truncatedsvd6,svd_City_truncatedsvd7,svd_City_truncatedsvd8,svd_City_truncatedsvd9,svd_City_truncatedsvd10,svd_City_truncatedsvd11,svd_City_truncatedsvd12,svd_City_truncatedsvd13,svd_City_truncatedsvd14,svd_City_truncatedsvd15,svd_City_truncatedsvd16,nmf_City_nmf0,nmf_City_nmf1,nmf_City_nmf2,nmf_City_nmf3,nmf_City_nmf4,nmf_City_nmf5,nmf_City_nmf6,nmf_City_nmf7,nmf_City_nmf8,nmf_City_nmf9,nmf_City_nmf10,nmf_City_nmf11,nmf_City_nmf12,nmf_City_nmf13,nmf_City_nmf14,nmf_City_nmf15,nmf_City_nmf16
0,0,163,21,1.0,0,0,1,N,N,1998-01-31,1,0,2006-09-22,2006,PHOENIX,AZ,SD,80000,80000,68000,0,1,3,1857,3,41,4,4_0,3_0,1_0,AZ_0,PHOENIX_0,100000.0,-20000.0,100000.0,-20000.0,82500.0,-14500.0,0.0,0.0,0.0,0.0,19.0,2.0,163.0,0.0,100000.0,-20000.0,100000.0,-20000.0,75000.0,-7000.0,0.0,0.0,0.0,0.0,4.0,17.0,82.0,81.0,100000.0,-20000.0,100000.0,-20000.0,80000.0,-12000.0,0.0,0.0,0.0,0.0,4.0,17.0,83.0,80.0,80000.0,0.0,80000.0,0.0,68000.0,0.0,0.0,0.0,0.0,0.0,7.0,14.0,158.0,5.0,80000.0,0.0,80000.0,0.0,68000.0,0.0,0.0,0.0,0.0,0.0,10.0,11.0,163.0,0.0,1.237041,0.774666,-0.790986,-1.940700,1.061481,-0.358730,0.186792,-1.300819,1.683088,-2.209036,0.856589,-0.701709,-1.534264,-0.534186,-0.426118,-0.347151,1.842108,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000e+00,4.592016e-19,0.0,3.620079e-19,0.0,0.000000,0.0,0.0,0.0,0.523391
1,1,84,6,1.0,4,0,0,0,N,1993-10-31,1,62,1992-06-30,1992,MCALESTER,OK,OK,287000,287000,229600,0,0,3,1451,36,36,3,3_62,3_62,0_62,XX,XX,121000.0,166000.0,100000.0,187000.0,80000.0,149600.0,0.0,4.0,0.0,0.0,6.0,0.0,83.0,1.0,138300.0,148700.0,129000.0,158000.0,105335.0,124265.0,0.0,4.0,0.0,0.0,5.0,1.0,83.0,1.0,100000.0,187000.0,96033.0,190967.0,64000.0,165600.0,0.0,4.0,0.0,0.0,3.0,3.0,82.0,2.0,99000.0,188000.0,82500.0,204500.0,56270.0,173330.0,0.0,4.0,0.0,0.0,4.0,2.0,82.0,2.0,100000.0,187000.0,85000.0,202000.0,59500.0,170100.0,0.0,4.0,0.0,0.0,4.0,2.0,82.0,2.0,-1.270709,-0.006407,0.002430,0.000934,0.000142,0.000363,-0.000265,0.001184,0.000723,0.000629,0.000655,-0.001871,-0.001692,0.000734,0.000012,-0.000391,-0.000181,0.499588,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.0,0.0,0.0,0.000000
2,2,242,45,1.0,4,90,0,N,N,2001-08-31,1,42,2001-04-18,2001,HAWTHORNE,NJ,NJ,31983,30000,15000,1,1,3,1021,31,31,4,4_42,3_42,1_42,NJ_42,XX,121050.0,-89067.0,100000.0,-70000.0,65600.0,-50600.0,0.0,4.0,0.0,90.0,18.0,27.0,85.5,156.5,100000.0,-68017.0,86750.0,-56750.0,52600.0,-37600.0,0.0,4.0,0.0,90.0,5.0,40.0,82.0,160.0,120000.0,-88017.0,100000.0,-70000.0,75000.0,-60000.0,0.0,4.0,0.0,90.0,6.0,39.0,83.0,159.0,110000.0,-78017.0,100000.0,-70000.0,76500.0,-61500.0,0.0,4.0,0.0,90.0,12.0,33.0,119.0,123.0,100000.0,-68017.0,85000.0,-55000.0,59500.0,-44500.0,0.0,4.0,0.0,90.0,4.0,41.0,82.0,160.0,-1.270709,-0.006407,0.002430,0.000934,0.000142,0.000363,-0.000265,0.001184,0.000723,0.000629,0.000655,-0.001871,-0.001692,0.000734,0.000012,-0.000391,-0.000181,0.499588,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.0,0.0,0.0,0.000000
3,3,237,4,1.0,0,0,0,N,N,2007-08-31,1,33,2003-10-06,2004,NASHVILLE,TN,SD,229000,229000,229000,0,1,3,1610,42,41,2,2_33,3_33,1_33,TN_33,NASHVILLE_33,98000.0,131000.0,82050.0,146950.0,56000.0,173000.0,0.0,0.0,1.0,-1.0,4.0,0.0,73.0,164.0,94203.0,134797.0,75000.0,154000.0,50000.0,179000.0,0.0,0.0,1.0,-1.0,5.0,-1.0,82.0,155.0,107909.0,121091.0,100000.0,129000.0,64000.0,165000.0,0.0,0.0,0.0,0.0,6.0,-2.0,83.0,154.0,118000.0,111000.0,111600.0,117400.0,77925.0,151075.0,0.0,0.0,1.0,-1.0,5.5,-1.5,82.0,155.0,158300.0,70700.0,168300.0,60700.0,132000.0,97000.0,0.0,0.0,0.0,0.0,6.0,-2.0,83.0,154.0,1.062325,-0.182877,1.031850,0.708203,2.402452,-1.347394,-2.338421,0.141286,0.488348,1.536110,0.654663,1.140083,-0.455242,0.558266,-0.377918,-0.495054,-0.198883,0.000000,0.0,0.0,0.0,0.000000,1.604582,0.000000,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.0,0.0,0.0,0.000000
4,4,184,0,1.0,0,0,0,N,N,1983-06-08,1,0,1999-12-17,2000,POMONA,CA,CA,525000,525000,393750,0,1,3,1893,4,4,1,1_0,3_0,1_0,CA_0,POMONA_0,80000.0,445000.0,80000.0,445000.0,68000.0,325750.0,0.0,0.0,0.0,0.0,1.0,-1.0,67.0,117.0,100000.0,425000.0,100000.0,425000.0,75000.0,318750.0,0.0,0.0,0.0,0.0,4.0,-4.0,82.0,102.0,100000.0,425000.0,100000.0,425000.0,80000.0,313750.0,0.0,0.0,0.0,0.0,4.0,-4.0,83.0,101.0,110000.0,415000.0,110000.0,415000.0,90000.0,303750.0,0.0,0.0,0.0,0.0,4.0,-4.0,81.0,103.0,275000.0,250000.0,275000.0,250000.0,243000.0,150750.0,0.0,0.0,0.0,0.0,3.0,-3.0,61.0,123.0,1.204653,0.054647,0.637403,-3.105276,-0.592963,-1.512547,2.075491,1.075297,0.250982,0.618715,1.149545,-0.058385,-0.516732,1.305988,0.082224,-0.138990,-0.813333,0.000000,0.0,0.0,0.0,0.000000,0.000000,1.920928,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42302,42302,283,14,1.0,0,0,1,N,N,1998-01-31,1,0,1995-03-02,1995,PHILADELPHIA,PA,PA,80000,80000,68000,0,1,3,1856,38,38,4,4_0,3_0,1_0,PA_0,PHILADELPHIA_0,100000.0,-20000.0,100000.0,-20000.0,82500.0,-14500.0,0.0,0.0,0.0,0.0,19.0,-5.0,163.0,120.0,100000.0,-20000.0,100000.0,-20000.0,75000.0,-7000.0,0.0,0.0,0.0,0.0,4.0,10.0,82.0,201.0,100000.0,-20000.0,100000.0,-20000.0,80000.0,-12000.0,0.0,0.0,0.0,0.0,4.0,10.0,83.0,200.0,80000.0,0.0,80000.0,0.0,64000.0,4000.0,0.0,0.0,0.0,0.0,4.0,10.0,112.5,170.5,80000.0,0.0,80000.0,0.0,68000.0,0.0,0.0,0.0,0.0,0.0,6.0,8.0,167.0,116.0,1.191595,2.532730,0.166703,0.714481,-1.075569,-0.085102,-1.637467,1.996267,-0.206078,0.209960,0.076828,-0.928266,-1.293693,-0.240745,1.209944,0.291667,-0.478068,0.000000,0.0,0.0,0.0,1.678998,0.000000,0.000000,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.0,0.0,0.0,0.000000
42303,42303,53,2,1.0,0,0,0,Y,N,1991-04-03,1,42,2007-06-06,2007,LOS ANGELES,CA,SD,5000,5000,4250,1,3,3,1349,4,41,1,1_42,3_42,3_42,CA_42,LOS ANGELES_42,61937.0,-56937.0,60000.0,-55000.0,50000.0,-45750.0,0.0,0.0,1.0,-1.0,1.0,1.0,81.0,-28.0,100000.0,-95000.0,86750.0,-81750.0,52600.0,-48350.0,0.0,0.0,0.0,0.0,5.0,-3.0,82.0,-29.0,50000.0,-45000.0,50000.0,-45000.0,25000.0,-20750.0,1.0,-1.0,2.0,-2.0,2.0,0.0,81.0,-28.0,80809.5,-75809.5,70000.0,-65000.0,48400.0,-44150.0,0.0,0.0,0.0,0.0,4.0,-2.0,81.0,-28.0,25000.0,-20000.0,25000.0,-20000.0,12500.0,-8250.0,8.0,-8.0,9.0,-9.0,2.0,0.0,80.5,-27.5,1.378330,0.896250,0.599111,-1.065045,-0.651233,1.014040,1.277602,-0.722188,1.262367,0.302978,4.856975,-1.221632,-1.329822,-0.494135,-0.555777,0.340680,-0.039382,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.0,0.0,0.0,2.563559
42304,42304,59,6,2.0,0,0,1,N,N,2003-02-28,1,42,2003-03-14,2003,COLUMBUS,OH,OH,60000,60000,51000,0,1,3,485,35,35,3,3_42,3_42,1_42,OH_42,XX,111500.0,-51500.0,99000.0,-39000.0,65600.0,-14600.0,0.0,0.0,0.0,0.0,6.0,0.0,82.0,-23.0,100000.0,-40000.0,86750.0,-26750.0,52600.0,-1600.0,0.0,0.0,0.0,0.0,5.0,1.0,82.0,-23.0,120000.0,-60000.0,100000.0,-40000.0,75000.0,-24000.0,0.0,0.0,0.0,0.0,6.0,0.0,83.0,-24.0,145000.0,-85000.0,131000.0,-71000.0,100000.0,-49000.0,0.0,0.0,0.0,0.0,5.0,1.0,83.0,-24.0,100000.0,-40000.0,85000.0,-25000.0,59500.0,-8500.0,0.0,0.0,0.0,0.0,4.0,2.0,82.0,-23.0,1.141482,1.057072,-0.722921,0.156288,-0.881434,-0.574716,1.349521,-0.055120,-0.189564,-0.643319,1.525537,-0.768710,-1.022489,-3.143918,-4.297614,0.539948,2.236142,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,3.118361e-22,0.000000e+00,0.0,0.000000e+00,0.0,0.000119,0.0,0.0,0.0,0.000000
42305,42305,295,18,1.0,0,8,0,N,N,1997-12-10,1,42,1989-08-23,1989,CLOQUET,MN,MN,294000,294000,220500,0,1,3,458,23,23,4,4_42,3_42,1_42,MN_42,XX,121050.0,172950.0,100000.0,194000.0,65600.0,154900.0,0.0,0.0,0.0,8.0,18.0,0.0,85.5,209.5,100000.0,194000.0,86750.0,207250.0,52600.0,167900.0,0.0,0.0,0.0,8.0,5.0,13.0,82.0,213.0,120000.0,174000.0,100000.0,194000.0,75000.0,145500.0,0.0,0.0,0.0,8.0,6.0,12.0,83.0,212.0,130500.0,163500.0,105000.0,189000.0,91387.5,129112.5,0.0,0.0,0.0,8.0,5.5,12.5,83.0,212.0,100000.0,194000.0,85000.0,209000.0,59500.0,161000.0,0.0,0.0,0.0,8.0,4.0,14.0,82.0,213.0,-1.270709,-0.006407,0.002430,0.000934,0.000142,0.000363,-0.000265,0.001184,0.000723,0.000629,0.000655,-0.001871,-0.001692,0.000734,0.000012,-0.000391,-0.000181,0.499588,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.0,0.0,0.0,0.000000


In [172]:
del c,dataFrame,eda_df,eda_df_02,test_df_city,train_df_city
gc.collect()

53

# Create Model

In [173]:
train_df.columns

Index(['Unnamed: 0', 'Term', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob',
       'FranchiseCode', 'RevLineCr', 'LowDoc', 'DisbursementDate',
       ...
       'nmf_City_nmf7', 'nmf_City_nmf8', 'nmf_City_nmf9', 'nmf_City_nmf10',
       'nmf_City_nmf11', 'nmf_City_nmf12', 'nmf_City_nmf13', 'nmf_City_nmf14',
       'nmf_City_nmf15', 'nmf_City_nmf16'],
      dtype='object', length=136)

In [174]:
drop_list = list(train_df.select_dtypes("object").columns)
drop_list.extend(train_df.select_dtypes("datetime").columns)
drop_list

['RevLineCr',
 'LowDoc',
 'City',
 'State',
 'BankState',
 'main_key_NoEmp_label',
 'main_key_LowDoc_label',
 'main_key_RevLineCr_label',
 'main_key_State',
 'main_key_City',
 'DisbursementDate',
 'ApprovalDate']

In [175]:
target_encode_list = ["State","BankState"]
for c in target_encode_list:
    drop_list.remove(c)

In [176]:
drop_list

['RevLineCr',
 'LowDoc',
 'City',
 'main_key_NoEmp_label',
 'main_key_LowDoc_label',
 'main_key_RevLineCr_label',
 'main_key_State',
 'main_key_City',
 'DisbursementDate',
 'ApprovalDate']

In [177]:
drop_list_other = ["Unnamed: 0","State_label","BankState_label","MIS_Status"]
[drop_list.append(c) for c in drop_list_other]

[None, None, None, None]

In [178]:
drop_list

['RevLineCr',
 'LowDoc',
 'City',
 'main_key_NoEmp_label',
 'main_key_LowDoc_label',
 'main_key_RevLineCr_label',
 'main_key_State',
 'main_key_City',
 'DisbursementDate',
 'ApprovalDate',
 'Unnamed: 0',
 'State_label',
 'BankState_label',
 'MIS_Status']

In [179]:
from sklearn.metrics import log_loss
##
def balanced_log_loss(y_true, y_pred):
    # y_true: correct labels 0, 1
    # y_pred: predicted probabilities of class=1
    # calculate the number of observations for each class
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    # calculate the weights for each class to balance classes
    w_0 = 1 / N_0
    w_1 = 1 / N_1
    # calculate the predicted probabilities for each class
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    # calculate the summed log loss for each class
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))
    log_loss_1 = -np.sum(y_true * np.log(p_1))
    # calculate the weighted summed logarithmic loss
    # (factgor of 2 included to give same result as LL with balanced input)
    balanced_log_loss = 2*(w_0 * log_loss_0 + w_1 * log_loss_1) / (w_0 + w_1)
    # return the average log loss
    return balanced_log_loss/(N_0+N_1)
##
def objective(train_x,train_y,test_x,test_y,valid_index,train_df,trial):
        d_train=lgb.Dataset(train_x,label=train_y)
        d_valid=lgb.Dataset(test_x,label=test_y)
        param = {
        'objective': 'binary',#trial.suggest_categorical("objective",['binary', 'tweedie']),'num_class': 3
        'metric': 'binary_logloss',
        'learning_rate': 0.003,#trial.suggest_loguniform('learning_rate',0.006),
        'boosting':'gbdt',
#         'is_unbalance':True,
#         'scale_pos_weight':trial.suggest_loguniform('scale_pos_weight', 1,5),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#         'max_depth':trial.suggest_int('max_depth', 1, 10),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 2, 100),
        'early_stopping_rounds':500,
        'seed':1,'verbose' : -1,'num_boost_round':100000,
        'tree_learner':trial.suggest_categorical("tree_learner",['serial', 'feature', 'data', 'voting']),
        # 'data_sample_strategy':trial.suggest_categorical("data_sample_strategy",['goss', 'bagging'])
    }
 
        gbm = lgb.train(param,d_train,valid_sets = [d_train, d_valid],callbacks=[lgb.early_stopping(stopping_rounds=500, verbose=True), lgb.log_evaluation(300)])
        preds = gbm.predict(test_x)
        
#         oof_predictions = np.zeros([len(train_df),2])
#         oof_predictions[valid_index]=preds
#         y_true=np.zeros([len(test_y),3])
#         y_true[:, 0]=np.where(test_y==0,1,0)
#         y_true[:, 1]=np.where(test_y==1,1,0)
#         y_true[:, 2]=np.where(test_y==2,1,0)
#         print(gbm.predict(test_x))
#         print(preds[:,0])
#         print(y_true[:,0])
#         print(preds.shape)
        score=log_loss(test_y,preds)
#         score=log_loss(test_y,preds)
        return score

In [181]:
mlflow.set_tracking_uri('./mlruns/')
mlflow.start_run(description=hash)


<ActiveRun: >

In [182]:
target_encode_list = ["State","BankState"]

In [183]:
drop_list.append("UrbanRural")
features=[c for c in train_df.columns if c not in drop_list]
# if CFG.model=='lgb':
import lightgbm as lgb
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import KFold
importance_df=pd.DataFrame(index=train_df[features].columns)
importance_df['importance']=0
fold=5
predictions=pd.DataFrame()
predict=np.zeros([len(test_df),])
oof_predictions = np.zeros([len(train_df),])
# Target Encoding for state and bank state test data  
for c in target_encode_list:
    tmp_df = pd.DataFrame({c:train_df[c],f"target_{c}":train_df["MIS_Status"]}) 
    target_mean = tmp_df.groupby(c)[f"target_{c}"].mean()
    test_df.loc[:,c] = test_df[c].map(target_mean)
    test_df[c] = test_df[c].astype(float)
    
for seeds in[1]:
    fold=5
    skf=MultilabelStratifiedKFold(n_splits=5,random_state=seeds,shuffle=True)
    selected=features.copy()
        
    for i ,(tr_index,va_index) in enumerate(skf.split(train_df,train_df[['MIS_Status','Sector','State','UrbanRural']])):


        tr_x,tr_y=train_df[selected].iloc[tr_index],train_df['MIS_Status'].iloc[tr_index]
        va_x,va_y=train_df[selected].iloc[va_index],train_df['MIS_Status'].iloc[va_index]

        # Target Encoding for state and bank state 
        for c in target_encode_list:
            tmp_df = pd.DataFrame({c:tr_x[c],f"target_{c}":tr_y}) 
            target_mean = tmp_df.groupby(c)[f"target_{c}"].mean()
            va_x.loc[:,c] = va_x[c].map(target_mean)

            tmp = np.repeat(np.nan, tr_x.shape[0])
            kf_encoding = KFold(n_splits=4, shuffle =True,random_state=seeds)

            for idx_1,idx_2 in kf_encoding.split(tr_x):
                target_mean = tmp_df.iloc[idx_1].groupby(c)[f"target_{c}"].mean()
                tmp[idx_2] = tr_x[c].iloc[idx_2].map(target_mean)
            tr_x.loc[:,c]=tmp

            tr_x[c] = tr_x[c].astype(float)
            va_x[c] = va_x[c].astype(float)
        d_train_df=lgb.Dataset(tr_x,label=tr_y)

        d_valid=lgb.Dataset(va_x,label=va_y)
        if CFG.train==False:
            param = {'objective': 'binary','metric': 'binary_logloss','learning_rate': 0.003,'boosting':'gbdt','scale_pos_weight': 1, 'lambda_l1': 1.8657458525452022, 'lambda_l2': 0.0043319561423214945, 'num_leaves': 20, 'feature_fraction': 0.23875565259514844, 'bagging_fraction': 0.7939326324321189, 'bagging_freq': 2, 'min_child_samples': 73, 'tree_learner': 'feature','early_stopping_rounds':500,
             'seed':seeds,'random_state':seeds,'data_random_seed':seeds,'feature_fraction_seed':seeds,'random_seed':seeds,'verbose' : -1,'num_boost_round':100000}

            gbm = lgb.train(param, d_train_df,valid_sets = [d_train_df, d_valid],callbacks=[lgb.early_stopping(stopping_rounds=500, verbose=True), lgb.log_evaluation(100)])
            preds=gbm.predict(va_x[selected])
            predict += gbm.predict(test_df[selected])/5
            oof_predictions[va_index]=preds
            importance_df['importance']+=gbm.feature_importance(importance_type='gain')/5
        else:
            if i==0:
                print(CFG.train)
                optuna.logging.set_verbosity(optuna.logging.INFO)
                study = optuna.create_study(direction='minimize')
                study.optimize(functools.partial(objective,tr_x,tr_y,va_x,va_y,va_index,train_df),timeout=60*60*3)#n_trials = 1000)
                print('Number of finished trials:', len(study.trials))
                print('Best trial:', study.best_trial.params)



Training until validation scores don't improve for 500 rounds
[100]	training's binary_logloss: 0.316228	valid_1's binary_logloss: 0.318538
[200]	training's binary_logloss: 0.303218	valid_1's binary_logloss: 0.306779
[300]	training's binary_logloss: 0.29509	valid_1's binary_logloss: 0.299723
[400]	training's binary_logloss: 0.289487	valid_1's binary_logloss: 0.294956
[500]	training's binary_logloss: 0.285346	valid_1's binary_logloss: 0.291605
[600]	training's binary_logloss: 0.282149	valid_1's binary_logloss: 0.289235
[700]	training's binary_logloss: 0.279562	valid_1's binary_logloss: 0.287496
[800]	training's binary_logloss: 0.277359	valid_1's binary_logloss: 0.286173
[900]	training's binary_logloss: 0.275424	valid_1's binary_logloss: 0.285107
[1000]	training's binary_logloss: 0.273682	valid_1's binary_logloss: 0.28423
[1100]	training's binary_logloss: 0.272161	valid_1's binary_logloss: 0.283582
[1200]	training's binary_logloss: 0.270735	valid_1's binary_logloss: 0.283078
[1300]	traini



Training until validation scores don't improve for 500 rounds
[100]	training's binary_logloss: 0.317007	valid_1's binary_logloss: 0.315715
[200]	training's binary_logloss: 0.304288	valid_1's binary_logloss: 0.302973
[300]	training's binary_logloss: 0.29628	valid_1's binary_logloss: 0.295265
[400]	training's binary_logloss: 0.290723	valid_1's binary_logloss: 0.290156
[500]	training's binary_logloss: 0.286589	valid_1's binary_logloss: 0.286644
[600]	training's binary_logloss: 0.28338	valid_1's binary_logloss: 0.28413
[700]	training's binary_logloss: 0.280801	valid_1's binary_logloss: 0.282249
[800]	training's binary_logloss: 0.278596	valid_1's binary_logloss: 0.280834
[900]	training's binary_logloss: 0.276645	valid_1's binary_logloss: 0.279657
[1000]	training's binary_logloss: 0.274887	valid_1's binary_logloss: 0.278832
[1100]	training's binary_logloss: 0.273333	valid_1's binary_logloss: 0.278161
[1200]	training's binary_logloss: 0.271881	valid_1's binary_logloss: 0.277669
[1300]	trainin



Training until validation scores don't improve for 500 rounds
[100]	training's binary_logloss: 0.315791	valid_1's binary_logloss: 0.318738
[200]	training's binary_logloss: 0.302576	valid_1's binary_logloss: 0.307576
[300]	training's binary_logloss: 0.29425	valid_1's binary_logloss: 0.300881
[400]	training's binary_logloss: 0.288491	valid_1's binary_logloss: 0.29662
[500]	training's binary_logloss: 0.284204	valid_1's binary_logloss: 0.293728
[600]	training's binary_logloss: 0.280863	valid_1's binary_logloss: 0.29173
[700]	training's binary_logloss: 0.27819	valid_1's binary_logloss: 0.290371
[800]	training's binary_logloss: 0.275944	valid_1's binary_logloss: 0.289446
[900]	training's binary_logloss: 0.27395	valid_1's binary_logloss: 0.288702
[1000]	training's binary_logloss: 0.272184	valid_1's binary_logloss: 0.288098
[1100]	training's binary_logloss: 0.270621	valid_1's binary_logloss: 0.287694
[1200]	training's binary_logloss: 0.269194	valid_1's binary_logloss: 0.287378
[1300]	training'



Training until validation scores don't improve for 500 rounds
[100]	training's binary_logloss: 0.316263	valid_1's binary_logloss: 0.31721
[200]	training's binary_logloss: 0.303178	valid_1's binary_logloss: 0.305082
[300]	training's binary_logloss: 0.294984	valid_1's binary_logloss: 0.297972
[400]	training's binary_logloss: 0.289437	valid_1's binary_logloss: 0.293537
[500]	training's binary_logloss: 0.285289	valid_1's binary_logloss: 0.290445
[600]	training's binary_logloss: 0.282055	valid_1's binary_logloss: 0.288249
[700]	training's binary_logloss: 0.279438	valid_1's binary_logloss: 0.286679
[800]	training's binary_logloss: 0.277231	valid_1's binary_logloss: 0.285506
[900]	training's binary_logloss: 0.275283	valid_1's binary_logloss: 0.284509
[1000]	training's binary_logloss: 0.27354	valid_1's binary_logloss: 0.283699
[1100]	training's binary_logloss: 0.271987	valid_1's binary_logloss: 0.283142
[1200]	training's binary_logloss: 0.270553	valid_1's binary_logloss: 0.282644
[1300]	traini



Training until validation scores don't improve for 500 rounds
[100]	training's binary_logloss: 0.316538	valid_1's binary_logloss: 0.317762
[200]	training's binary_logloss: 0.303598	valid_1's binary_logloss: 0.305567
[300]	training's binary_logloss: 0.295615	valid_1's binary_logloss: 0.298265
[400]	training's binary_logloss: 0.290015	valid_1's binary_logloss: 0.293379
[500]	training's binary_logloss: 0.285878	valid_1's binary_logloss: 0.289873
[600]	training's binary_logloss: 0.282668	valid_1's binary_logloss: 0.287388
[700]	training's binary_logloss: 0.280073	valid_1's binary_logloss: 0.285537
[800]	training's binary_logloss: 0.277862	valid_1's binary_logloss: 0.284166
[900]	training's binary_logloss: 0.275878	valid_1's binary_logloss: 0.283035
[1000]	training's binary_logloss: 0.274112	valid_1's binary_logloss: 0.282155
[1100]	training's binary_logloss: 0.272552	valid_1's binary_logloss: 0.281469
[1200]	training's binary_logloss: 0.271114	valid_1's binary_logloss: 0.280899
[1300]	trai

In [184]:
importance_df.sort_values('importance',ascending=False).to_csv(f'importance_{CFG.ver}.csv')

In [185]:
oof_predictions

array([0.9846151 , 0.93859804, 0.97017566, ..., 0.91102224, 0.97153602,
       0.9277209 ])

In [186]:
mlflow.log_param("importance ",importance_df)
mlflow.log_param("feature ",features)

['Term',
 'NoEmp',
 'NewExist',
 'CreateJob',
 'RetainedJob',
 'FranchiseCode',
 'Sector',
 'ApprovalFY',
 'State',
 'BankState',
 'DisbursementGross',
 'GrAppv',
 'SBA_Appv',
 'RevLineCr_label',
 'LowDoc_label',
 'City_label',
 'NoEmp_label',
 'DisbursementGross_main_key_NoEmp_label_med',
 'DisbursementGross_main_key_NoEmp_label_diff',
 'GrAppv_main_key_NoEmp_label_med',
 'GrAppv_main_key_NoEmp_label_diff',
 'SBA_Appv_main_key_NoEmp_label_med',
 'SBA_Appv_main_key_NoEmp_label_diff',
 'CreateJob_main_key_NoEmp_label_med',
 'CreateJob_main_key_NoEmp_label_diff',
 'RetainedJob_main_key_NoEmp_label_med',
 'RetainedJob_main_key_NoEmp_label_diff',
 'NoEmp_main_key_NoEmp_label_med',
 'NoEmp_main_key_NoEmp_label_diff',
 'Term_main_key_NoEmp_label_med',
 'Term_main_key_NoEmp_label_diff',
 'DisbursementGross_main_key_LowDoc_label_med',
 'DisbursementGross_main_key_LowDoc_label_diff',
 'GrAppv_main_key_LowDoc_label_med',
 'GrAppv_main_key_LowDoc_label_diff',
 'SBA_Appv_main_key_LowDoc_label_med'

In [187]:
oof=pd.DataFrame(oof_predictions)
oof.rename(columns={0:'prob1'},inplace=True)
oof['True']=train['MIS_Status']

# Value Metrics

In [188]:
oof

Unnamed: 0,prob1,True
0,0.984615,1
1,0.938598,1
2,0.970176,1
3,0.962542,1
4,0.944372,1
...,...,...
42302,0.985402,1
42303,0.935036,1
42304,0.911022,1
42305,0.971536,1


In [193]:
oof

Unnamed: 0,prob1,True,thresh
0,0.984615,1,1
1,0.938598,1,1
2,0.970176,1,1
3,0.962542,1,1
4,0.944372,1,1
...,...,...,...
42302,0.985402,1,1
42303,0.935036,1,1
42304,0.911022,1,1
42305,0.971536,1,1


In [195]:
from sklearn.metrics import f1_score
score_df = pd.DataFrame(columns = ["score","thresh"])

score_list = [{"Thresh":"Score"}]
for thresh in[0.6,0.7,0.74,0.745,0.746,0.747,0.748,0.749,0.75,0.751,0.752,0.753,0.754,0.755,0.76,0.761,0.762,0.763,0.767,0.768,0.769,0.77,0.771,0.772,0.773,0.774,0.78,0.8,0.9]:
    oof["thresh"] = 0
    oof['thresh']=np.where(oof['prob1']>thresh,1,0)
    print("#######")
    print(thresh)
    print(f1_score(oof['thresh'],oof['True'],average='macro'))
    score =f1_score(oof['thresh'],oof['True'],average='macro')
    score_df.loc[len(score_df)] = [score,thresh]
    print("#######")

#######
0.6
0.6570548582305474
#######
#######
0.7
0.672338214963242
#######
#######
0.74
0.6746762800331827
#######
#######
0.745
0.6750591268103676
#######
#######
0.746
0.6758573686093331
#######
#######
0.747
0.6760915986441756
#######
#######
0.748
0.6764077743472285
#######
#######
0.749
0.6766463983512832
#######
#######
0.75
0.6768337278843461
#######
#######
0.751
0.6771467352807092
#######
#######
0.752
0.6772895089475526
#######
#######
0.753
0.6774313979109796
#######
#######
0.754
0.6770426525423816
#######
#######
0.755
0.6770748208785403
#######
#######
0.76
0.676810819929494
#######
#######
0.761
0.6764931517545925
#######
#######
0.762
0.6764495396642486
#######
#######
0.763
0.6762552567447994
#######
#######
0.767
0.6758645978101921
#######
#######
0.768
0.6757625193541209
#######
#######
0.769
0.675805863352396
#######
#######
0.77
0.6759791927196777
#######
#######
0.771
0.6764330820666771
#######
#######
0.772
0.6766097136818598
#######
#######
0.773
0.67618568269

In [201]:
max_score = score_df[score_df["score"]==score_df["score"].max()]

In [202]:
mlflow.log_metric("score val",max_score["score"])

Unnamed: 0,score,thresh
11,0.677431,0.753


In [203]:
mlflow.end_run()

# Create Submission

In [99]:
pred_df=pd.DataFrame(predict)
pred_df=pred_df.add_prefix('col_')
sub=pd.DataFrame(test['Unnamed: 0'].copy())
# sub['predict_2']=0
sub['predict_0']=0
sub['predict_1']=0
sub['predict_1']=np.where(pred_df['col_0']>0.75,1,0)

# # sub['predict_2']=np.where(tmp['col_2']>0.0615,1,0)
# # sub['predict_0']=np.where(tmp['col_0']>0.205,1,0)

In [101]:
sub['predict_1'].value_counts()

predict_1
1    39043
0     3265
Name: count, dtype: int64

In [102]:
sub[['Unnamed: 0','predict_1']].to_csv(f'sub_kaminogo{CFG.ver}.csv',header=False,index=False)

# Other 

In [273]:
population_df= pd.read_csv("../data/us_popluation.csv")

In [262]:
for c in [1910,1920,1930,1940,1950,1960,1970,1980]:
    population_df.drop(f"{c}",axis=1,inplace=True)

In [241]:
population_df["2024_per"] = (population_df["2024"]/population_df["2024"].sum())*100

In [254]:
for c in ["2024","2023","2010","1900","1910","1920","1930","1940","1950","1960","1970","1980","1990","2000","2003"]:
    population_df[f"{c}_per"] = (population_df[f"{c}"]/population_df[f"{c}"].sum())*100

In [265]:
population_df.to_csv("../data/us_population.csv",index=False)

In [291]:
def create_popflag(input_df,pop_df,year):
    pop_df = pop_df.sort_values(f"{year}",ascending=False)
    pop_df[f"{year}_flag"] = pd.qcut(pop_df[f"{year}"],4,labels=False)
    pop_df[f"{year}_flag"] = pop_df[f"{year}_flag"]+1
    input_df = pd.merge(input_df, pop_df[[f"{year}_flag","State"]],how="left",on="State")

    return input_df[f"{year}_flag"]


In [269]:
for year in ["1990","2000","2010","2020"]:
    population_df = population_df.sort_values(f"{year}_per")
    input_df ~ 
    population_df.loc[] = pd.qcut(population_df["2024"],4,labels=False)

In [271]:
population_df.loc[:,25]

AttributeError: 'NoneType' object has no attribute 'loc'

In [278]:
population_df = population_df.sort_values("2024",ascending=False)

In [287]:
# qcut
population_df["qcut"] = pd.qcut(population_df["2024"],4,labels=False)
population_df["qcut"] = population_df["qcut"] + 1

In [288]:
population_df

Unnamed: 0,2024,per_2024,2023,2010,State,1900,1910,1920,1930,1940,...,per_1930,per_1940,per_1950,per_1960,per_1970,per_1980,per_1990,per_2000,per_2003,qcut
0,38889770,11.58,38965193,37319550,CA,1485053,2377549,3426861,5677251,6907387,...,0.046263,0.052527,0.070329,0.088022,0.098556,0.104768,0.11995,0.120604,0.122256,4
1,30976754,9.22,30503301,25241897,TX,3048710,3896542,4663228,5824715,6414824,...,0.047465,0.048781,0.051229,0.05365,0.055305,0.062987,0.068466,0.074245,0.076206,4
2,22975931,6.84,22610726,18846143,FL,528542,752619,968470,1468211,1897414,...,0.011964,0.014429,0.018411,0.027731,0.033535,0.043143,0.052147,0.056907,0.058637,4
3,19469232,5.8,19571216,19399956,NY,7268894,9113614,10385227,12588066,13479142,...,0.102579,0.102502,0.098524,0.093987,0.090079,0.077722,0.072512,0.067568,0.066117,4
4,12951275,3.86,12961683,12711406,PA,6302115,7665111,8720017,9631350,9900180,...,0.078485,0.075286,0.069743,0.063393,0.058254,0.052517,0.04789,0.043728,0.042603,4
5,12516863,3.73,12549689,12840545,IL,4821550,5638591,6485280,7630654,7897241,...,0.062182,0.060054,0.057879,0.056458,0.054896,0.050581,0.046072,0.04422,0.043596,4
6,11812173,3.52,11785935,11539449,OH,4157545,4767121,5759394,6646697,6907612,...,0.054163,0.052529,0.052793,0.05436,0.052614,0.047797,0.04372,0.040424,0.0394,4
7,11145304,3.32,11029227,9712209,GA,2216331,2609121,2895832,2908506,3123723,...,0.023701,0.023754,0.022884,0.022083,0.02267,0.024183,0.026111,0.029149,0.029922,4
8,10975017,3.27,10835491,9574586,NC,1893810,2206287,2559123,3170276,3571623,...,0.025834,0.02716,0.026985,0.025516,0.025102,0.026036,0.026717,0.028661,0.028966,4
9,10041241,2.99,10037261,9877597,MI,2420982,2810173,3668412,4842325,5256106,...,0.03946,0.03997,0.042331,0.043813,0.043837,0.040999,0.037465,0.035387,0.034729,4
