In [None]:
!pip install git+https://github.com/h2oai/pystacknet

In [None]:
!pip install GML

In [None]:
!pip install autofeat

In [None]:
!pip install sweetviz

In [None]:
!pip install scikit-learn==0.23.1
# !pip install scikit-learn==0.22

In [None]:
import pandas as pd

import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor, VotingRegressor
from sklearn.linear_model import BayesianRidge, LinearRegression, LassoLars, Ridge

# import sweetviz as sv

import matplotlib.pyplot as plt

import seaborn as sns

from xgboost import XGBRegressor

from lightgbm import LGBMRegressor

import tqdm

# from autofeat import AutoFeatRegressor

# from pystacknet.pystacknet import StackNetRegressor

# from GML.Ghalat_Machine_Learning import Ghalat_Machine_Learning

In [None]:
def accuracy(y_true,y_pred):
  """Score predictions as ``100 * max(0, 1 - RMSE)``.

  The RMSE is computed with numpy directly rather than through the global
  name ``mean_squared_error``: a later cell re-imports that name from
  ``tensorflow.keras.losses``, which would silently change what this
  function calls depending on which cells have been run.

  Parameters
  ----------
  y_true, y_pred : array-like
      Targets and predictions. 1-D inputs are treated as a single column,
      so an ``(n,)`` vector and an ``(n, 1)`` column can be compared.

  Returns
  -------
  float
      Value in ``[0, 100]``; 100 means a perfect fit, 0 means RMSE >= 1.

  Raises
  ------
  ValueError
      If the inputs are empty or their shapes do not match.
  """
  y_true = np.asarray(y_true, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)
  # Promote vectors to a single column so (n,) and (n, 1) never broadcast to (n, n).
  if y_true.ndim == 1:
    y_true = y_true.reshape(-1, 1)
  if y_pred.ndim == 1:
    y_pred = y_pred.reshape(-1, 1)
  if y_true.shape != y_pred.shape:
    raise ValueError('y_true and y_pred have inconsistent shapes: %s vs %s'
                     % (y_true.shape, y_pred.shape))
  if y_true.size == 0:
    raise ValueError('y_true and y_pred must not be empty')
  rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
  return float(100 * max(0, 1 - rmse))

In [None]:
# Directory prefix (trailing slash included) that the CSV file names are appended to.
path = '/kaggle/input/he-mlcomp/'

In [None]:
# Load the two CSV files found under `path`.
train = pd.read_csv(path+'Train.csv')
test = pd.read_csv(path+'Test.csv')

In [None]:
# Keep a copy of the raw Employee_ID column for the submission file. It is taken
# before the column is converted to integers and before `test` is re-sorted by Age,
# so it stays in file order.
test_eid = test['Employee_ID'].copy()

In [None]:
# Replace each Employee_ID string by the integer found in its second '_'-separated piece.
train['Employee_ID'] = train['Employee_ID'].map(lambda raw_id: int(raw_id.split('_')[1]))
test['Employee_ID'] = test['Employee_ID'].map(lambda raw_id: int(raw_id.split('_')[1]))

In [None]:
# Order both frames by Age. The original index labels are kept, only the row order changes.
test = test.sort_values('Age')
train = train.sort_values('Age')

In [None]:
# For every column with fewer than 6000 distinct values, print the mean
# Attrition_rate of each group of that column.
for col in train.columns:
    distinct_count = len(np.unique(train[col]))
    if distinct_count >= 6000:
        continue
    print(train.groupby(col)['Attrition_rate'].mean())

In [None]:
# sweetviz is pip-installed at the top of the notebook, but its top-level import is
# commented out, so `sv` would be undefined on a fresh kernel. Import it where it is used.
import sweetviz as sv

# Build and open the HTML profiling report for the training frame.
my_report = sv.analyze(train)
my_report.show_html()

In [None]:
# sweetviz is pip-installed at the top of the notebook, but its top-level import is
# commented out, so `sv` would be undefined on a fresh kernel. Import it where it is used.
import sweetviz as sv

# Train-vs-test comparison report, with Attrition_rate passed as the target feature.
my_report = sv.compare([train,'Train'],[test,'Test'],'Attrition_rate')
my_report.show_html()

In [None]:
# Display the mean Attrition_rate for each Post_Level value.
train.groupby('Post_Level')['Attrition_rate'].mean()

In [None]:
# Mean Attrition_rate per Travel_Rate value, stored as a dict keyed by Travel_Rate.
# Looked up by avg_att().
avg_att_dic = dict(train.groupby('Travel_Rate')['Attrition_rate'].mean())

In [None]:
def avg_att(var):
    """Return the mean Attrition_rate recorded for one Travel_Rate value.

    Reads the module-level ``avg_att_dic`` mapping. Values that are not in
    the mapping (or are unhashable) fall back to the overall mean of
    ``train['Attrition_rate']``.

    The lookup previously *called* the dict (``avg_att_dic(var)``), which
    always raised ``TypeError``; the bare ``except`` hid it, so every input
    received the global mean.
    """
    try:
        return avg_att_dic[var]
    except (KeyError, TypeError):
        # Unknown or unhashable key: use the global average instead.
        return train['Attrition_rate'].mean()

In [None]:
def growth_rate(s):
    """Bucket ``s['growth_rate']`` into 'Low' (<= 30), 'Medium' (<= 55) or 'High'.

    Any value that satisfies neither upper bound (including NaN, for which
    both comparisons are false) is labelled 'High'.
    """
    rate = s['growth_rate']
    for upper_bound, label in ((30, 'Low'), (55, 'Medium')):
        if rate <= upper_bound:
            return label
    return 'High'

In [None]:
# Map every Travel_Rate value through avg_att(); the results are attached to the
# feature frames later as the 'avg_ATT' column.
tr1 = train['Travel_Rate'].map(avg_att)
te1 = test['Travel_Rate'].map(avg_att)

In [None]:
# Bucket growth_rate into Low / Medium / High. Each frame is bucketed from its own
# rows; the test column was previously computed from `train`, which assigned labels
# by index alignment with unrelated training rows.
train['growth_rate_level'] = train.apply(growth_rate,axis=1)
test['growth_rate_level'] = test.apply(growth_rate,axis=1)

In [None]:
# Keyword arguments unpacked into every LGBMRegressor built in this notebook.
# NOTE(review): both 'learning_rate' and 'eta' are set; confirm which one LightGBM honours.
grid_lgbm = dict(
    max_depth=16,
    learning_rate=0.002,
    n_estimators=1704,
    min_child_weight=17,
    eta=0.364,
)

In [None]:
# --- One-hot encode every object-typed column -------------------------------
cols = train.columns[train.dtypes == 'object']

for col in cols:
    train_dummies = pd.get_dummies(train[col])
    # Give test exactly the dummy columns seen in train: categories missing from
    # test become all-zero columns, and categories unseen in train are dropped.
    # This keeps both frames on the same feature set.
    test_dummies = pd.get_dummies(test[col]).reindex(
        columns=train_dummies.columns, fill_value=0)
    train = pd.concat([train.drop([col],axis=1), train_dummies],axis=1)
    test = pd.concat([test.drop([col],axis=1), test_dummies],axis=1)

# --- Model-based imputation --------------------------------------------------
# For each column with missing values in train, fit a LightGBM regressor on the
# rows where it is known and predict it where it is missing (train and test).
for col in test.columns:
    if train[col].isnull().any():
        # Predictors: every column shared with test except the one being imputed.
        # The same list is used for fitting and predicting. The prediction frames
        # previously dropped 'Age' instead of `col`, which misaligned the features
        # for every column other than Age.
        features = [c for c in test.columns if c != col]
        known = train[col].notna()

        model = LGBMRegressor(**grid_lgbm)
        model.fit(train.loc[known, features], train.loc[known, col])

        train.loc[~known, col] = model.predict(train.loc[~known, features])

        test_missing = test[col].isna()
        # Skip the call when test has nothing to fill; predicting on zero rows is unsafe.
        if test_missing.any():
            test.loc[test_missing, col] = model.predict(test.loc[test_missing, features])

In [None]:
# --- Pairwise arithmetic features --------------------------------------------
# Each function is applied to the same column pairs in both frames. The suffix
# 1..4 identifies the operation (add, subtract, multiply, divide). np.divide can
# yield +/-inf on a zero denominator.
funcs = [np.add, np.subtract, np.multiply, np.divide]

for i, func in enumerate(funcs):
    tag = str(i+1)
    # Same statements for both frames; column creation order per frame is unchanged.
    for frame in (train, test):
        frame['Age_TOS'+tag] = func(frame['Age'], frame['Time_of_service'])
        frame['AT_GR'+tag] = func(frame['Age_TOS'+tag], frame['growth_rate'])
        frame['TS_GR'+tag] = func(frame['Time_of_service'], frame['growth_rate'])
        frame['PS_WLB'+tag] = func(frame['Pay_Scale'], frame['Work_Life_balance'])
        frame['Age_TSP'+tag] = func(frame['Age'], frame['Time_since_promotion'])
        frame['Age_EMP'+tag] = func(frame['Age'], frame['Employee_ID'])
        frame['TOS_EMP'+tag] = func(frame['Time_of_service'], frame['Employee_ID'])
        frame['ATOS_EMP'+tag] = func(frame['Age_TOS'+tag], frame['Employee_ID'])

# --- Squares and cubes of every column currently in test ---------------------
# The column list is captured once, so the columns added below are not themselves
# expanded. New names are <col><position+1> (square) and <col><position+10> (cube).
base_cols = list(test.columns)
for frame in (test, train):
    for i, col in enumerate(base_cols):
        frame[col+str(i+1)] = frame[col]**2
        frame[col+str(i+10)] = frame[col]**3

# --- Row-wise sum of VAR1..VAR7 ----------------------------------------------
# Vectorised replacement for the per-row itertuples loop. skipna=False keeps the
# plain-addition behaviour where a NaN in any VAR makes the sum NaN.
var_names = ['VAR'+str(i) for i in range(1,8)]
train['sum_VARS'] = train[var_names].sum(axis=1, skipna=False)
test['sum_VARS'] = test[var_names].sum(axis=1, skipna=False)

In [None]:
# Keyword arguments for BayesianRidge(**grid_bayesian).
grid_bayesian = dict(
    n_iter=10000,
    tol=6.42e-06,
    alpha_1=0.0001,
    alpha_2=6.58e-05,
    lambda_1=0.999,
    lambda_2=2.24e-05,
    compute_score=True,
    fit_intercept=True,
    normalize=False,
)

# Keyword arguments for Ridge(**grid_ridge).
grid_ridge = dict(
    max_iter=90000,
    alpha=0.23143352151759083,
    fit_intercept=True,
    normalize=False,
    tol=0.0005243805101392493,
    solver='saga',
)

In [None]:
# Target as an independent numpy array; features are every other column of train.
y = train['Attrition_rate'].to_numpy(copy=True)
X = train.drop(columns=['Attrition_rate'])

In [None]:
# Turn +/-inf (e.g. produced by the np.divide features) into NaN so that the
# mean-fill step in the next cell handles them like any other missing value.
X = X.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

In [None]:
# Fill remaining NaNs with the per-column mean of the training features. The test
# frame is filled with the same training means.
# Done as one frame-level fillna instead of `X[col].fillna(..., inplace=True)`:
# that chained in-place call operates on a column selection, which pandas does not
# guarantee to write back to the parent frame.
train_means = X.mean()
X = X.fillna(train_means)
test = test.fillna(train_means)

In [None]:
# Fit LightGBM on all training features and tabulate its feature importances:
# column 'A' holds the feature name, column 'B' the importance.
lgbm = LGBMRegressor(**grid_lgbm)
lgbm.fit(X, y)

fi = pd.DataFrame({
    'A': X.columns.values,
    'B': lgbm.feature_importances_,
})

In [None]:
# Display the feature-importance table.
fi

In [None]:
# Names of the features whose LightGBM importance is below 100; these get dropped.
cols_to_d = fi.loc[fi['B'] < 100, 'A']

In [None]:
# Remove the low-importance features from both feature frames, in place.
X.drop(columns=cols_to_d, inplace=True)
test.drop(columns=cols_to_d, inplace=True)

In [None]:
# Attach the Travel_Rate encoding computed earlier (tr1 / te1). Assigning a Series
# aligns on index labels, so the Age-sorted row order does not matter here.
X['avg_ATT'] = tr1
test['avg_ATT'] = te1

In [None]:
# autofeat is pip-installed at the top of the notebook, but its top-level import is
# commented out, so AutoFeatRegressor would be undefined on a fresh kernel.
from autofeat import AutoFeatRegressor

afg = AutoFeatRegressor(verbose=True, n_jobs=-1)

In [None]:
# Fit AutoFeat on the training features and keep the transformed frame. X is passed
# as a bare array (X.values), so the original column names are not handed over.
X_ = afg.fit_transform(X.values,y)

In [None]:
# Persist the transformed training features (index not written).
X_.to_csv('X_modified.csv',index=False)

In [None]:
# Apply the fitted AutoFeat transformation to the test features (again as a bare array).
test_ = afg.transform(test.values)

In [None]:
# Persist the transformed test features (index not written).
test_.to_csv('test_modified.csv',index=False)

In [None]:
# Replace X_ / test_ with the CSV files stored under path1.
# NOTE(review): presumably these are the files written by the to_csv cells in an
# earlier run -- confirm they have the same row order as the current `y`.
path1 = '/kaggle/input/hemlcomp-x-test-new/'

X_ = pd.read_csv(path1+'X_modified.csv')
test_ = pd.read_csv(path1+'test_modified.csv')

In [None]:
# (name, estimator) pairs for the two linear models.
# NOTE(review): `estimators` is not referenced anywhere else in the visible notebook.
estimators = [
    ('BR2',BayesianRidge(**grid_bayesian)), 
    ('Ridge2',Ridge(**grid_ridge))
]

In [None]:
# GML is pip-installed at the top of the notebook, but its top-level import is
# commented out, so Ghalat_Machine_Learning would be undefined on a fresh kernel.
from GML.Ghalat_Machine_Learning import Ghalat_Machine_Learning

# Run GML's regressor search on the AutoFeat features, scored with accuracy().
gml = Ghalat_Machine_Learning(n_estimators = 2000)
gml.GMLRegressor(X_, y, accuracy, test_Size = 0.5,
                 neural_net = 'Yes', epochs = 100, verbose = 0)

In [None]:
# The two linear models that are blended in the cross-validation and submission cells.
br = BayesianRidge(**grid_bayesian)
ridge = Ridge(**grid_ridge)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import mean_squared_error

In [None]:
# Feed-forward regressor: 256 -> 128 -> 64 ReLU units with 10% dropout after the
# first two hidden layers and a single linear output, trained with Adam on MAE.
model = Sequential([
    Dense(256,activation='relu',dtype='float32'),
    Dropout(0.1),
    Dense(128,activation='relu'),
    Dropout(0.1),
    Dense(64,activation='relu'),
    Dense(1),
])
model.compile(optimizer=Adam(), loss='mae')

In [None]:
# K-fold evaluation of the neural network on the AutoFeat features.
scores = []
for tr_in, val_in in tqdm.tqdm(KFold().split(X_, y)):
    # Build a freshly initialised copy of the architecture for every fold. Fitting
    # the same `model` instance across folds keeps the weights learned on earlier
    # folds, whose training data contains the current validation rows, and so
    # inflates the scores. The config carries no weights and no compile state;
    # keep this compile call in sync with the one where `model` is defined.
    fold_model = Sequential.from_config(model.get_config())
    fold_model.compile(optimizer = Adam(),loss = 'mae')
    fold_model.fit(X_.iloc[tr_in].values,y[tr_in],epochs=50,verbose=0)
    preds = fold_model.predict(X_.iloc[val_in].values)
    # Flatten the (n, 1) network output to a vector before scoring.
    preds = preds.reshape((preds.shape[0],))
    scores.append(accuracy(y[val_in],preds))

In [None]:
# K-fold evaluation of the 0.8 * BayesianRidge + 0.2 * Ridge blend.
scores = []
for tr_in, val_in in tqdm.tqdm(KFold().split(X, y)):
    X_fit, y_fit = X.iloc[tr_in], y[tr_in]
    X_val = X.iloc[val_in]

    br.fit(X_fit, y_fit)
    ridge.fit(X_fit, y_fit)

    blended = br.predict(X_val)*0.8 + ridge.predict(X_val)*0.2
    scores.append(accuracy(y[val_in], blended))

In [None]:
# Per-fold scores from the most recently run cross-validation cell.
scores

In [None]:
# Average of the per-fold scores.
np.mean(scores)

In [None]:
# Predict with the same 0.8 / 0.2 blend used in cross-validation. Columns are
# selected in the order the models were fitted on.
# NOTE(review): `br` and `ridge` still hold the fit from the last CV fold, not a
# refit on all of X -- confirm that this is intended.
test_features = test[X.columns]
preds = br.predict(test_features)*0.8 + ridge.predict(test_features)*0.2

# `test` was sorted by Age after `test_eid` was captured, so `preds` follows the
# sorted row order while `test_eid` is still in file order. Label the predictions
# with test's index so that the assignment aligns each one with its Employee_ID
# instead of pairing them by position.
ss = pd.DataFrame()
ss['Employee_ID'] = test_eid
ss['Attrition_rate'] = pd.Series(preds, index=test.index)

ss.to_csv('submission.csv',index=False)

In [None]:
# Display the submission frame that was just written.
ss

Stack

In [None]:
# Layer layout handed to StackNetRegressor: three layers of 100 estimators each,
# followed by a single BayesianRidge as the final layer.
# NOTE(review): each 100-estimator layer repeats one identically configured model
# -- confirm this is intended.
stack = [
    [LinearRegression() for _ in range(100)],
    [LassoLars() for _ in range(100)],
    [BayesianRidge(**grid_bayesian) for _ in range(100)],
    [BayesianRidge(**grid_bayesian)],
]

In [None]:
# pystacknet is pip-installed at the top of the notebook, but its top-level import
# is commented out, so StackNetRegressor would be undefined on a fresh kernel.
from pystacknet.pystacknet import StackNetRegressor

# Note: this rebinds `model`, which previously held the Keras network.
model = StackNetRegressor(stack, folds=5,
                          restacking=False, use_retraining=True,
                          random_state=12345, n_jobs=1)

In [None]:
# K-fold evaluation of the StackNet model held in `model`.
scores = []
for tr_in, val_in in tqdm.tqdm(KFold().split(X, y)):
    X_fit, y_fit = X.iloc[tr_in], y[tr_in]
    model.fit(X_fit, y_fit)
    fold_preds = model.predict(X.iloc[val_in])
    scores.append(accuracy(y[val_in], fold_preds))

In [None]:
# Per-fold scores from the StackNet cross-validation loop.
scores

In [None]:
# Average of the per-fold scores.
np.mean(scores)

In [None]:
# Predict the test frame with `model` as it was left by the last cross-validation fit.
# The rows of `preds` follow the current (Age-sorted) row order of `test`.
preds = model.predict(test)

In [None]:
# `test` was sorted by Age after `test_eid` was captured, so `preds` follows the
# sorted row order while `test_eid` is still in file order. Label the predictions
# with test's index so that the assignment aligns each one with its Employee_ID
# instead of pairing them by position. np.ravel accepts both an (n,) and an
# (n, 1) prediction array.
ss = pd.DataFrame()
ss['Employee_ID'] = test_eid
ss['Attrition_rate'] = pd.Series(np.ravel(preds), index=test.index)

ss.to_csv('submission.csv',index=False)

In [None]:
# Display the submission frame that was just written.
ss