In [12]:
import pandas as pd, matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics

from sklearn import ensemble
from sklearn import tree
import xgboost as xgb

In [2]:
data = pd.read_csv('data/application/train.csv', index_col=['TransactionID'])
data.drop(['TransactionDT'], axis=1, inplace=True)

Unnamed: 0_level_0,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987004,0,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
2987008,0,15.0,H,2803,100.0,150.0,visa,226.0,debit,337.0,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2987010,0,75.887,C,16496,352.0,117.0,mastercard,134.0,credit,,...,chrome 62.0,,,,F,F,T,T,desktop,Windows


### data preprocessing

In [None]:
def get_all_categorical_features():
    features = [
        'ProductCD',
        'addr1', 
        'addr2',
        'P_emaildomain',
        'R_emaildomain',
        'DeviceType',
        'DeviceInfo',
    ]
    for i in range(1,7):
        features.append(f'card{i}')
    for i in range(1,10):
        features.append(f'M{i}')
    for i in range(12,39):
        features.append(f'id_{i}')
    return features

def correcting_data_types(data, features):
    for i in data:
        if i in features:
            data[i] = data[i].astype("string")
        else:
            data[i] = pd.to_numeric(data[i])
    return data

def fill_missing_values(data):
    for column in  data:
        if data[column].dtype.name == 'string':
            data[column].fillna('not available', inplace=True)
        else:
            mean = data[column].mean()
            if str(mean) == 'nan':
                data[column].fillna(0, inplace=True)
            else:
                data[column].fillna(mean, inplace=True)
    return data

def label_encode_categorical_features(data, features):
    encoder = preprocessing.LabelEncoder()
    for category in features:
        data[category] = encoder.fit_transform(data[category])
    return data


def cleaning_data(data, categorical_features):
    data = correcting_data_types(data=data, features=categorical_features)
    data = fill_missing_values(data=data)
    data = label_encode_categorical_features(data, categorical_features)
    na_cols = [col for col in data.columns if data[col].isna().unique()[0] == True and len(data[col].isna().unique()) == 1]
    data = data.drop(na_cols, axis=1)
    return data

clean_data = cleaning_data(data, get_all_categorical_features())
del data

In [21]:
clean_data.head(3)

Unnamed: 0_level_0,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987004,0,50.0,1,5786,399,39,2,2,1,178,...,124,3,164,3,1,0,1,1,1,954
2987008,0,15.0,1,4968,0,39,4,97,2,133,...,98,3,48,2,1,0,0,1,1,1727
2987010,0,75.887,0,3487,243,10,2,28,1,255,...,44,4,260,4,0,0,1,1,0,1598


In [4]:
smote = RandomUnderSampler()
x, y = smote.fit_resample(clean_data.drop(['isFraud'], axis=1), clean_data['isFraud'])

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x,y, shuffle=True)

### model design

In [7]:

forest.fit(x_train, y_train)
y_pred = forest.predict(x_test)
metrics.f1_score(y_test, y_pred)

0.8757267441860466

In [11]:


decision.fit(x_train, y_train)
y_pred = decision.predict(x_test)
metrics.f1_score(y_test, y_pred)

0.8408733624454149

In [19]:
classifier = xgb.XGBClassifier(use_label_encoder=False, n_estimators=500,
                        n_jobs=4,
                        max_depth=9,
                        learning_rate=0.05,
                        subsample=0.9,
                        colsample_bytree=0.9,
                        missing=-999)

classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print(f'F1 Score {metrics.f1_score(y_test, y_pred)}')
print(f'Accuracy Score {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score {metrics.recall_score(y_test, y_pred)}')

F1 Score 0.9155107187894074
Accuracy Score 0.917123166637215
Precision Score 0.9300878477306003
Recall Score 0.9013834693153601


In [18]:
xgb_forest = xgb.XGBRFClassifier(use_label_encoder=False, n_estimators=500, n_jobs=4, max_depth=9, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9, missing=-999)

xgb_forest.fit(x_train, y_train)
y_pred = xgb_forest.predict(x_test)
print(f'F1 Score {metrics.f1_score(y_test, y_pred)}')
print(f'Accuracy Score {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score {metrics.recall_score(y_test, y_pred)}')



0.8708325742393878

F1 Score 0.859275441470963
Accuracy Score 0.8634034281675208
Precision Score 0.8825729244577412
Recall Score 0.8371763036537779


In [40]:
clss = ensemble.AdaBoostClassifier(n_estimators=300)
xgb_forest = xgb.XGBRFClassifier(use_label_encoder=False, n_estimators=500, n_jobs=4, max_depth=9, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9, missing=-999)
classifier = xgb.XGBClassifier(use_label_encoder=False, n_estimators=500, n_jobs=4, max_depth=9, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9, missing=-999)
forest = ensemble.RandomForestClassifier()
decision = tree.DecisionTreeClassifier()

final_estimator = ensemble.GradientBoostingRegressor(n_estimators=25, subsample=0.5, min_samples_leaf=25, max_features=1,
random_state=42)

estimators=[clss, xgb_forest, forest, decision, classifier]

staked = ensemble.StackingClassifier(estimators=estimators, final_estimator=final_estimator)

In [41]:
staked.fit(x_train, y_train)
y_pred = staked.predict(x_test)
print(f'F1 Score {metrics.f1_score(y_test, y_pred)}')
print(f'Accuracy Score {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score {metrics.recall_score(y_test, y_pred)}')

AttributeError: 'AdaBoostClassifier' object has no attribute 'estimators_'