In [None]:
#Loading libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.ensemble import RandomForestClassifier
import time
import xgboost as xgb
import warnings
import pickle

# NOTE(review): this silences every warning for the whole session, including
# deprecation notices raised by later cells.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and newer
# releases no longer accept the old name, so use whichever spelling is installed
# and fall back to the default style if neither is.
SEABORN_STYLE = next(
    (name for name in ('seaborn', 'seaborn-v0_8') if name in plt.style.available),
    'default',
)
plt.style.use(SEABORN_STYLE)
%matplotlib inline

In [None]:
# Load the four raw CSVs: transaction and identity tables for train and test.
DATA_DIR = "../input/ieee-fraud-detection"
train_transaction = pd.read_csv(f"{DATA_DIR}/train_transaction.csv")
train_identity = pd.read_csv(f"{DATA_DIR}/train_identity.csv")
test_transaction = pd.read_csv(f"{DATA_DIR}/test_transaction.csv")
test_identity = pd.read_csv(f"{DATA_DIR}/test_identity.csv")

In [None]:
# (rows, columns) of the two raw training tables, identity first.
for raw_frame in (train_identity, train_transaction):
    print(raw_frame.shape)

In [None]:
# Left join on TransactionID: every transaction row is kept, and rows without a
# matching identity record get NaN in the identity columns.
train = train_transaction.merge(train_identity, how='left', on='TransactionID')
test = test_transaction.merge(test_identity, how='left', on='TransactionID')

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line after each report.
train.info()
test.info()

In [None]:
def differentcolumns(traincols, testcols):
    """Print each entry of ``traincols`` that is not contained in ``testcols``.

    Entries are printed one per line, in the order they appear in ``traincols``.
    Nothing is returned.
    """
    only_in_train = (name for name in traincols if name not in testcols)
    for name in only_in_train:
        print(name)
# Print the train columns that have no same-named column in test.
differentcolumns(train.columns, test.columns)

In [None]:
# Rename the test columns "id-01" .. "id-38" to the underscore spelling
# "id_01" .. "id_38"; columns not named in the mapping are left untouched.
id_renames = {f"id-{num:02d}": f"id_{num:02d}" for num in range(1, 39)}
test = test.rename(columns=id_renames)
test.head()

In [None]:
# Repeat the train-vs-test column comparison now that the test columns are renamed.
differentcolumns(train.columns, test.columns)

In [None]:
# Total number of missing cells across the whole training frame.
train.isna().sum().sum()

In [None]:
def getnulls(data):
    """Summarise missing values per column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with ``'total'`` (number of nulls) and
        ``'precent'`` (fraction of rows that are null), sorted descending.
    """
    # Build the boolean null mask and the per-column counts once; the original
    # recomputed data.isnull() for every aggregate, which is costly on wide frames.
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / null_mask.count()).sort_values(ascending=False)
    # NOTE: the 'precent' key (sic) is kept so existing column lookups keep working.
    missing_data = pd.concat([total, percent], axis=1, keys=['total', 'precent'])
    return missing_data

missing_data_train = getnulls(train)
# Show the first 350 entries, transposed so each column's stats sit side by side.
missing_data_train.iloc[:350].T

In [None]:
# Mark every column with more than 100000 missing values in train for removal.
too_sparse = missing_data_train['total'] > 100000
droppedcols = missing_data_train.loc[too_sparse].index
droppedcols

In [None]:
# Remove the selected columns from both frames (the list was derived from train).
train = train.drop(columns=droppedcols)
test = test.drop(columns=droppedcols)

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line after each report.
train.info()
test.info()

In [None]:
# (rows, columns) of train, then test.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Recompute the per-column null summary for train and show the first 200 entries.
missing_data_train = getnulls(train)
missing_data_train.iloc[:200].T

In [None]:
# Same null summary for test, first 200 entries.
missing_data_test = getnulls(test)
missing_data_test.iloc[:200].T

In [None]:
# Display the P_emaildomain column of train.
train['P_emaildomain']

In [None]:
# Frequency of each distinct P_emaildomain value in train.
train['P_emaildomain'].value_counts()

In [None]:
# Count fraudulent transactions (isFraud == 1) for five email domains.
# Boolean masks replace the original row-by-row Python loop, which did two
# label-based lookups per row and relied on the index being 0..n-1; the printed
# totals are the same.
fraud_domains = train.loc[train['isFraud'] == 1, 'P_emaildomain']
cntgmail = int((fraud_domains == 'gmail.com').sum())
cntyahoo = int((fraud_domains == 'yahoo.com').sum())
cnthotmail = int((fraud_domains == 'hotmail.com').sum())
cntanon = int((fraud_domains == 'anonymous.com').sum())
cntaol = int((fraud_domains == 'aol.com').sum())

print("GMAIL:", cntgmail)
print("YAHOO:", cntyahoo)
print("HOTMAIL:", cnthotmail)
print("ANON:", cntanon)
print("AOL:", cntaol)

In [None]:
# P_emaildomain is removed from both frames and not used as a feature.
train = train.drop(columns=['P_emaildomain'])
test = test.drop(columns=['P_emaildomain'])

In [None]:
# Refresh both null summaries, then print the first 100 entries of each
# (train first, then test), transposed.
missing_data_train = getnulls(train)
missing_data_test = getnulls(test)
for summary in (missing_data_train, missing_data_test):
    print(summary.head(100).T)

In [None]:
# Second pass: mark columns with more than 15000 missing values in train.
sparse_mask = missing_data_train['total'] > 15000
droppedcols = missing_data_train.loc[sparse_mask].index

In [None]:
# Remove the second-pass columns from both frames and report the new shapes.
train = train.drop(columns=droppedcols)
test = test.drop(columns=droppedcols)
for frame in (train, test):
    print(frame.shape)

In [None]:
# Print the train columns still missing from test after the column drops.
differentcolumns(train.columns, test.columns)

In [None]:
# Remember the row counts so the stacked frame can be split again by position.
ntrain, ntest = len(train), len(test)
# Train rows first, then test rows. The index is not reset, so index labels
# repeat between the two parts.
all_data = pd.concat([train, test], axis=0, sort=False)
all_data.shape

In [None]:
# Fill missing values in every object-dtype column with that column's most
# frequent value, computed over the stacked train+test rows.
all_data_cols = all_data.columns
object_cols = [col for col in all_data_cols if all_data[col].dtype == 'object']
for col in object_cols:
    most_common = all_data[col].mode()[0]
    all_data[col] = all_data[col].fillna(most_common)

In [None]:
# Null summary of the stacked frame, first 110 entries.
missing_data = getnulls(all_data)
missing_data.iloc[:110].T

In [None]:
# Columns whose name starts with "C" or "V" and that still contain nulls are
# filled with their most frequent value.
cv_cols = [col for col in all_data_cols if col.startswith(("C", "V"))]
for col in cv_cols:
    if all_data[col].isnull().sum() > 0:
        most_common = all_data[col].mode()[0]
        all_data[col] = all_data[col].fillna(most_common)
missing_data = getnulls(all_data)
missing_data.iloc[:10].T

In [None]:
# D1 gets its most frequent value, card3 its mean; both statistics are taken
# over the stacked train+test rows.
d1_mode = all_data['D1'].mode()[0]
card3_mean = all_data['card3'].mean()
all_data['D1'] = all_data['D1'].fillna(d1_mode)
all_data['card3'] = all_data['card3'].fillna(card3_mean)

In [None]:
# Missing card2 values are replaced by the column mean.
card2_mean = all_data['card2'].mean()
all_data['card2'] = all_data['card2'].fillna(card2_mean)

In [None]:
# Check which columns still contain nulls (top 5).
missing_data = getnulls(all_data)
missing_data.iloc[:5].T

In [None]:
# Missing card5 values are replaced by the column mean.
card5_mean = all_data['card5'].mean()
all_data['card5'] = all_data['card5'].fillna(card5_mean)

In [None]:
# Final null check on the stacked frame (top 5).
missing_data = getnulls(all_data)
missing_data.iloc[:5].T

In [None]:
# Sanity check: the two row counts should add up to the stacked frame's rows.
for value in (ntrain, ntest, ntrain + ntest, all_data.shape):
    print(value)

In [None]:
# Positional split: the first ntrain rows are train, the remainder is test.
train = all_data.iloc[:ntrain]
test = all_data.iloc[ntrain:]

In [None]:
# (rows, columns) of train, then test.
for frame in (train, test):
    print(frame.shape)

In [None]:
print(test['isFraud'].value_counts())
# `test` is a slice of all_data; an in-place drop on a slice is what triggers
# pandas' SettingWithCopyWarning (hidden here by the global warnings filter),
# so bind the name to a new frame instead of mutating the slice.
test = test.drop(columns=['isFraud'])

In [None]:
# train still holds object columns at this point (one-hot encoding happens in a
# later cell). pandas >= 2.0 no longer drops them silently in corr() and raises
# instead, so restrict the frame to numeric/bool columns explicitly.
corr = train.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, ax=ax)

In [None]:
# Write each numeric column's correlation with isFraud to corr.csv. Non-numeric
# columns are excluded explicitly because corr() raises on them in pandas >= 2.0.
train.select_dtypes(include=['number', 'bool']).corr()['isFraud'].to_csv("corr.csv")

In [None]:
# Stack train and test again so the encoding below sees the same categories
# for both parts.
ntrain, ntest = len(train), len(test)
all_data = pd.concat([train, test], axis=0, sort=False)
print(all_data.shape)

In [None]:
# Replace all_data with its pd.get_dummies() encoding (indicator columns for the
# non-numeric columns), then show the new shape and the first rows.
all_data = pd.get_dummies(all_data)
print(all_data.shape)
all_data.head()

In [None]:
# Split the encoded frame back by position and remove TransactionID from the
# features. Non-mutating drops are used because the originals dropped in place
# on slices of all_data, which triggers SettingWithCopyWarning.
train = all_data.iloc[:ntrain].drop(columns=['TransactionID'])
test_rows = all_data.iloc[ntrain:]
# Keep the ids of the test rows for the submission file.
test_id = test_rows['TransactionID']
test = test_rows.drop(columns=['TransactionID'])
print(train.shape)
print(test.shape)

In [None]:
# Print any train column that is absent from test after the encoding and split.
differentcolumns(train.columns, test.columns)

In [None]:
# Pull the label out of train, then remove it from both feature frames.
# The drops rebind the names instead of mutating in place, because train/test
# may be slices of all_data and in-place drops on slices raise
# SettingWithCopyWarning.
target = train['isFraud']
train = train.drop(columns=['isFraud'])
test = test.drop(columns=['isFraud'])
print(train.shape)
print(test.shape)

In [None]:
%%time
# Fit an XGBoost classifier on all training rows; no hold-out split or
# validation score is computed in the cells shown here.
# NOTE(review): max_depth=50 is very deep for boosted trees — confirm it is intentional.
# NOTE(review): `nthread` looks like the older spelling of `n_jobs` — verify
# against the installed xgboost version.
xgmodel = xgb.XGBClassifier(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=50, 
                             min_child_weight=1.7817, n_estimators=200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, random_state =7, nthread = -1)
xgmodel.fit(train, target)

In [None]:
# Per-class probabilities for the test rows; column 1 is what gets written to
# the submission.
y_pred_xg = xgmodel.predict_proba(test)

In [None]:
# Display the raw probability array.
y_pred_xg

In [None]:
# Pair each test TransactionID with its predicted probability from column 1 of
# y_pred_xg and write the result without the index.
sub = pd.DataFrame({'TransactionID': test_id, 'isFraud': y_pred_xg[:, 1]})
sub.to_csv('submission_xgb.csv', index=False)