In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer # Imputing missing values
from imblearn.under_sampling import RandomUnderSampler # Class Imbalance

# Numerical features selection
from sklearn.decomposition import PCA 

# Categorical Features Selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Classification Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# Model validation
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Model Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Metrics
from sklearn import metrics

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer

# Other General Imports
import gc
%matplotlib inline
import time
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')

# Data Loading and Reducing the Size

Since the data is big in size, we will use function to reduce its memory for fast processing and consuming less storage.

In [2]:
start = time.time()
# Helper function
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [None]:
# loading train_transaction data
tt = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
print(tt.shape)
tt = reduce_mem_usage(tt)

# loading train_transaction data
ti = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')
print(ti.shape)
ti = reduce_mem_usage(ti)

train = pd.merge(tt, ti, how = 'left')
print('Train shape',train.shape)

train.head()

del tt, ti

In [None]:
# loading test_transaction data
ts = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
print(ts.shape)
ts = reduce_mem_usage(ts)

tsi = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')
print(tsi.shape)
tsi = reduce_mem_usage(tsi)

test = pd.merge(ts, tsi, how = 'left')
print('Test shape',test.shape)
test.head()

del ts, tsi

# Data Preparation

In [None]:
def summary(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values

    return summary

In [None]:
summary(train)

In [None]:
summary(test)

## Feature Engineering 1

### Handling and Genearating Features

In [3]:
def Devices(df):
    df['device_name'] = df['DeviceInfo'].str.split('/', expand=True)[0]
    df['device_version'] = df['DeviceInfo'].str.split('/', expand=True)[1]
    df = df.drop(['DeviceInfo'], axis = 1)
    
    df['OS_id_30'] = df['id_30'].str.split(' ', expand=True)[0]
    df['version_id_30'] = df['id_30'].str.split(' ', expand=True)[1]
    df = df.drop(['id_30'], axis = 1)
    
    df['browser_id_31'] = df['id_31'].str.split(' ', expand=True)[0]
    df['version_id_31'] = df['id_31'].str.split(' ', expand=True)[1]
    df = df.drop(['id_31'], axis = 1)

    df['screen_width'] = df['id_33'].str.split('x', expand=True)[0]
    df['screen_height'] = df['id_33'].str.split('x', expand=True)[1]
    df = df.drop(['id_33'], axis = 1)

    df['id_34'] = df['id_34'].str.split(':', expand=True)[1]
    df['id_23'] = df['id_23'].str.split(':', expand=True)[1]
    df = df.drop(['id_34', 'id_23'], axis = 1)


    df.loc[df['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    df.loc[df['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    df.loc[df['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    df.loc[df['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    df.loc[df['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    df.loc[df['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    df.loc[df['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    df.loc[df['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    df.loc[df.device_name.isin(df.device_name.value_counts()[df.device_name.value_counts() < 200].index), 'device_name'] = "Others"
    df['had_id'] = 1
    gc.collect()
    
    return df



In [None]:
train = Devices(train)

In [4]:
def DevicesT(df):
    df['device_name'] = df['DeviceInfo'].str.split('/', expand=True)[0]
    df['device_version'] = df['DeviceInfo'].str.split('/', expand=True)[1]
    df = df.drop(['DeviceInfo'], axis = 1)
    
    df['OS_id_30'] = df['id-30'].str.split(' ', expand=True)[0]
    df['version_id_30'] = df['id-30'].str.split(' ', expand=True)[1]
    df = df.drop(['id-30'], axis = 1)
    
    df['browser_id_31'] = df['id-31'].str.split(' ', expand=True)[0]
    df['version_id_31'] = df['id-31'].str.split(' ', expand=True)[1]
    df = df.drop(['id-31'], axis = 1)

    df['screen_width'] = df['id-33'].str.split('x', expand=True)[0]
    df['screen_height'] = df['id-33'].str.split('x', expand=True)[1]
    df = df.drop(['id-33'], axis = 1)

    df['id-34'] = df['id-34'].str.split(':', expand=True)[1]
    df['id-23'] = df['id-23'].str.split(':', expand=True)[1]
    df = df.drop(['id-34', 'id-23'], axis = 1)


    df.loc[df['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    df.loc[df['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    df.loc[df['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    df.loc[df['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    df.loc[df['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    df.loc[df['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    df.loc[df['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    df.loc[df['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    df.loc[df.device_name.isin(df.device_name.value_counts()[df.device_name.value_counts() < 200].index), 'device_name'] = "Others"
    df['had_id'] = 1
    gc.collect()
    
    return df




In [None]:
test = DevicesT(test)

## Dealing with Missing Data

In [None]:
y = train['isFraud']
TrainTransactionID = train['TransactionID']
TrainTransactionDT = train['TransactionDT']

In [None]:
TestTransactionID = test['TransactionID']
TestTransactionDT = test['TransactionDT']

In [None]:
train = train.drop(['TransactionID', 'TransactionDT' , 'isFraud'], axis = 1)

In [None]:
test = test.drop(['TransactionID', 'TransactionDT'], axis = 1)

In [None]:
# Dropping columns with more than 80% missing values 
print("Train shape before dropping features more than 30% missing values: ", train.shape)
mv = train.isnull().sum()/len(train)
train = train.drop(columns=mv[mv>0.7].index)

print("Train shape after dropping features more than 30% missing values: ", train.shape)

In [None]:
print("Test shape before dropping features more than 30% missing values: ", test.shape)
mv = test.isnull().sum()/len(test)
test = test.drop(columns=mv[mv>0.7].index)

print("Test shape after dropping features more than 30% missing values: ", test.shape)

In [None]:
train.dropna(how= 'all', axis = 0)

In [None]:
test.dropna(how= 'all', axis = 0)

In [None]:
# Filtering numerical data
num_df = train.select_dtypes(include=np.number)
print(num_df.shape)

# Filtering categorical data
cat_df = train.select_dtypes(exclude=np.number)
print(cat_df.shape)

In [None]:
# Filtering numerical data
num_df_test = test.select_dtypes(include=np.number)
print(num_df.shape)

# Filtering categorical data
cat_df_test = test.select_dtypes(exclude=np.number)
print(cat_df.shape)

In [None]:
del train
del test
gc.collect()

In [None]:
# Filling missing values by median for numerical columns 
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
num_df = pd.DataFrame(imp_median.fit_transform(num_df), columns=num_df.columns)
print(num_df.shape)

# Filling missing values by most frequent value for categorical columns
for i in cat_df:
    cat_df[i] = cat_df[i].fillna(np.NaN)
    
cat_df.isnull().sum()
num_df.isnull().sum()

In [None]:
# Filling missing values by median for numerical columns 
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
num_df_test = pd.DataFrame(imp_median.fit_transform(num_df_test), columns=num_df_test.columns)
print(num_df_test.shape)

# Filling missing values by most frequent value for categorical columns
for i in cat_df_test:
    cat_df_test[i] = cat_df_test[i].fillna(np.NaN)
    
cat_df_test.isnull().sum()
num_df_test.isnull().sum()

## Numerical Features

In [None]:
scaled_num = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(num_df), columns = num_df.columns)
scaled_num.head()

In [None]:
scaled_num_test = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(num_df_test), columns = num_df_test.columns)
scaled_num_test.head()

### PCA for Numerical Features

In [None]:
#optimuadd_suffixm number of components
pca = PCA().fit(scaled_num)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel("number of components")
plt.ylabel("Cumulative Rate of Variance")

#final
pca = PCA(n_components = 0.99)

pca_fit = pca.fit_transform(scaled_num)
num_pca = pd.DataFrame(data = pca_fit)
pca.explained_variance_ratio_.sum()
del num_df, scaled_num

In [None]:
num_pca.shape

In [None]:
#optimuadd_suffixm number of components
pca_test = PCA().fit(scaled_num_test)
plt.plot(np.cumsum(pca_test.explained_variance_ratio_))
plt.xlabel("number of components")
plt.ylabel("Cumulative Rate of Variance")

#final
pca_test = PCA(n_components = 68)

pca_fit_test = pca_test.fit_transform(scaled_num_test)
num_pca_test = pd.DataFrame(data = pca_fit_test)
pca_test.explained_variance_ratio_.sum()
del num_df_test, scaled_num_test

In [None]:
num_pca_test.shape

## Categorical Features

Due to we have too many values in some categories, we try to map them into new categories.

In [None]:
for x in cat_df.columns:
    #printing unique values
    print(x ,':', len(cat_df[x].unique()))

In [None]:
cat_df = pd.get_dummies(cat_df)
cat_df.head()

In [None]:
cat_df_test = pd.get_dummies(cat_df_test)
cat_df_test.head()

# Tree-based Categorical Feature Selection

In [None]:
clf = ExtraTreesClassifier(n_estimators=200, criterion = 'entropy')
clf = clf.fit(cat_df, y)

model = SelectFromModel(clf, prefit=True)
feature_idx = model.get_support()
feature_name = cat_df.columns[feature_idx]

cat_new = pd.DataFrame(model.transform(cat_df), columns = feature_name)
cat_new.head()

In [None]:
cat_new_test = pd.DataFrame(cat_df_test, columns = feature_name)
cat_new_test.head()

### Concatenating Numerical and Categorical Features 

In [None]:
# Concatinating numerical and categorical data
train = pd.concat([y, num_pca, cat_df], axis=1)
train = pd.DataFrame(train)

# Verifying missing values
print(f'Total missing values: {train.isnull().sum().sum()}')
print(train.shape)
train.head()

In [None]:
del  num_pca, cat_df, cat_new, y
gc.collect()

In [None]:
# Concatinating numerical and categorical data
test = pd.concat([num_pca_test, cat_df_test], axis=1)
test = pd.DataFrame(test)

# Verifying missing values
print(f'Total missing values: {test.isnull().sum().sum()}')
print(test.shape)
test.head()

In [None]:
del  cat_df_test, num_pca_test, cat_new_test
gc.collect()

In [None]:
train = reduce_mem_usage(train)

In [None]:
train.head()

In [None]:
test = reduce_mem_usage(test)

In [None]:
test.shape

In [None]:
test.head()

In [None]:
gc.collect()

In [None]:
y = train['isFraud']
X = train.drop(['isFraud'], axis = 1)
X_array=np.array(X)

rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X, y)

print(X_resampled.shape, y_resampled.shape)

pd.value_counts(y_resampled)

In [None]:
X_df = pd.DataFrame(X_resampled, columns = X.columns)
y_df = pd.DataFrame(y_resampled)
df = pd.concat([X_df, y_df], axis= 1)
df.shape 
df.head()

In [None]:
X = df.drop(["isFraud"], axis = 1)
y = df["isFraud"]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [None]:
gc.collect()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler 
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

space = {
    "bootstrap": hp.choice("bootstrap", [False]),
    "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000]),
    "max_depth": hp.choice("max_depth", [10, 20, 30, 40, 50, 100, 200, 250, 300, 350, 400]),
    "criterion": hp.choice("criterion", ["entropy"]),
    "max_features": hp.choice("max_features", ['sqrt', 'auto']),
    "min_samples_leaf": hp.choice ("min_samples_leaf", [1, 3, 5, 7, 9]),
    "min_samples_split": hp.choice("min_samples_split", [2, 5, 8, 10, 12, 15, 20])
    
}

def hyperparameter_tuning(params):
    rf = RandomForestClassifier(**params,n_jobs=-1, verbose = 0)
    rf.fit(X_train, y_train)
    y_scores = rf.predict_proba(X_test)
    roc= metrics.roc_auc_score(y_test, y_scores[:,1])
    return {"loss": -roc, "status": STATUS_OK}


trials = Trials()

best = fmin(
    fn=hyperparameter_tuning,
    space = space, 
    algo=tpe.suggest, 
    max_evals=5, 
    trials=trials
)

print("Best: {}".format(best))

In [None]:
rf = RandomForestClassifier(bootstrap = False,
                            criterion = "entropy",
                            max_depth= 350,
                            n_estimators= 1600,
                            min_samples_split = 12,
                            min_samples_leaf = 3,
                            max_features= 'sqrt')


rf.fit(X_train, y_train)
y_scores = rf.predict_proba(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_scores[:,1])
RFScore = metrics.auc(fpr, tpr)
print(RFScore)


# plot ROC curve
fig = plt.figure(figsize=(6, 6))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve- Random Forest')
plt.show()

In [None]:
y_scores = rf.predict_proba(test)

In [None]:
def column(matrix, i):
    return [row[i] for row in matrix]

sub =column(y_scores, 1)

In [None]:
result = pd.DataFrame(sub, columns = ['isFraud'])
transactionID = pd.DataFrame(TestTransactionID)

In [None]:
sub = pd.concat([transactionID, result], axis =1)

In [None]:
sub.head()

In [None]:
sub.to_csv('sub.csv', index=False)

In [5]:
# loading train_transaction data
tt = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
print(tt.shape)
tt = reduce_mem_usage(tt)

# loading train_transaction data
ti = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
print(ti.shape)
ti = reduce_mem_usage(ti)

train = pd.merge(tt, ti, how = 'left')
print('Train shape',train.shape)

train.head()


del tt, ti

# loading test_transaction data
ts = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')
print(ts.shape)
ts = reduce_mem_usage(ts)

tsi = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')
print(tsi.shape)
tsi = reduce_mem_usage(tsi)

test = pd.merge(ts, tsi, how = 'left')
print('Test shape',test.shape)
test.head()

del ts, tsi

(590540, 394)
Memory usage of dataframe is 1775.15 MB
Memory usage after optimization is: 487.16 MB
Decreased by 72.6%
(144233, 41)
Memory usage of dataframe is 45.12 MB
Memory usage after optimization is: 10.00 MB
Decreased by 77.8%
Train shape (590540, 434)
(506691, 393)
Memory usage of dataframe is 1519.24 MB
Memory usage after optimization is: 425.24 MB
Decreased by 72.0%
(141907, 41)
Memory usage of dataframe is 44.39 MB
Memory usage after optimization is: 9.84 MB
Decreased by 77.8%
Test shape (506691, 433)


In [6]:
def ColumnsDropper(df):
    mv = df.isnull().sum()/len(df)
    df = df.drop(columns=mv[mv>0.8].index)
    return df

train = Devices(train)
train = ColumnsDropper(train)
test = DevicesT(test)
test = ColumnsDropper(test)

In [7]:
y, TransactionID, TransactionDT = train['isFraud'], train['TransactionID'], train['TransactionDT']
X = train.drop(['isFraud', 'TransactionID', 'TransactionDT'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
num_df = X_train.select_dtypes(include=np.number).columns
# Numerical Preprocessing
num_pre = Pipeline(steps =
                   [("Num Imputer", SimpleImputer(missing_values=np.nan, strategy='median')),
                    ("Scaler", preprocessing.MinMaxScaler()),
                    ("PCA", PCA(n_components = 0.95))])


cat_df = X_train.select_dtypes(exclude=np.number).columns
# Categorical Preprocessing
cat_pre = Pipeline(steps =
                   [("Cat Imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                    ("onehot", OneHotEncoder(handle_unknown='ignore')),
                    ("Categorical_Selector", SelectFromModel(ExtraTreesClassifier(n_estimators=100,
                                                                       criterion = 'entropy')))])

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pre, num_df),
        ('cat', cat_pre, cat_df)])

In [10]:
#from imblearn.pipeline import Pipeline
pipe = Pipeline(steps = 
                [('preprocessor', preprocessor),
                 #('UnderSampling', RandomUnderSampler(random_state=42)),
                 ('RandomForest', RandomForestClassifier())])

pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('Num '
                                                                   'Imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('Scaler',
                                                                   MinMaxScaler()),
                                                                  ('PCA',
                                                                   PCA(n_components=0.95))]),
                                                  Index(['TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2',
       'dist1', 'C1', 'C2',
       ...
       'id_01', 'id_02', 'id_05', 'id_06', 'id_11', 'id_13', 'id_17', 'id_1...
                                                                   SelectFromMod

In [11]:
y_hat = pipe.predict(X_test)
y_scores = pipe.predict_proba(X_test)
print(metrics.confusion_matrix(y_hat, y_test))
print(metrics.accuracy_score(y_test, y_hat))
print(metrics.precision_score(y_test, y_hat, average='weighted'))
print(metrics.recall_score(y_test, y_hat, average='weighted'))
print(metrics.f1_score(y_test, y_hat, average='weighted'))
print(metrics.roc_auc_score(y_test, y_scores[:,1]))

[[142220   3434]
 [   110   1871]]
0.9759948521692011
0.975275473804281
0.9759948521692011
0.9706576064691285
0.9092129519661766


In [13]:
y_scores = pipe.predict_proba(test)

KeyError: "['id_01', 'id_02', 'id_05', 'id_06', 'id_11', 'id_13', 'id_17', 'id_19', 'id_20'] not in index"

In [None]:
def column(matrix, i):
    return [row[i] for row in matrix]

sub =column(y_scores, 1)

In [None]:
result = pd.DataFrame(sub, columns = ['isFraud'])
transactionID = pd.DataFrame(TestTransactionID)

In [None]:
sub = pd.concat([transactionID, result], axis =1)
sub.to_csv('sub.csv', index=False)