In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support,classification_report, confusion_matrix
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random

In [2]:
class_names = ['suspicious','non suspicious']

In [3]:
#These are the functions I will use for result visualisation
def plot_confusion_matrix(y_test,y_predicted,labels):
    cm = confusion_matrix(y_test, y_predicted,labels =labels)

    figsize = (10,7)
    df_cm = pd.DataFrame(
        cm, index=class_names, columns=class_names, 
    )
    fig = plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    return

def evaluate_classifier(pipeline, x_test, y_test):

    y_predicted = pipeline.predict(x_test)

    report  = classification_report(y_test,y_predicted)
    print(report)
    plot_confusion_matrix(y_test, y_predicted, class_names)
    return

In [4]:
data = pd.read_csv('./data/train.csv',index_col='customer',encoding='utf8')
data_test = pd.read_csv('./data/test.csv',index_col='customer',encoding='utf8')

In [5]:
#cols_to_transform = ['is_pep','category']
#data = pd.get_dummies(data=data, columns = cols_to_transform)
data['suspicious'] = data['suspicious'].map({ 1 : 'suspicious', 0: 'non suspicious'})

In [6]:
y = data.suspicious
x = data.drop('suspicious',axis=1)
x.drop('nationality', axis=1, inplace=True)
x.drop('category', axis=1, inplace=True)
x.drop('is_pep', axis=1, inplace=True)

In [78]:
train_x, test_x, train_y, test_y = train_test_split(x,y,test_size=0.2, random_state= 1)

In [79]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbc = GradientBoostingClassifier(n_estimators=200, subsample=0.8)
rfc = RandomForestClassifier(n_estimators=50)
clf_bag = BaggingClassifier(base_estimator = gbc, n_estimators=50)
clf_bag.fit(train_x, train_y)

** TO PRODUCE THE PREDICTION **

In [57]:
data_test.drop('nationality', axis=1, inplace=True)
data_test.drop('category', axis=1, inplace=True)
data_test.drop('is_pep', axis=1, inplace=True)

In [59]:
predicted_labels = clf_bag.predict_proba(data_test)
data_set_with_label = data_test.copy(deep=True)
data_set_with_label['label'] = predicted_labels[:,1]
data_set_with_label = data_set_with_label.sort_values(by='label',ascending=False)
aaa = data_set_with_label.head(1000)
aaa.index.to_frame().to_csv("val.csv",index=False)

** PART 3: MINIMIZE THE COST **

In [75]:
def get_cost(param, *args):
    
    label = args[0][0]
    test_y = args[0][1]
    N = int(param)
    fraudulent = label.head(N)
    susp = test_y[test_y == 'suspicious']
    
    M = len(susp) - len(set(fraudulent.index) & set(susp.keys()))
    
    score = 1000*N+60000*M

    return score

In [76]:
y_predicted = clf_bag.predict_proba(test_x)
data_set_with_label = test_x.copy(deep=True)
data_set_with_label['label'] = y_predicted[:,1]
data_set_with_label = data_set_with_label.sort_values(by='label',ascending=False)

In [77]:
from scipy.optimize import minimize_scalar

minimize_scalar(get_cost, args = [data_set_with_label, test_y])

     fun: 183353000
    nfev: 30
     nit: 29
 success: True
       x: 52373.471332469984