In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support,classification_report, confusion_matrix
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
from sklearn.linear_model import Ridge
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Display names ordered by label value: sklearn's confusion_matrix sorts the labels,
# so row/column 0 belongs to label 0 and row/column 1 to label 1.
# NOTE(review): assumes `suspicious` is encoded as 1 (the notebook filters on
# `suspicious == 1` for the suspicious subset) - confirm against the data.
class_names = ['non-suspicious', 'suspicious']

#These are the functions I will use for result visualisation
def plot_confusion_matrix(y_test,y_predicted,labels):
    """Draw the confusion matrix of `y_predicted` against `y_test` as an annotated heatmap.

    Args:
        y_test: true labels.
        y_predicted: predicted labels or scores; they are rounded before counting.
        labels: display names for the classes, in the order of the sorted label
            values (one name per row/column of the confusion matrix).

    Raises:
        ValueError: if the heatmap cannot be drawn with integer annotations.
    """
    cm = confusion_matrix(y_test, np.round(y_predicted))

    # Use the caller-supplied names rather than the module-level list.
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)

    fig, ax = plt.subplots(figsize=(10, 7))
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d", ax=ax)
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    plt.show()
    return

def evaluate_classifier(pipeline, x_test, y_test):
    """Print a classification report and plot the confusion matrix for `pipeline`.

    Args:
        pipeline: fitted estimator exposing `predict`.
        x_test: features to predict on.
        y_test: true labels matching `x_test`.

    Returns:
        None. The report is printed and the confusion matrix is plotted.
    """
    predictions = pipeline.predict(x_test)

    # Predictions are rounded for the report only; the plotting helper receives
    # the raw predictions and rounds them itself.
    print(classification_report(y_test, np.round(predictions)))
    plot_confusion_matrix(y_test, predictions, class_names)

In [3]:
# Read the train and test tables, using the `customer` column as the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col='customer')
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [4]:
# One-hot encode the same categorical columns in both tables.
categorical_columns = ['category', 'is_pep', 'nationality']
train_dummies = pd.get_dummies(train_data, columns=categorical_columns)
test_dummies = pd.get_dummies(test_data, columns=categorical_columns)

In [5]:
# Training features and target. y_train is kept as a single-column DataFrame because
# later cells index it with `.values[...]`.
x_train = train_dummies.drop(['suspicious'], axis=1)
y_train = pd.DataFrame(train_dummies['suspicious'])

# Align the test features with the training features: same columns, same order.
# One-hot encoding each table separately yields a column only for the categories
# present in that table, so dummy columns missing from the test set are added as
# all-zero and columns unknown to the training set are dropped. This replaces the
# hard-coded `nationality_117` patch, which appended the column at the end (a
# different position than in x_train) and handled only that one category.
x_test = test_dummies.reindex(columns=x_train.columns, fill_value=0)

In [None]:
# Preview the first rows of the encoded training features.
x_train.head()

In [None]:
# Random forest with balanced class weights, preceded by a Normalizer step.
random_forest = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    max_depth=10,
    class_weight='balanced',
    random_state=0,
    n_jobs=-1,
)
pipeline = Pipeline([('normalizer', Normalizer()), ('random_forest', random_forest)])

In [None]:
# y_train is a single-column DataFrame; flatten it to 1-D so the classifier receives
# a label vector instead of a column vector (sklearn warns about the latter).
pipeline.fit(x_train, y_train.values.ravel())

In [None]:
# Hard class predictions of the fitted pipeline on the test features.
y_pred = pipeline.predict(x_test)

In [None]:
# Report and confusion matrix on the data passed here, which is the TRAINING set
# (x_train / y_train), not held-out data.
evaluate_classifier(pipeline=pipeline, x_test=x_train, y_test=y_train)

In [None]:
# Split the encoded training table by target value and separate features from target.
is_suspicious = train_dummies['suspicious'] == 1
is_non_suspicious = train_dummies['suspicious'] == 0

# Rows labelled suspicious (target == 1).
train_dummies_suspicious = train_dummies[is_suspicious]
x_train_suspicious = train_dummies_suspicious.drop(columns=['suspicious'])
y_train_suspicious = train_dummies_suspicious['suspicious'].to_frame()

# Rows labelled non-suspicious (target == 0).
train_dummies_non_suspicious = train_dummies[is_non_suspicious]
x_train_non_suspicious = train_dummies_non_suspicious.drop(columns=['suspicious'])
y_train_non_suspicious = train_dummies_non_suspicious['suspicious'].to_frame()

In [None]:
# Draw as many random row positions from the non-suspicious rows as there are
# suspicious rows, and keep them in a variable so this cell does not depend on a
# later cell having been run first.
indexes_selected_non_suspicious = np.random.choice(
    x_train_non_suspicious.shape[0], x_train_suspicious.shape[0]
)
# The sampled values are row positions, so select with .iloc (plain [] would look
# them up as column labels).
x_train_non_suspicious.iloc[indexes_selected_non_suspicious]

In [None]:
# Evaluate the current pipeline ten times on a class-balanced sample: all suspicious
# rows plus an equally sized random draw (with replacement) of non-suspicious rows.
n_non_suspicious = x_train_non_suspicious.shape[0]
n_suspicious = x_train_suspicious.shape[0]

for round_idx in range(10):
    indexes_selected_non_suspicious = np.random.choice(n_non_suspicious, n_suspicious)

    x_selected_non_suspicious = x_train_non_suspicious.iloc[indexes_selected_non_suspicious].values
    y_selected_non_suspicious = y_train_non_suspicious.iloc[indexes_selected_non_suspicious].values

    # Stack the sampled non-suspicious rows on top of all suspicious rows.
    x_total = np.concatenate((x_selected_non_suspicious, x_train_suspicious), axis=0)
    y_total = np.concatenate((y_selected_non_suspicious, y_train_suspicious), axis=0)

    evaluate_classifier(pipeline, x_total, y_total)

In [None]:
from sklearn.datasets import make_classification

# Synthetic two-class dataset requested with 90% / 10% class weights, used to
# illustrate class imbalance.
synthetic_params = dict(
    n_samples=100, n_features=20,
    n_informative=3, n_redundant=1,
    n_classes=2, n_clusters_per_class=1,
    weights=[0.9, 0.1], class_sep=1.5,
    flip_y=0, random_state=10,
)
X, y = make_classification(**synthetic_params)

df = pd.DataFrame(X).assign(target=y)

# Bar chart of the number of samples per class.
df['target'].value_counts().plot(kind='bar', title='Count (target)');

In [None]:
# Five rounds of: train on a class-balanced subsample of a random 10% of the rows,
# then evaluate on the remaining 90%.
accs = []  # validation accuracy of each round

for i in range(5):
    n_rows = x_train.shape[0]

    # Sample the training rows WITHOUT replacement so no row is duplicated, and
    # split by position (a boolean mask) rather than by index label.
    indices = np.random.choice(n_rows, int(n_rows * 0.1), replace=False)
    in_train = np.zeros(n_rows, dtype=bool)
    in_train[indices] = True

    x_train_train = x_train.values[in_train]
    x_train_validation = x_train.values[~in_train]
    # Flatten the single-column targets to 1-D label vectors.
    y_train_train = y_train.values[in_train].ravel()
    y_train_validation = y_train.values[~in_train].ravel()

    # Balance the classes by randomly dropping majority-class rows.
    # `fit_resample` replaces the removed `fit_sample` / `return_indices` API.
    rus = RandomUnderSampler()
    X_rus, y_rus = rus.fit_resample(x_train_train, y_train_train)

    # NOTE: this refits the shared `pipeline`, so cells that run afterwards use
    # the model from the last round.
    pipeline.fit(X_rus, y_rus)

    # evaluate_classifier only prints/plots and returns None, so the accuracy is
    # collected separately through the pipeline's own `score`.
    evaluate_classifier(pipeline, x_train_validation, y_train_validation)
    accs.append(pipeline.score(x_train_validation, y_train_validation))


In [None]:
# Score the test set with the current pipeline, rank the rows by the probability in
# column 1 of predict_proba, and write the index of the 1000 highest-scoring rows to
# res.csv.
predicted_labels = pipeline.predict_proba(x_test)

data_set_with_label = (
    x_test.copy(deep=True)
    .assign(label=predicted_labels[:, 1])
    .sort_values(by='label', ascending=False)
)

aaa = data_set_with_label.head(1000)
aaa.index.to_frame().to_csv("res.csv", index=False)

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import math
from tqdm import tnrange, tqdm_notebook

In [30]:
# D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
batch_size, D_in, H, D_out = 64, x_train.shape[1], 100, 1
# (translated from French) the first row, which contains the frame number, is of no
# interest to the NN
# NOTE(review): this remark looks carried over from another project - confirm or remove.

#Define the model sequentially
# Three hidden ReLU layers; the last Linear layer has no activation, so the network
# outputs raw logits.
m = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

lambda_ = 1e-4  # passed as the learning rate to the training function
epochs = 1000   # number of training iterations (one mini-batch each)
# The network emits logits, so use the loss that applies the sigmoid internally.
# Plain BCELoss expects probabilities in [0, 1].
loss = torch.nn.BCEWithLogitsLoss()

In [31]:
def train_NNmodel_batch(model, xs, ys, learning_rate, loss_fn, number_of_epochs, batch_size):
    """Train `model` in place with Adam on randomly drawn mini-batches.

    Every iteration draws ONE random mini-batch (without replacement) and performs a
    single optimisation step, so an "epoch" here is one update rather than a full
    pass over the data.

    Args:
        model: torch module to optimise; it is moved to the selected device.
        xs: array-like of features, indexed by row along the first axis.
        ys: array-like of targets, one row per row of `xs`.
        learning_rate: learning rate passed to Adam.
        loss_fn: callable `loss_fn(prediction, target)` returning a scalar tensor.
        number_of_epochs: number of mini-batch updates to perform.
        batch_size: rows per mini-batch; capped at the number of available rows.

    Returns:
        None. The parameters of `model` are updated in place.
    """
    # Use the GPU when there is one, otherwise the CPU. The global default tensor
    # type is intentionally not changed: forcing 'torch.cuda.FloatTensor' fails on
    # CPU-only machines and affects every tensor created afterwards.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Convert once to float32 arrays so mixed/object-dtype input (e.g. DataFrame
    # values with boolean dummy columns) can be turned into tensors.
    xs = np.asarray(xs, dtype=np.float32)
    ys = np.asarray(ys, dtype=np.float32)

    # Sampling without replacement cannot draw more rows than exist.
    n_samples = xs.shape[0]
    batch_size = min(batch_size, n_samples)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in tqdm_notebook(range(number_of_epochs), desc='Epoch', leave=False):

        # Random row positions for this mini-batch.
        indexes_batch = np.random.choice(n_samples, batch_size, replace=False)

        # Only the current mini-batch is copied to the device.
        x = torch.tensor(xs[indexes_batch], dtype=torch.float, device=device)
        y = torch.tensor(ys[indexes_batch], dtype=torch.float, device=device)

        # Forward pass: compute predicted y by passing x to the model.
        y_pred = model(x)

        # Compute the loss, then take one optimisation step.
        loss = loss_fn(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [32]:
# Train the network on the full training table. `lambda_` is passed as the learning
# rate and `epochs` as the number of iterations.
train_NNmodel_batch(
    model=m,
    xs=x_train.values,
    ys=y_train.values,
    learning_rate=lambda_,
    loss_fn=loss,
    number_of_epochs=epochs,
    batch_size=batch_size,
)

HBox(children=(IntProgress(value=0, description='Epoch', max=1000, style=ProgressStyle(description_width='init…

RuntimeError: cuda runtime error (2) : out of memory at C:/ProgramData/Miniconda3/conda-bld/pytorch_1533090623466/work/aten/src/THC/THCTensorCopy.cu:206

In [None]:
def predict_labels(model, validation, top_k=1000, output_path="res.csv"):
    """Score `validation` with `model`, rank the rows and save the top-ranked index.

    The rows are sorted by decreasing score and the index of the first `top_k` rows
    is written to `output_path` as CSV.

    Args:
        model: either an estimator exposing `predict_proba` (column 1 is used as the
            score) or a torch module called on a float tensor of the features.
        validation: DataFrame of features; it is not modified.
        top_k: number of highest-scoring rows whose index is written out.
        output_path: destination of the CSV file.

    Returns:
        A copy of `validation` with an added `label` score column, sorted by
        decreasing score.
    """
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(validation)[:, 1]
    else:
        # Torch module: run without autograd on the device the model lives on.
        # NOTE(review): assumes the module outputs one logit per row; the sigmoid is
        # monotonic, so it does not change the ranking either way.
        device = next(model.parameters()).device
        features = torch.tensor(
            np.asarray(validation, dtype=np.float32), dtype=torch.float, device=device
        )
        with torch.no_grad():
            scores = torch.sigmoid(model(features)).cpu().numpy().ravel()

    data_set_with_label = validation.copy(deep=True)
    data_set_with_label['label'] = scores
    data_set_with_label = data_set_with_label.sort_values(by='label', ascending=False)

    # Persist only the index of the top-ranked rows.
    data_set_with_label.head(top_k).index.to_frame().to_csv(output_path, index=False)

    return data_set_with_label

In [29]:
# Run inference on the GPU when one is available, otherwise on the CPU. The global
# default tensor type is left alone: forcing 'torch.cuda.FloatTensor' fails on
# CPU-only machines and makes every newly created tensor a GPU tensor.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
m.to(device)

# Score the test set in chunks and without autograd bookkeeping, moving each result
# back to the CPU, to reduce peak device memory compared with one full-size pass.
inference_chunk = 4096  # rows per forward pass
x_test_array = np.asarray(x_test.values, dtype=np.float32)
with torch.no_grad():
    y_pred = torch.cat([
        m(torch.tensor(x_test_array[start:start + inference_chunk],
                       dtype=torch.float, device=device)).cpu()
        for start in range(0, x_test_array.shape[0], inference_chunk)
    ])

RuntimeError: CUDA error: out of memory