In [1]:
import pandas as pd
from creditcard_mod import helpers
# ignorer les FuturWarnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
df = helpers.load_csv("../datasets/data.csv")
df.head(3)

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,1093826151,4,M,'28007',348934600,'28007',transportation,4.55,0
1,0,352968107,2,M,'28007',348934600,'28007',transportation,39.68,0
2,0,2054744914,4,F,'28007',1823072687,'28007',transportation,26.89,0


In [2]:
df['step'] = 1577836800 + df['step'] * 3600 * 24
df['step'] = pd.to_datetime(df['step'], unit='s')

In [None]:
df.step

In [3]:
compute_features = {
    "nbre_trans_7jrs": 7,
    "nbre_trans_15jrs": 15,
    "nbre_trans_30jrs": 30   
}
compute_features_cm = {
    "count_cust_merch_1_day":1,
    "count_cust_merch_7_day":7,
    "count_cust_merch_15_day":15,
    "count_cust_merch_1_day":30
}

In [7]:
def create_transaction_customer_historic(data):
    for key, value in compute_features.items():
        temp = pd.Series(data.index, index = data.step, name=key).sort_index()
        count_day = temp.rolling(str(value)+'d').count() -1
        count_day.index = temp.values
        data[key] = count_day.reindex(data.index)
    return data
    

In [8]:
def create_transaction_customer_merchant_historic(data):
    for key, value in compute_features_cm.items():
        temp = pd.Series(data.index, index = data.step, name=key).sort_index()
        count_day = temp.rolling(str(value)+'d').count() - 1
        count_day.index = temp.values
        data[key] = count_day.reindex(data.index)
    return data
    

In [9]:
data_train = df.groupby(['customer','merchant']).apply(create_transaction_customer_merchant_historic)

In [14]:
data_train = data_train.groupby('customer').apply(create_transaction_customer_historic)

In [18]:
data_train = data_train.drop(['customer','merchant','zipcodeOri','zipMerchant','step'],axis=1)

In [22]:
from sklearn.preprocessing import LabelEncoder

In [24]:
cat_cols = ['age', 'gender', 'category']
enc = LabelEncoder()
for col in cat_cols:
    data_train[col] = enc.fit_transform(data_train[col])

In [25]:
data_train.head()

Unnamed: 0,age,gender,category,amount,fraud,count_cust_merch_1_day,count_cust_merch_7_day,count_cust_merch_15_day,nbre_trans_7jrs,nbre_trans_15jrs,nbre_trans_30jrs
0,4,2,12,4.55,0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,2,12,39.68,0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,1,12,26.89,0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,2,12,17.25,0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,2,12,35.72,0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
helpers.save_csv(data_train,'../datasets/first_features.csv')

In [2]:
data_train = helpers.load_csv("../datasets/first_features.csv")

In [3]:
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, classification_report
# ignorer les FuturWarnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
y = data_train['fraud']
X = data_train.drop('fraud',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 1)

weights = (y == 0).sum() / (y == 1).sum()
clf = XGBClassifier(max_depth=3,scale_pos_weights=weights,n_jobs=4)
clf.fit(X_train,y_train)

print('AUPRC = {}'.format(average_precision_score(y_test, clf.predict_proba(X_test)[:,1])))



Parameters: { scale_pos_weights } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


AUPRC = 0.8755871944638264


In [6]:
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    117428
           1       0.92      0.70      0.79      1501

    accuracy                           1.00    118929
   macro avg       0.96      0.85      0.90    118929
weighted avg       1.00      1.00      1.00    118929



In [5]:
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# Import required libraries for performance metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate

scoring = {'accuracy':make_scorer(accuracy_score), 
           'precision':make_scorer(precision_score),
           'recall':make_scorer(recall_score), 
           'f1_score':make_scorer(f1_score)}
models_scores_table = pd.DataFrame(index=['Accuracy', 'Precision', 'Recall', 'F1 Score'])
for model in [
    DummyClassifier,
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    RandomForestClassifier
]:
    cls = model()
    kfold = model_selection.KFold(n_splits=2, random_state=1, shuffle=True)
    cvs = model_selection.cross_validate(
        cls, X_train, y_train, scoring=scoring, cv=kfold
    )
    models_scores_table[model.__name__]=[cvs['test_accuracy'].mean(),
                                       cvs['test_precision'].mean(),
                                       cvs['test_recall'].mean(),
                                       cvs['test_f1_score'].mean()]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
models_scores_table.head(5)

Unnamed: 0,DummyClassifier,LogisticRegression,DecisionTreeClassifier,KNeighborsClassifier,RandomForestClassifier,Best Score
Accuracy,0.98802,0.993645,0.992371,0.994188,0.995249,RandomForestClassifier
Precision,0.0,0.879822,0.67426,0.869247,0.893289,RandomForestClassifier
Recall,0.0,0.543729,0.702535,0.605936,0.685439,DecisionTreeClassifier
F1 Score,0.0,0.671653,0.688106,0.713936,0.775521,RandomForestClassifier


In [11]:
# Import required libraries for machine learning classifiers
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
# Import required libraries for performance metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate

def models_evaluation(X, y, folds):
    '''
    X : data set features
    y : data set target
    folds : number of cross-validation folds
    
    '''
    # Define dictionary with performance metrics
    scoring = {'accuracy':make_scorer(accuracy_score), 
           'precision':make_scorer(precision_score),
           'recall':make_scorer(recall_score), 
           'f1_score':make_scorer(f1_score)}
    models_scores_table = pd.DataFrame(index=['Accuracy', 'Precision', 'Recall', 'F1 Score'])
    for model in [
        DummyClassifier,
        LogisticRegression,
        DecisionTreeClassifier,
        KNeighborsClassifier,
        RandomForestClassifier,
        XGBClassifier
    ]:
        cls = model()
        kfold = model_selection.KFold(n_splits=folds, random_state=1, shuffle=True)
        cvs = model_selection.cross_validate(
            cls, X_train, y_train, scoring=scoring, cv=kfold
        )
        models_scores_table[model.__name__]=[cvs['test_accuracy'].mean(),
                                           cvs['test_precision'].mean(),
                                           cvs['test_recall'].mean(),
                                           cvs['test_f1_score'].mean()]
    # Add 'Best Score' column
    models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)
    
    # Return models performance metrics scores data frame
    return(models_scores_table)


In [12]:
models_evaluation(X_train, y_train, 7)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative 

Unnamed: 0,DummyClassifier,LogisticRegression,DecisionTreeClassifier,KNeighborsClassifier,RandomForestClassifier,Best Score
Accuracy,0.98802,0.993599,0.99242,0.994366,0.995329,RandomForestClassifier
Precision,0.0,0.876878,0.676916,0.878669,0.894272,RandomForestClassifier
Recall,0.0,0.541822,0.703317,0.614879,0.692071,DecisionTreeClassifier
F1 Score,0.0,0.66945,0.689716,0.723175,0.780155,RandomForestClassifier
