In [1]:
### Dependencies ###
import fireducks.pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score

In [2]:
### Train Test Split ###
# Load the fraud dataset, then separate the label column from the predictors.
fraud_df = pd.read_csv("data/fraud_df_mv_b_nc.csv")
y = fraud_df['fraud_bool']
X = fraud_df.drop(columns='fraud_bool')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
### Viewing results ###
# X.head()
# y_test

175140     0
363742     0
509826     0
225305     0
808329     0
          ..
1048334    1
1667681    1
699098     0
1904411    1
468400     0
Name: fraud_bool, Length: 395589, dtype: int64

In [3]:
### Feature Engineering ###

# Partition the training columns by dtype: 'object' columns are treated as
# categorical, every other dtype as numerical.
categorical = [name for name, dtype in X_train.dtypes.items() if dtype == 'object']
numerical = [name for name, dtype in X_train.dtypes.items() if dtype != 'object']

In [4]:
### Encoding categorical variables ###
# Anonymized data => the categorical variables are nominal, with not many dimensions
# => one-hot encoding.
# The encoder learns its categories from the training split only and is then applied
# unchanged to both splits.
encoder = OneHotEncoder(sparse_output=False).fit(X_train[categorical])
encoder_col_names = encoder.get_feature_names_out(categorical)
X_train_cat = encoder.transform(X_train[categorical])
X_test_cat = encoder.transform(X_test[categorical])

In [5]:
### Scaling Data ###
# Regularization-based algorithm => scale the features to help efficient convergence.
# The scaler is fitted on the training split only; the test split reuses that fit.
scaler = MinMaxScaler().fit(X_train[numerical])
X_train_num, X_test_num = (
    scaler.transform(split[numerical]) for split in (X_train, X_test)
)

In [6]:
### Regrouping data ###

# Rebuild each split as a single frame: one-hot columns first, then the scaled
# numerical columns.
# The encoded/scaled blocks are wrapped in brand-new DataFrames, so the original row
# index is passed back in explicitly. Without it the frames get a fresh 0..n-1 index
# and no longer share row labels with y_train / y_test, so any label-based alignment
# (boolean masks built from y, concat with y, error analysis) would silently mismatch.
# NOTE: this rebinds X_train / X_test to the transformed frames, so the feature
# engineering, encoding and scaling cells must not be re-run after this one without
# re-running the train/test split first.
X_train = pd.concat(
    [
        pd.DataFrame(X_train_cat, columns=encoder_col_names, index=X_train.index),
        pd.DataFrame(X_train_num, columns=numerical, index=X_train.index),
    ],
    axis=1,
)
X_test = pd.concat(
    [
        pd.DataFrame(X_test_cat, columns=encoder_col_names, index=X_test.index),
        pd.DataFrame(X_test_num, columns=numerical, index=X_test.index),
    ],
    axis=1,
)

In [7]:
### Model Training ###

# One unfitted LogisticRegression per candidate solver, all sharing random_state=42.
# The per-solver notes are the author's observations from earlier runs.
(
    logreg_liblinear,
    logreg_cholesky,
    logreg_lbfgs,
    logreg_cg,
    logreg_sag,
) = (
    LogisticRegression(solver=solver_name, random_state=42)
    for solver_name in (
        'liblinear',        # small dataset: liblinear might be a good choice ... more than 30 seconds
        'newton-cholesky',  # good results, around 10 secs
        'lbfgs',            # same results, around 14 secs
        'newton-cg',        # kernel always crashes ...
        'sag',              # more than 30 seconds
    )
)


In [9]:
### Baseline classifier to compare against: one that always predicts True => 50% accuracy (the classes are perfectly balanced) ###
# y.value_counts()

fraud_bool
0    988971
1    988971
Name: count, dtype: int64

In [8]:
### Predicting results ###
def model_evaluation(model,X_train,y_train,X_test,y_test):
    """Fit ``model`` on the training split and print its accuracy on both splits.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on ``(X_train, y_train)``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    None
        The two accuracy scores are only printed (test split first, then the
        training split), each formatted with four decimals.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    train_predictions = model.predict(X_train)

    # (message template, true labels, predicted labels) in the order they are reported.
    reports = (
        ('Test-set Model accuracy score: {0:0.4f}', y_test, test_predictions),
        ('Training-set Model accuracy score: {0:0.4f}', y_train, train_predictions),
    )
    for template, truth, predicted in reports:
        print(template.format(accuracy_score(truth, predicted)))

In [9]:
### Newton-Cholesky Solver ###
# Much quicker for the same performance (consistent with the sklearn documentation).
model_evaluation(
    model=logreg_cholesky,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Test-set Model accuracy score: 0.7950
Training-set Model accuracy score: 0.7950


In [12]:
### Cross Validation ###
# The test set has around 400K rows, so let's use folds of approximately the same size (cf. scenario replication)
# logreg_cv_cholesky = LogisticRegressionCV(cv=4,solver='newton-cholesky',random_state=42)
# logreg_cv_cholesky.fit(X_train,y_train) # always crashes ... no improvement possible with the current computing power