# Library imports

In [1]:
import pickle
import os
from functions.cleaning import cleaning_lr

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model  import LogisticRegression

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Import of clean data

In [3]:
# Build the working DataFrame with the project's cleaning helper
# (imported from functions.cleaning).
# NOTE(review): presumably returns an already-cleaned frame that contains the
# EVENT_LABEL target column used in the partitioning step - confirm in
# functions/cleaning.py.
df = cleaning_lr()

# Partitioning

In [4]:
# Separate the predictors from the target column.
X = df.drop(columns=['EVENT_LABEL'])
y = df['EVENT_LABEL']

# Hold out 30% of the rows for evaluation with a fixed seed.
# The split is stratified on the target so that the train and test partitions
# keep the same class proportions; without it, a skewed target (the reason this
# notebook re-samples the training data further down) can end up with a
# different positive rate in each partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=42, stratify=y
)

In [5]:
# Persist the training partition as CSV files under ./data so that other
# notebooks can reuse exactly the same split.
# Paths are built with os.path.join instead of string concatenation, and the
# target directory is created when missing so a fresh checkout does not fail
# on the first write.
path_X_train = os.path.join(os.getcwd(), 'data', 'X_train.csv')
os.makedirs(os.path.dirname(path_X_train), exist_ok=True)
X_train.to_csv(path_X_train, index=False)

path_y_train = os.path.join(os.getcwd(), 'data', 'y_train.csv')
os.makedirs(os.path.dirname(path_y_train), exist_ok=True)
y_train.to_csv(path_y_train, index=False)

In [6]:
# Persist the held-out test partition as CSV files under ./data.
# Paths are built with os.path.join instead of string concatenation, and the
# target directory is created when missing so a fresh checkout does not fail
# on the first write.
path_X_test = os.path.join(os.getcwd(), 'data', 'X_test.csv')
os.makedirs(os.path.dirname(path_X_test), exist_ok=True)
X_test.to_csv(path_X_test, index=False)

path_y_test = os.path.join(os.getcwd(), 'data', 'y_test.csv')
os.makedirs(os.path.dirname(path_y_test), exist_ok=True)
y_test.to_csv(path_y_test, index=False)

In [7]:
# Preview the first rows of the cleaned frame (predictors plus EVENT_LABEL)
# as a quick sanity check of column names and value types.
df.head()

Unnamed: 0,transaction_amt,transaction_adj_amt,historic_velocity,currency,cvv,signature_image,transaction_type,transaction_env,tranaction_initiate,inital_amount,EVENT_LABEL,day,month,browser,os,acc_age,d_last_logon
0,2167.0,56.0,2572.0,cad,D,F,U,X,O,13646.0,0,Tue,Dec,Opera,Windows,4,3
1,2045.0,48.0,4517.0,cad,X,X,H,W,J,11930.0,0,Tue,Jun,Opera,Linux,3,1
2,2892.0,61.0,5007.0,cad,X,Q,X,X,T,7412.0,0,Mon,May,Mozilla,Linux,7,5
4,2976.0,66.0,2600.0,cad,X,F,F,G,K,1905.0,0,Sat,Mar,Mozilla,Linux,4,2
5,1854.0,50.0,4539.0,eur,X,C,S,D,I,1714.0,0,Thu,Jan,Mozilla,iPhone,3,3


# Logistic Regression

## Under sampling

### Preprocess

In [8]:
# Column groups are derived from the training frame's dtypes: numeric columns
# are scaled, object (string) columns are one-hot encoded.
num_labels_u_lr = X_train.select_dtypes('number').columns
cat_labels_u_lr = X_train.select_dtypes('object').columns

# Random under-sampling of the training data. The seed is fixed so that the
# same rows are kept on every run; without it the fitted model changes between
# executions even though the train/test split itself is seeded.
under_sampler_u_lr = RandomUnderSampler(random_state=42)
num_preprocess_u_lr = StandardScaler()
# drop='first' removes one dummy column per feature; handle_unknown='ignore'
# lets categories unseen during fit pass through transform instead of raising.
cat_preprocess_u_lr = OneHotEncoder(drop='first', handle_unknown='ignore')
preprocessor_u_lr = ColumnTransformer([('cat', cat_preprocess_u_lr, cat_labels_u_lr),
                                  ('num', num_preprocess_u_lr, num_labels_u_lr)])

### Model

In [9]:
# Classifier for the under-sampled pipeline.
# The 'saga' solver shuffles the data while optimising, so a fixed random_state
# is needed for the fitted coefficients to be reproducible across runs.
# max_iter is raised far above the library default to give the solver room to
# converge.
LogReg_u_lr = LogisticRegression(solver='saga', max_iter=100000, random_state=42)

### Pipeline

In [10]:
# Assemble the under-sampling workflow: re-sample the raw training rows first,
# then encode/scale the columns, then fit the classifier. The imblearn Pipeline
# applies the sampler only while fitting.
u_lr = Pipeline(
    steps=[
        ('under_sampler', under_sampler_u_lr),
        ('pre', preprocessor_u_lr),
        ('model', LogReg_u_lr),
    ]
)

# Fit on the training partition; the fitted pipeline is the cell's output.
u_lr.fit(X_train, y_train)

## Over sampling

### Preprocess

In [11]:
# Column groups are derived from the training frame's dtypes: numeric columns
# are scaled, object (string) columns are one-hot encoded.
num_labels_o_lr = X_train.select_dtypes('number').columns
cat_labels_o_lr = X_train.select_dtypes('object').columns

# Random over-sampling of the minority class only. The seed is fixed so that
# the duplicated rows are the same on every run.
over_sampler_o_lr = RandomOverSampler(sampling_strategy='minority', random_state=42)
num_preprocess_o_lr = StandardScaler()
# handle_unknown='ignore' matches the encoder used by the under-sampling
# pipeline: without it, a category that appears only at prediction time makes
# transform raise instead of producing an encoded row.
cat_preprocess_o_lr = OneHotEncoder(drop='first', handle_unknown='ignore')
preprocessor_o_lr = ColumnTransformer([('cat', cat_preprocess_o_lr, cat_labels_o_lr),
                                  ('num', num_preprocess_o_lr, num_labels_o_lr)])

### Model

In [12]:
# Classifier for the over-sampled pipeline: L-BFGS solver with a very high
# iteration cap so the optimiser is not cut off before converging.
LogReg_o_lr = LogisticRegression(
    max_iter=100000,
    solver='lbfgs',
)

### Pipeline

In [13]:
# Assemble the over-sampling workflow: re-sample the raw training rows first,
# then encode/scale the columns, then fit the classifier. The imblearn Pipeline
# applies the sampler only while fitting.
o_lr = Pipeline(
    steps=[
        ('over_sampler', over_sampler_o_lr),
        ('pre', preprocessor_o_lr),
        ('model', LogReg_o_lr),
    ]
)

# Fit on the training partition; the fitted pipeline is the cell's output.
o_lr.fit(X_train, y_train)

# Export

In [14]:
# Serialise the fitted under-sampling pipeline to disk.
# The output directory is created when missing so the export does not fail on
# a fresh checkout. The `with` block closes the file on exit, so no explicit
# close() call is needed afterwards.
os.makedirs('models', exist_ok=True)
with open('models/under_lr.pkl', 'wb') as f:
    pickle.dump(u_lr, f)

In [15]:
# Serialise the fitted over-sampling pipeline to disk.
# The output directory is created when missing so the export does not fail on
# a fresh checkout. The `with` block closes the file on exit, so no explicit
# close() call is needed afterwards.
os.makedirs('models', exist_ok=True)
with open('models/over_lr.pkl', 'wb') as f:
    pickle.dump(o_lr, f)