In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# 0. data
path = "bank-full.csv"
df = pd.read_csv(path)

# Simplified recoding of the categorical variables: most columns are collapsed
# to a 0/1 indicator, "month" becomes 0-11 and "job" gets one code per value.
label_encoding = {
    "y":{"no":0,"yes":1},
    "poutcome":{"unknown":0,"failure":0,"other":0,"success":1},
    "month":{"jan":0,"feb":1,"mar":2,"apr":3,"may":4,"jun":5,"jul":6,"aug":7,"sep":8,"oct":9,"nov":10,"dec":11},
    "contact":{"unknown":0,"cellular":1,"telephone":1},
    "loan":{"no":0,"yes":1},
    "housing":{"no":0,"yes":1},
    "default":{"no":0,"yes":1},
    "education":{"tertiary":1,"secondary":0,"unknown":0,"primary":0},
    "marital":{"married":0,"single":1,"divorced":1},
    "job":{"management":0,"technician":1,"entrepreneur":2,"blue-collar":3,"unknown":4,"retired":5,"admin.":6,"services":7,"self-employed":8,"unemployed":9,"housemaid":10,"student":11}
}

# "job" is not used as a feature, so drop it up front instead of encoding it
# first and throwing the encoded column away.
df = df.drop("job", axis=1)

# Encode column by column with Series.map instead of DataFrame.replace with a
# nested dict: replace relies on implicit object -> int downcasting (deprecated
# in recent pandas) and silently leaves any category missing from the mapping
# as a string, which would only surface later inside the model fit.
for column, mapping in label_encoding.items():
    if column not in df.columns:
        continue  # e.g. "job", dropped above
    encoded = df[column].map(mapping)
    unmapped = df.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column!r} contains values missing from label_encoding: {list(unmapped)}"
        )
    df[column] = encoded

# Rescale the quantitative variables to the [0, 1] range.
num_features=['age', 'balance', 'day', 'duration','campaign', 'pdays', 'previous']

scaler = MinMaxScaler(feature_range=(0, 1))

# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test split below, so the test rows influence the min/max used for the
# training features. Left as is because later cells re-split X directly.
df[num_features] = scaler.fit_transform(df[num_features])

# X, y
X, y = df.drop('y',axis=1).values , df['y'].values

# split into train-val and test (20 % test, stratified on the target)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# Kept as the last expression of the cell so the shapes are actually displayed.
X.shape, y.shape

# 1

In [2]:
def specificity(y_true, y_pred):
    """Return the specificity (true-negative rate) TN / (TN + FP).

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Predicted binary labels.

    Returns
    -------
    float
        TN / (TN + FP), or NaN when there are no true negatives and no false
        positives (the ratio is undefined).
    """
    matrix = confusion_matrix(y_true, y_pred)
    if matrix.shape == (1, 1):
        # Only one class occurs in both arrays, so the default labelling
        # collapses to a 1x1 matrix and the 4-way unpacking below would fail.
        # Fall back to the 0/1 encoding to get a full 2x2 matrix.
        matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, _fn, _tp = matrix.ravel()

    negatives = tn + fp
    if negatives == 0:
        # Undefined ratio: return NaN explicitly rather than via a 0/0 warning.
        return float("nan")
    return tn / negatives

def gm(y_true, y_pred):
    """Return the geometric mean of recall and specificity.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Predicted binary labels.

    Returns
    -------
    float
        sqrt(recall * specificity), with recall = TP / (TP + FN) and
        specificity = TN / (TN + FP). NaN when either ratio is undefined
        because the corresponding denominator is zero.
    """
    matrix = confusion_matrix(y_true, y_pred)
    if matrix.shape == (1, 1):
        # Only one class occurs in both arrays, so the default labelling
        # collapses to a 1x1 matrix and the 4-way unpacking below would fail.
        # Fall back to the 0/1 encoding to get a full 2x2 matrix.
        matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = matrix.ravel()

    positives = tp + fn
    negatives = tn + fp
    if positives == 0 or negatives == 0:
        # One of the two rates is undefined; return NaN explicitly rather
        # than via a 0/0 warning.
        return float("nan")

    recall = tp / positives
    # Named `true_negative_rate` so the local does not shadow the
    # module-level specificity() function.
    true_negative_rate = tn / negatives
    return (recall * true_negative_rate) ** 0.5

In [3]:
# Split the train-val portion again: 20 % becomes the validation set
# (xv, yv), the rest the training set (xt, yt); stratified on the target.
xt, xv, yt, yv = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    random_state=0,
    stratify=y_train_val,
)

## a

In [4]:
# Fit one logistic regression per candidate C on the training split and
# print its specificity on the validation split.
for c in (0.01, 0.1, 1, 10, 100, 1000, 10000):
    tempa = LogisticRegression(C=c, max_iter=10000)
    tempa = tempa.fit(xt, yt)
    yv_pred = tempa.predict(xv)
    print("C=", c, ": ", specificity(yv, yv_pred))

C= 0.01 :  0.9945209768315592
C= 0.1 :  0.9873199749530369
C= 1 :  0.980745147150908
C= 10 :  0.9788666249217283
C= 100 :  0.9787100814026299
C= 1000 :  0.9787100814026299
C= 10000 :  0.9787100814026299


In [5]:
# Best validation specificity was obtained with C = 0.01
lga = LogisticRegression(C=0.01, max_iter=10000)
lga = lga.fit(xt, yt)

## b

In [6]:
# Fit one logistic regression per candidate C on the training split and
# print its geometric mean (recall x specificity) on the validation split.
for c in (0.01, 0.1, 1, 10, 100, 1000, 10000):
    tempb = LogisticRegression(C=c, max_iter=10000)
    tempb = tempb.fit(xt, yt)
    yv_pred = tempb.predict(xv)
    print("C=", c, ": ", gm(yv, yv_pred))

C= 0.01 :  0.3142399778147813
C= 0.1 :  0.44147083282791677
C= 1 :  0.5372699811752707
C= 10 :  0.5547756582653556
C= 100 :  0.5578507090523395
C= 1000 :  0.5568128463691226
C= 10000 :  0.5568128463691226


In [7]:
# Best validation geometric mean was obtained with C = 100
lgb = LogisticRegression(C=100, max_iter=10000)
lgb = lgb.fit(xt, yt)

# 2

## a

In [8]:
from sklearn.metrics import classification_report

In [9]:
# Refit the specificity-selected model on the whole train-val portion.
# The result is assigned so the cell does not echo the estimator.
lga = lga.fit(
    X_train_val,
    y_train_val,
)

In [10]:
# Per-class precision / recall / F1 of `lga` on the held-out test set.
y_test_pred_a = lga.predict(X_test)
report_a = classification_report(y_test, y_test_pred_a)
print(report_a)

              precision    recall  f1-score   support

           0       0.90      0.99      0.94      7985
           1       0.74      0.12      0.21      1058

    accuracy                           0.89      9043
   macro avg       0.82      0.56      0.58      9043
weighted avg       0.88      0.89      0.86      9043



In [11]:
# Refit the G-mean-selected model on the whole train-val portion.
# The result is assigned so the cell does not echo the estimator.
lgb = lgb.fit(
    X_train_val,
    y_train_val,
)

In [12]:
# Per-class precision / recall / F1 of `lgb` on the held-out test set.
y_test_pred_b = lgb.predict(X_test)
report_b = classification_report(y_test, y_test_pred_b)
print(report_b)

              precision    recall  f1-score   support

           0       0.92      0.98      0.95      7985
           1       0.66      0.32      0.43      1058

    accuracy                           0.90      9043
   macro avg       0.79      0.65      0.69      9043
weighted avg       0.89      0.90      0.89      9043



## b

## c

In [13]:
from sklearn.metrics import roc_auc_score

### 1

### 2

In [14]:
# Test-set ROC AUC of `lga`, scored with column 1 of predict_proba
# (presumably the probability of class 1 - confirm against lga.classes_).
test_proba_a = lga.predict_proba(X_test)
roc_auc_score(y_test, test_proba_a[:, 1])

0.7897904033200247

In [15]:
# Test-set ROC AUC of `lgb`, scored with column 1 of predict_proba
# (presumably the probability of class 1 - confirm against lgb.classes_).
test_proba_b = lgb.predict_proba(X_test)
roc_auc_score(y_test, test_proba_b[:, 1])

0.8878532882424868

# 3

## a

In [16]:
# Baseline for task 3: repeat the G-mean search over C on the current labels.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
# Rebuild the train/validation split from the split made on the line above.
# Otherwise the loop silently depends on xt/xv/yt/yv left over from an earlier
# cell. The arguments and seed match that earlier split, so the values agree.
xt, xv, yt, yv = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=0, stratify=y_train_val)
for c in [0.01, 0.1, 1, 10, 100, 1000, 10000]:
    tempb = LogisticRegression(max_iter = 10000, C = c).fit(xt, yt)
    print("C=",c,": ", gm(yv, tempb.predict(xv)))

C= 0.01 :  0.3142399778147813
C= 0.1 :  0.44147083282791677
C= 1 :  0.5372699811752707
C= 10 :  0.5547756582653556
C= 100 :  0.5578507090523395
C= 1000 :  0.5568128463691226
C= 10000 :  0.5568128463691226


In [17]:
# Swap the class labels (0 <-> 1). The flipped vector is derived from the
# encoded target column in `df`, not from `y` itself: `y = (y == 0)` would
# flip the labels back every second time this cell is executed.
y = (df['y'].values == 0).astype(int)

# Redo both splits on the flipped labels, with the same sizes and seed as before.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
xt, xv, yt, yv = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=0, stratify=y_train_val)

# Validation G-mean for every candidate C, now with the flipped labels.
for c in [0.01, 0.1, 1, 10, 100, 1000, 10000]:
    tempb = LogisticRegression(max_iter = 10000, C = c).fit(xt, yt)
    print("C=",c,": ", gm(yv, tempb.predict(xv)))

C= 0.01 :  0.30100391191871667
C= 0.1 :  0.4470806342705404
C= 1 :  0.5328094740679287
C= 10 :  0.5513736303432697
C= 100 :  0.552199755929885
C= 1000 :  0.5521555215599714
C= 10000 :  0.552199755929885


## b

In [18]:
# Train the C = 100 model on the current train-val portion (the flipped-label
# split when the notebook is run top to bottom).
lgb = LogisticRegression(C=100, max_iter=10000)
lgb = lgb.fit(X_train_val, y_train_val)

In [19]:
# Per-class precision / recall / F1 of the refitted `lgb` on the current test set.
y_test_pred = lgb.predict(X_test)
report = classification_report(y_test, y_test_pred)
print(report)

              precision    recall  f1-score   support

           0       0.64      0.33      0.44      1058
           1       0.92      0.98      0.95      7985

    accuracy                           0.90      9043
   macro avg       0.78      0.65      0.69      9043
weighted avg       0.88      0.90      0.89      9043

