In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
X_test = pd.read_csv('X_test.csv')

In [3]:
X_train = pd.read_csv('X_train.csv')

In [4]:
y_test = pd.read_csv('y_test.csv')

In [5]:
y_train = pd.read_csv('y_train.csv')

In [6]:
from sklearn.linear_model import LogisticRegression

### Building the Model - Base

In [7]:
base_model = LogisticRegression()

In [8]:
base_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Assessing Random Forest Performance - Base Model

In [9]:
y_pred = base_model.predict(X_test)


In [10]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score

accuracy_score(y_test, y_pred)

0.8016928657799275

In [11]:
precision_score(y_test,y_pred)

0.7644787644787645

In [12]:
recall_score(y_test,y_pred)

0.6578073089700996

In [13]:
confusion_matrix(y_test,y_pred)

array([[465,  61],
       [103, 198]], dtype=int64)

### Feature Selection

In [14]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [15]:
def embedded_log_reg_selector(X, y):
    # Your code goes here (Multiple lines)
    logreg = LogisticRegression(penalty='l1', solver='liblinear')
    embedded_lr_selector = SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear', max_iter=50000), max_features=5)
    embedded_lr_selector = embedded_lr_selector.fit(X, y)
    embedded_lr_support = embedded_lr_selector.get_support()
    embedded_lr_feature = X.loc[:, embedded_lr_support].columns.tolist()
    # Your code ends here
    return embedded_lr_support, embedded_lr_feature

In [16]:
embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X_train, y_train)
print(embedded_lr_feature)

['housing', 'loan', 'contact', 'duration', 'poutcome']


  y = column_or_1d(y, warn=True)


### Testing Accuracy with Feature Selection

In [17]:
feature_model = LogisticRegression()

In [18]:
selected_features = ['housing', 'loan', 'contact', 'duration', 'poutcome']
feature_model.fit(X_train[selected_features], y_train)

  y = column_or_1d(y, warn=True)


In [19]:
y_pred_feature = feature_model.predict(X_test[selected_features])

## Assessing Logistic Regression Performance - Model with Feature Selection

In [20]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score

accuracy_score(y_test, y_pred_feature)

0.777509068923821

In [21]:
precision_score(y_test, y_pred_feature)

0.7468354430379747

In [22]:
recall_score(y_test, y_pred_feature)

0.5880398671096345

In [23]:
confusion_matrix(y_test, y_pred_feature)

array([[466,  60],
       [124, 177]], dtype=int64)

### Parameter Tuning

In [24]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [25]:
# define models and parameters
parameter_model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

In [26]:
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=parameter_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [27]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.800205 using {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.800104 (0.020775) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.800004 (0.020257) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.800205 (0.020653) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.800103 (0.020697) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.798593 (0.021064) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.799802 (0.020466) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.796574 (0.019975) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.797381 (0.020660) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.796877 (0.020063) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.783058 (0.021972) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.782251 (0.022853) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.780229 (0.020928) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.765503 (0.019330) wit

In [32]:
logistic_best_parameters = LogisticRegression(C=100, penalty='l2', solver= 'liblinear')


In [33]:
logistic_best_parameters.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [34]:
y_pred_parameters = logistic_best_parameters.predict(X_test)

### Testing Accuracy of Tuned Parameters

In [35]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score

accuracy_score(y_test, y_pred_parameters)

0.8101571946795647

In [36]:
precision_score(y_test, y_pred_parameters)

0.7706766917293233

In [37]:
recall_score(y_test, y_pred_parameters)

0.6810631229235881

In [38]:
confusion_matrix(y_test,y_pred)

array([[465,  61],
       [103, 198]], dtype=int64)