In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the credit data without a header row, naming the 16 columns explicitly;
# "?" entries are parsed as missing values.
data = pd.read_csv(
    'crx.data',
    sep=",",
    header=None,
    names=[
        '0', 'First', 'Second', '3', '4', '5', '6', 'Seventh',
        '8', '9', 'Tenth', '11', '12', 'Thirteen', 'Fourteen', 'Approval',
    ],
    na_values=["?"],
)

In [3]:
data = data.dropna(axis='rows')

In [4]:
data = data.reset_index()

In [5]:
# Drop the 'index' column that reset_index() created from the old row labels.
# errors='ignore' keeps this cell re-runnable after the column is already gone.
data = data.drop(columns=['index'], errors='ignore')

In [6]:
data.head()

Unnamed: 0,0,First,Second,3,4,5,6,Seventh,8,9,Tenth,11,12,Thirteen,Fourteen,Approval
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [7]:
#-----------------------------

In [8]:
#Encoding labels

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653 entries, 0 to 652
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   0         653 non-null    object 
 1   First     653 non-null    float64
 2   Second    653 non-null    float64
 3   3         653 non-null    object 
 4   4         653 non-null    object 
 5   5         653 non-null    object 
 6   6         653 non-null    object 
 7   Seventh   653 non-null    float64
 8   8         653 non-null    object 
 9   9         653 non-null    object 
 10  Tenth     653 non-null    int64  
 11  11        653 non-null    object 
 12  12        653 non-null    object 
 13  Thirteen  653 non-null    float64
 14  Fourteen  653 non-null    int64  
 15  Approval  653 non-null    object 
dtypes: float64(4), int64(2), object(10)
memory usage: 81.8+ KB


In [10]:
encoded_data = data.copy()

In [11]:
cols = ['0', '3', '4', '5', '6', '8', '9', '11', '12']
def encoding(df, columns=None):
    """Label-encode selected columns of ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode. It is modified in place.
    columns : iterable of str, optional
        Names of the columns to encode. Defaults to the module-level
        ``cols`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with every selected column replaced by the
        output of ``le.fit_transform`` for that column.
    """
    if columns is None:
        columns = cols
    for col in columns:
        # The shared encoder ``le`` is refitted for each column, so after the
        # loop it only remembers the mapping of the last column processed.
        df[col] = le.fit_transform(df[col])
    return df

In [12]:
encoded_data = encoding(encoded_data)

In [13]:
# Turn the class labels into integers ('+' -> 1, '-' -> 0).
# Series.map is used instead of Series.replace because replace's implicit
# object -> int downcast is deprecated in recent pandas releases.
# Note: with map, any label other than '+' or '-' would become NaN.
encoded_data['Approval'] = encoded_data['Approval'].map({'+': 1, '-': 0})

In [14]:
encoded_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653 entries, 0 to 652
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   0         653 non-null    int32  
 1   First     653 non-null    float64
 2   Second    653 non-null    float64
 3   3         653 non-null    int32  
 4   4         653 non-null    int32  
 5   5         653 non-null    int32  
 6   6         653 non-null    int32  
 7   Seventh   653 non-null    float64
 8   8         653 non-null    int32  
 9   9         653 non-null    int32  
 10  Tenth     653 non-null    int64  
 11  11        653 non-null    int32  
 12  12        653 non-null    int32  
 13  Thirteen  653 non-null    float64
 14  Fourteen  653 non-null    int64  
 15  Approval  653 non-null    int64  
dtypes: float64(4), int32(9), int64(3)
memory usage: 58.8 KB


In [15]:
data.head()

Unnamed: 0,0,First,Second,3,4,5,6,Seventh,8,9,Tenth,11,12,Thirteen,Fourteen,Approval
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [16]:
encoded_data.head()

Unnamed: 0,0,First,Second,3,4,5,6,Seventh,8,9,Tenth,11,12,Thirteen,Fourteen,Approval
0,1,30.83,0.0,1,0,12,7,1.25,1,1,1,0,0,202.0,0,1
1,0,58.67,4.46,1,0,10,3,3.04,1,1,6,0,0,43.0,560,1
2,0,24.5,0.5,1,0,10,3,1.5,1,0,0,0,0,280.0,824,1
3,1,27.83,1.54,1,0,12,7,3.75,1,1,5,1,0,100.0,3,1
4,1,20.17,5.625,1,0,12,7,1.71,1,0,0,0,2,120.0,0,1


In [17]:
#-----------------------------

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [19]:
# Separate the target column from the feature columns.
y = encoded_data['Approval']
X = encoded_data.drop(columns=['Approval'])
# Standardise the feature columns.
# NOTE(review): the scaling statistics are computed on all rows, before the
# train/test split, so test rows influence them — confirm this is intended.
X_scaled = preprocessing.scale(X)

In [20]:
# Hold out 20% of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=0,
)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Logistic regression on the first two principal components.
pipeline_lr = Pipeline(steps=[
    ('pca1', PCA(n_components=2)),
    ('lr_classifier', LogisticRegression(random_state=0)),
])

In [23]:
# Random forest on the first two principal components.
# random_state pins the forest's internal randomness so the reported
# accuracy is reproducible across runs.
pipeline_randomforest = Pipeline([
    ('pca3', PCA(n_components=2)),
    ('rf_classifier', RandomForestClassifier(random_state=0)),
])

In [24]:
# Decision tree on the first two principal components.
# random_state pins the tree's internal randomness so the reported
# accuracy is reproducible across runs.
pipeline_dt = Pipeline([
    ('pca2', PCA(n_components=2)),
    ('dt_classifier', DecisionTreeClassifier(random_state=0)),
])

In [25]:
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

In [26]:
best_accuracy = 0.0
best_classifier = 0
best_pipeline = ""

In [27]:
# Display name for each position in `pipelines`
pipe_dict = dict(enumerate(['Logistic Regression', 'Decision Tree', 'RandomForest']))

# Train every pipeline on the training split
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [28]:
# Report the held-out accuracy of each fitted pipeline
for idx, fitted in enumerate(pipelines):
    test_accuracy = fitted.score(X_test, y_test)
    print("{} Test Accuracy: {}".format(pipe_dict[idx], test_accuracy))

Logistic Regression Test Accuracy: 0.8244274809160306
Decision Tree Test Accuracy: 0.7175572519083969
RandomForest Test Accuracy: 0.7709923664122137


In [29]:
# Pick the pipeline with the highest test accuracy.
# The trackers are reset here so that re-running this cell (e.g. after
# refitting the pipelines) cannot keep a stale winner from an earlier run.
best_accuracy = 0.0
best_classifier = 0
best_pipeline = ""
for i, model in enumerate(pipelines):
    # Score each pipeline once and reuse the value for the comparison.
    score = model.score(X_test, y_test)
    if score > best_accuracy:
        best_accuracy = score
        best_pipeline = model
        best_classifier = i
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy:Logistic Regression


Logistic Regression

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline

In [31]:
LogisticRegression().get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [32]:
# Search space for logistic regression. Only the L2 penalty is listed because
# 'newton-cg' and 'sag' do not support L1 ('saga' and 'liblinear' do).
grid = dict(
    penalty=['l2'],
    C=np.logspace(0, 4, 10),
    solver=['newton-cg', 'saga', 'sag', 'liblinear'],
)

In [33]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
lg = LogisticRegression()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
lg_random = RandomizedSearchCV(estimator = lg, param_distributions = grid, n_iter = 50, cv = 3, verbose=2, random_state=0, n_jobs = 1)
# Fit the random search model
lg_random.fit(X_train, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV] solver=newton-cg, penalty=l2, C=1.0 .............................
[CV] .............. solver=newton-cg, penalty=l2, C=1.0, total=   0.0s
[CV] solver=newton-cg, penalty=l2, C=1.0 .............................
[CV] .............. solver=newton-cg, penalty=l2, C=1.0, total=   0.0s
[CV] solver=newton-cg, penalty=l2, C=1.0 .............................
[CV] .............. solver=newton-cg, penalty=l2, C=1.0, total=   0.0s
[CV] solver=saga, penalty=l2, C=1.0 ..................................
[CV] ................... solver=saga, penalty=l2, C=1.0, total=   0.0s
[CV] solver=saga, penalty=l2, C=1.0 ..................................
[CV] ................... solver=saga, penalty=l2, C=1.0, total=   0.0s
[CV] solver=saga, penalty=l2, C=1.0 ..................................
[CV] ................... solver=saga, penalty=l2, C=1.0, total=   0.0s
[CV] solver=sag, penalty=l2, C=1.0 ...................................
[CV] ..........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] .... solver=saga, penalty=l2, C=2.7825594022071245, total=   0.0s
[CV] solver=sag, penalty=l2, C=2.7825594022071245 ....................
[CV] ..... solver=sag, penalty=l2, C=2.7825594022071245, total=   0.0s
[CV] solver=sag, penalty=l2, C=2.7825594022071245 ....................
[CV] ..... solver=sag, penalty=l2, C=2.7825594022071245, total=   0.0s
[CV] solver=sag, penalty=l2, C=2.7825594022071245 ....................
[CV] ..... solver=sag, penalty=l2, C=2.7825594022071245, total=   0.0s
[CV] solver=liblinear, penalty=l2, C=2.7825594022071245 ..............
[CV]  solver=liblinear, penalty=l2, C=2.7825594022071245, total=   0.0s
[CV] solver=liblinear, penalty=l2, C=2.7825594022071245 ..............
[CV]  solver=liblinear, penalty=l2, C=2.7825594022071245, total=   0.0s
[CV] solver=liblinear, penalty=l2, C=2.7825594022071245 ..............
[CV]  solver=liblinear, penalty=l2, C=2.7825594022071245, total=   0.0s
[CV] solver=newton-cg, penalty=l2, C=7.742636826811269 ...............
[CV



[CV]  solver=liblinear, penalty=l2, C=21.544346900318832, total=   0.0s
[CV] solver=newton-cg, penalty=l2, C=59.94842503189409 ...............
[CV]  solver=newton-cg, penalty=l2, C=59.94842503189409, total=   0.0s
[CV] solver=newton-cg, penalty=l2, C=59.94842503189409 ...............
[CV]  solver=newton-cg, penalty=l2, C=59.94842503189409, total=   0.0s
[CV] solver=newton-cg, penalty=l2, C=59.94842503189409 ...............
[CV]  solver=newton-cg, penalty=l2, C=59.94842503189409, total=   0.0s
[CV] solver=saga, penalty=l2, C=59.94842503189409 ....................
[CV] ..... solver=saga, penalty=l2, C=59.94842503189409, total=   0.0s
[CV] solver=saga, penalty=l2, C=59.94842503189409 ....................
[CV] ..... solver=saga, penalty=l2, C=59.94842503189409, total=   0.0s
[CV] solver=saga, penalty=l2, C=59.94842503189409 ....................
[CV] ..... solver=saga, penalty=l2, C=59.94842503189409, total=   0.0s
[CV] solver=sag, penalty=l2, C=59.94842503189409 .....................
[CV] 



[CV]  solver=newton-cg, penalty=l2, C=464.15888336127773, total=   0.0s
[CV] solver=newton-cg, penalty=l2, C=464.15888336127773 ..............
[CV]  solver=newton-cg, penalty=l2, C=464.15888336127773, total=   0.0s
[CV] solver=saga, penalty=l2, C=464.15888336127773 ...................
[CV] .... solver=saga, penalty=l2, C=464.15888336127773, total=   0.0s
[CV] solver=saga, penalty=l2, C=464.15888336127773 ...................
[CV] .... solver=saga, penalty=l2, C=464.15888336127773, total=   0.0s
[CV] solver=saga, penalty=l2, C=464.15888336127773 ...................
[CV] .... solver=saga, penalty=l2, C=464.15888336127773, total=   0.0s
[CV] solver=sag, penalty=l2, C=464.15888336127773 ....................
[CV] ..... solver=sag, penalty=l2, C=464.15888336127773, total=   0.0s
[CV] solver=sag, penalty=l2, C=464.15888336127773 ....................
[CV] ..... solver=sag, penalty=l2, C=464.15888336127773, total=   0.0s
[CV] solver=sag, penalty=l2, C=464.15888336127773 ....................
[CV]



[CV]  solver=newton-cg, penalty=l2, C=3593.813663804626, total=   0.0s
[CV] solver=newton-cg, penalty=l2, C=3593.813663804626 ...............
[CV]  solver=newton-cg, penalty=l2, C=3593.813663804626, total=   0.0s
[CV] solver=saga, penalty=l2, C=3593.813663804626 ....................
[CV] ..... solver=saga, penalty=l2, C=3593.813663804626, total=   0.0s
[CV] solver=saga, penalty=l2, C=3593.813663804626 ....................
[CV] ..... solver=saga, penalty=l2, C=3593.813663804626, total=   0.0s
[CV] solver=saga, penalty=l2, C=3593.813663804626 ....................
[CV] ..... solver=saga, penalty=l2, C=3593.813663804626, total=   0.0s
[CV] solver=sag, penalty=l2, C=3593.813663804626 .....................
[CV] ...... solver=sag, penalty=l2, C=3593.813663804626, total=   0.0s
[CV] solver=sag, penalty=l2, C=3593.813663804626 .....................
[CV] ...... solver=sag, penalty=l2, C=3593.813663804626, total=   0.0s
[CV] solver=sag, penalty=l2, C=3593.813663804626 .....................
[CV] .

[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    0.8s finished


RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=50, n_jobs=1,
                   param_distributions={'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                                        'penalty': ['l2'],
                                        'solver': ['newton-cg', 'saga', 'sag',
                                                   'liblinear']},
                   random_state=0, verbose=2)

In [34]:
print(lg_random.best_params_)
print("The mean accuracy of the model is:", lg_random.score(X_test, y_test))

{'solver': 'newton-cg', 'penalty': 'l2', 'C': 1.0}
The mean accuracy of the model is: 0.8931297709923665


In [35]:
# Baseline: logistic regression with default hyperparameters.
base_model = LogisticRegression(random_state=0).fit(X_train, y_train)
base_result = base_model.predict(X_test)

In [36]:
# Refit logistic regression with the hyperparameters selected by the
# randomized search, taken from the search object rather than copied by hand
# so they cannot drift from what the search actually found.
best_random = LogisticRegression(random_state=0, **lg_random.best_params_)
best_random.fit(X_train, y_train)
best_result = best_random.predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

In [38]:
# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the documented order avoids mistakes with asymmetric metrics.
accuracy_score(y_test, best_result)

0.8931297709923665

So I picked LogisticRegression with the following parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 1.0}
The accuracy of the model on the test set is 89%.

In [39]:
#----------------------

In [40]:
from sklearn.svm import SVC

In [41]:
# RBF-kernel support vector classifier fitted on the training split.
svc = SVC(kernel='rbf', gamma='auto')
svc.fit(X_train, y_train)
svc_result = svc.predict(X_test)
# accuracy_score takes (y_true, y_pred).
accuracy_score(y_test, svc_result)

0.8702290076335878

In [42]:
#---------------------

In [43]:
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection

In [44]:
# Shuffled 10-fold splitter with a fixed seed
seed = 7
kfold = model_selection.KFold(
    n_splits=10,
    shuffle=True,
    random_state=seed,
)

In [45]:
# create the sub models
estimators = []
model1 = LogisticRegression(random_state = 0, solver = 'newton-cg', penalty = 'l2', C = 1)
estimators.append(('logistic', model1))
model2 = SVC(gamma = 'auto', kernel = 'rbf')
estimators.append(('svm', model2))

In [46]:
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X_scaled, y, cv=kfold)
print(results.mean())

0.8637296037296037


In [47]:
#--------------------

In [48]:
ensemble.fit(X_train, y_train)

VotingClassifier(estimators=[('logistic',
                              LogisticRegression(C=1, random_state=0,
                                                 solver='newton-cg')),
                             ('svm', SVC(gamma='auto'))])

In [49]:
ens_result = ensemble.predict(X_test)

In [50]:
# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the documented order avoids mistakes with asymmetric metrics.
accuracy_score(y_test, ens_result)

0.8702290076335878

In [51]:
#-----------------------

In [92]:
def credit_approval(row):
    """Predict and print the approval label for one or more applicant rows.

    The new rows are appended to the training features (``data`` without
    ``Approval``) so that categorical columns are label-encoded and all
    columns are standardised together. A logistic-regression + SVM voting
    ensemble is then fitted on the module-level ``X_scaled`` / ``y`` and used
    to classify the new rows.

    Parameters
    ----------
    row : pandas.DataFrame
        Applicant record(s) with the same feature columns as ``data``. An
        existing ``Approval`` column is ignored. The frame is not modified.

    Returns
    -------
    None
        A copy of ``row`` with an ``Approval`` column ('+' or '-') is printed.

    Raises
    ------
    ValueError
        If ``row`` contains no records.
    """
    n_new = len(row)
    if n_new == 0:
        raise ValueError("row must contain at least one applicant record")

    combined = pd.concat(
        [
            data.drop(columns=['Approval']),
            # Tolerate a leftover label column on the input.
            row.drop(columns=['Approval'], errors='ignore'),
        ],
        ignore_index=True,
    )

    # Plain `object` replaces the `np.object` alias, which NumPy has removed.
    categorical = combined.columns[combined.dtypes == object]
    encoder = preprocessing.LabelEncoder()
    for col in categorical:
        combined[col] = encoder.fit_transform(combined[col])

    # NOTE(review): encoding and scaling are recomputed here on training rows
    # plus the new rows, while the ensemble below is fitted on X_scaled;
    # presumably the two agree closely when the new rows contain only known
    # categories — confirm.
    scaled = preprocessing.scale(combined)
    scaled_encoded_rows = scaled[-n_new:]

    estimators = [
        ('logistic', LogisticRegression(random_state=0, solver='newton-cg', penalty='l2', C=1)),
        ('svm', SVC(gamma='auto', kernel='rbf')),
    ]
    ensemble = VotingClassifier(estimators)
    ensemble.fit(X_scaled, y)
    result = ensemble.predict(scaled_encoded_rows)

    # Label a copy so the caller's frame stays free of an 'Approval' column;
    # mutating it would make a second call on the same frame fail.
    labelled = row.copy()
    labelled['Approval'] = ['+' if pred == 1 else '-' for pred in result]
    print(labelled)

In [96]:
# Read the unlabeled applicant row(s) with the same feature column names
# (no 'Approval' column); "?" entries are parsed as missing values.
data_1 = pd.read_csv(
    'test_row.txt',
    sep=",",
    header=None,
    names=[
        '0', 'First', 'Second', '3', '4', '5', '6', 'Seventh',
        '8', '9', 'Tenth', '11', '12', 'Thirteen', 'Fourteen',
    ],
    na_values=["?"],
)

In [97]:
data_1

Unnamed: 0,0,First,Second,3,4,5,6,Seventh,8,9,Tenth,11,12,Thirteen,Fourteen
0,b,34.0,5.5,y,p,c,v,1.5,f,f,0,t,g,60,0


In [98]:
credit_approval(data_1)

   0  First  Second  3  4  5  6  Seventh  8  9  Tenth 11 12  Thirteen  \
0  b   34.0     5.5  y  p  c  v      1.5  f  f      0  t  g        60   

   Fourteen Approval  
0         0        -  
