In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action = "ignore")
gc = pd.read_csv("C:\\Users\\Vaibhav\\Desktop\\03June\\DataSciencePy\\Datasets\\German_credit_classification.csv")
#Very famous German Credit Dataset 
#Whether a person will default on the credit or not - y variable is Default
#Step1. Data Exploration and Data Visualization
df = pd.DataFrame(gc)

In [2]:
gc.head(3)

Unnamed: 0,Default,checkingstatus1,duration,history,purpose,amount,savings,employ,installment,status,...,residence,property,age,otherplans,housing,cards,job,liable,tele,foreign
0,0,A11,6,A34,A43,1169,A65,A75,4,A93,...,4,A121,67,A143,A152,2,A173,1,A192,A201
1,1,A12,48,A32,A43,5951,A61,A73,2,A92,...,2,A121,22,A143,A152,1,A173,1,A191,A201
2,0,A14,12,A34,A46,2096,A61,A74,2,A93,...,3,A121,49,A143,A152,1,A172,2,A191,A201


---

### Creating dummy variables using pd.get_dummies

In [3]:
table = pd.crosstab(gc.checkingstatus1, gc.Default)
table = np.array(table)
pd.DataFrame(table/np.sum(table, axis = 1).reshape(-1,1), columns= ["Non-Default_Rate", "Default_Rate"])

Unnamed: 0,Non-Default_Rate,Default_Rate
0,0.507299,0.492701
1,0.609665,0.390335
2,0.777778,0.222222
3,0.883249,0.116751


In [4]:
#check for all the categorical variables

In [5]:
cat_vars = ['checkingstatus1', 'history', 'purpose', 'savings', 'employ', 'status', \
            'others', 'property', 'otherplans', 'housing','job','tele','foreign']

In [6]:
#Create dummies for the categorical columns
for cat in cat_vars:
    dummy = pd.get_dummies(gc[cat], prefix = cat+'_',drop_first=True)
    gc = pd.concat([gc,dummy], axis = 1)
print(gc.columns)
print(gc.shape)
gc.to_csv("dummy_German.csv")

Index(['Default', 'checkingstatus1', 'duration', 'history', 'purpose',
       'amount', 'savings', 'employ', 'installment', 'status', 'others',
       'residence', 'property', 'age', 'otherplans', 'housing', 'cards', 'job',
       'liable', 'tele', 'foreign', 'checkingstatus1__A12',
       'checkingstatus1__A13', 'checkingstatus1__A14', 'history__A31',
       'history__A32', 'history__A33', 'history__A34', 'purpose__A41',
       'purpose__A410', 'purpose__A42', 'purpose__A43', 'purpose__A44',
       'purpose__A45', 'purpose__A46', 'purpose__A48', 'purpose__A49',
       'savings__A62', 'savings__A63', 'savings__A64', 'savings__A65',
       'employ__A72', 'employ__A73', 'employ__A74', 'employ__A75',
       'status__A92', 'status__A93', 'status__A94', 'others__A102',
       'others__A103', 'property__A122', 'property__A123', 'property__A124',
       'otherplans__A142', 'otherplans__A143', 'housing__A152',
       'housing__A153', 'job__A172', 'job__A173', 'job__A174', 'tele__A192',
       

In [7]:
gc.drop(cat_vars, axis = 1, inplace=True)
gc.shape

(1000, 49)

---

### SUPPORT VECTOR MACHINE LINEAR - DEMONSTRATING PIPELINE FUNCTION
Note:SVM is a binary classifier

In [8]:
from sklearn.svm import SVC
import pandas as pd
from sklearn.model_selection import train_test_split

In [9]:
from sklearn.model_selection import train_test_split
train , test = train_test_split(gc, test_size = 0.30, random_state = 2) 
X_train = train.drop(['Default'], axis = 1)
y_train = train['Default']

X_test = test.drop(['Default'], axis = 1)
y_test = test['Default']
X_train.head(2)

Unnamed: 0,duration,amount,installment,residence,age,cards,liable,checkingstatus1__A12,checkingstatus1__A13,checkingstatus1__A14,...,property__A124,otherplans__A142,otherplans__A143,housing__A152,housing__A153,job__A172,job__A173,job__A174,tele__A192,foreign__A202
90,12,618,4,4,56,1,1,0,0,1,...,0,0,1,1,0,0,1,0,0,0
305,6,1543,4,2,33,1,1,0,0,1,...,0,0,1,1,0,0,1,0,0,0


In [10]:
X_test.head(2)

Unnamed: 0,duration,amount,installment,residence,age,cards,liable,checkingstatus1__A12,checkingstatus1__A13,checkingstatus1__A14,...,property__A124,otherplans__A142,otherplans__A143,housing__A152,housing__A153,job__A172,job__A173,job__A174,tele__A192,foreign__A202
37,18,2100,4,2,37,1,1,0,1,0,...,0,1,0,1,0,0,1,0,0,0
726,15,1316,2,2,47,2,1,0,0,1,...,0,0,1,1,0,1,0,0,0,0


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [12]:
model = SVC(C = 10,kernel='linear') # model to be fitted on the standardized data

In [13]:
svmclassifier = Pipeline([('scaler', StandardScaler()), ('liSvc', SVC(C = 10,kernel='linear'))])

In [14]:
svmclassifier.fit(train.drop('Default',axis = 1), train.Default)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('liSvc', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [15]:
y_predict = svmclassifier.predict(test.drop(['Default'],axis = 1))

In [16]:
pd.crosstab(test.Default,y_predict)

col_0,0,1
Default,Unnamed: 1_level_1,Unnamed: 2_level_1
0,175,26
1,56,43


In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(test.Default,y_predict)

0.7266666666666667

In [18]:
#svmclassifier.decision_function_shape

In [19]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
fit = sc.fit(X_train)
X_train = fit.transform(X_train) 
X_test = fit.transform(X_test)

In [20]:
y_test = y_test.values.reshape(-1,1)
y_train = y_train.values.reshape(-1,1)

In [21]:
np.round(X_train[0,:], 2)

array([0.14, 0.02, 1.  , 1.  , 0.66, 0.  , 0.  , 0.  , 0.  , 1.  , 0.  ,
       0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 1.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 1.  , 0.  , 0.  ,
       1.  , 0.  , 0.  , 0.  ])

In [22]:
np.round(X_test[0,:],2)

array([0.25, 0.1 , 1.  , 0.33, 0.32, 0.  , 0.  , 0.  , 1.  , 0.  , 0.  ,
       1.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 1.  ,
       0.  , 1.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 1.  , 0.  , 0.  ,
       1.  , 0.  , 0.  , 0.  ])

In [23]:
print(y_train[:4])
print(y_test[:4])

[[0]
 [0]
 [0]
 [1]]
[[1]
 [0]
 [1]
 [1]]


In [24]:
model = SVC(kernel = "linear")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7266666666666667

In [31]:
l = list(np.logspace(-2,3,num = 5))
l

[0.01, 0.1778279410038923, 3.1622776601683795, 56.23413251903491, 1000.0]

In [32]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': l}
grid_svc = GridSearchCV(model, param_grid,scoring='accuracy', cv=5) 

In [33]:
grid_svc.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.1778279410038923, 3.1622776601683795, 56.23413251903491, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [34]:
grid_svc.best_params_

{'C': 56.23413251903491}

In [35]:
y_pred = grid_svc.predict(X_test)

In [36]:
accuracy_score(y_test,y_pred)

0.7266666666666667

---

### SVM RADIAL

In [23]:
svmclassifier = SVC(C = 10,kernel='rbf', random_state=10, gamma=.1)

In [24]:
from sklearn.preprocessing import MinMaxScaler
svmclassifier = Pipeline([('scaler', MinMaxScaler()), ('liSvc', SVC(C = 10,kernel='rbf', random_state=10))])
#when binary variables are also present in data, minmaxscaling gives better results
#other-wise apply standard scaling on continuous data only

In [25]:
train.head(2) #note not standardized data

Unnamed: 0,Default,duration,amount,installment,residence,age,cards,liable,checkingstatus1__A12,checkingstatus1__A13,...,property__A124,otherplans__A142,otherplans__A143,housing__A152,housing__A153,job__A172,job__A173,job__A174,tele__A192,foreign__A202
90,0,12,618,4,4,56,1,1,0,0,...,0,0,1,1,0,0,1,0,0,0
305,0,6,1543,4,2,33,1,1,0,0,...,0,0,1,1,0,0,1,0,0,0


In [26]:
svmclassifier.fit(train.drop('Default',axis = 1), train.Default) 

Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('liSvc', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=10, shrinking=True,
  tol=0.001, verbose=False))])

In [29]:
y_pred = svmclassifier.predict(test.drop(['Default'],axis = 1))

In [30]:
pd.crosstab(test.Default,y_pred)

col_0,0,1
Default,Unnamed: 1_level_1,Unnamed: 2_level_1
0,180,21
1,59,40


In [31]:
accuracy_score(y_test, y_pred)

0.7333333333333333

In [32]:
#implementing grid seach cv
from sklearn.model_selection import GridSearchCV

In [39]:
model = SVC(C = 10,kernel='rbf', random_state=10, gamma=.1)
model.fit(X_train, train.Default.values)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=10, shrinking=True,
  tol=0.001, verbose=False)

In [41]:
accuracy_score(test.Default.values, model.predict(X_test))

0.7066666666666667

In [34]:
model.decision_function_shape

'ovr'

In [35]:
print(np.round(X_test[0,:],1))
print(y_test[0])

array([0.2, 0.1, 1. , 0.3, 0.3, 0. , 0. , 0. , 1. , 0. , 0. , 1. , 0. ,
       0. , 0. , 0. , 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 1. , 0. , 0. , 0. , 1. , 0. , 1. , 0. , 0. , 0. , 0. ,
       1. , 0. , 1. , 0. , 0. , 1. , 0. , 0. , 0. ])

In [44]:
parameters = {'C':(.01,.1,1,10,100), 'gamma': (.0001,.001,1,.01,.1)}

In [56]:
grid_svc = GridSearchCV(estimator = model, param_grid=parameters,scoring='f1', cv=5) 

In [57]:
grid_svc.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=10, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': (0.01, 0.1, 1, 10, 100), 'gamma': (0.0001, 0.001, 1, 0.01, 0.1)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [58]:
grid_svc.best_params_

{'C': 100, 'gamma': 0.01}

In [50]:
ypred = grid_svc.predict(X_test)

In [51]:
accuracy_score(y_test,ypred)

0.74

In [53]:
from sklearn.metrics import f1_score
f1_score(test.Default,ypred)

0.4657534246575343

In [37]:
from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(grid_svc,X_train,y_train, scoring='accuracy', cv = 5)
print(accuracy)
print("Accuracy of SVC is: " , accuracy.mean())

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Vaibhav\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3291, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-37-696c9684f3cf>", line 2, in <module>
    accuracy = cross_val_score(grid_svc,X_train,y_train, scoring='accuracy', cv = 5)
  File "C:\Users\Vaibhav\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 342, in cross_val_score
    pre_dispatch=pre_dispatch)
  File "C:\Users\Vaibhav\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 206, in cross_validate
    for train, test in cv.split(X, y, groups))
  File "C:\Users\Vaibhav\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Users\Vaibhav\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\Vaib

KeyboardInterrupt: 