classification


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

In [2]:
df=pd.read_csv('credit_customers.csv')
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad


In [None]:
missing_columns_values = df.isnull().sum()
missing_columns_per = (df.isnull().sum()/ len(df)) * 100
total_missing_values = pd.concat([missing_columns_values, missing_columns_per], axis = 1, keys = ['Missing Values', 'Percentage'])
total_missing_values = total_missing_values.sort_values('Percentage', ascending=False)
total_missing_values.head(20)

Unnamed: 0,Missing Values,Percentage
checking_status,0,0.0
property_magnitude,0,0.0
foreign_worker,0,0.0
own_telephone,0,0.0
num_dependents,0,0.0
job,0,0.0
existing_credits,0,0.0
housing,0,0.0
other_payment_plans,0,0.0
age,0,0.0


In [4]:
df.columns.unique()

Index(['checking_status', 'duration', 'credit_history', 'purpose',
       'credit_amount', 'savings_status', 'employment',
       'installment_commitment', 'personal_status', 'other_parties',
       'residence_since', 'property_magnitude', 'age', 'other_payment_plans',
       'housing', 'existing_credits', 'job', 'num_dependents', 'own_telephone',
       'foreign_worker', 'class'],
      dtype='object')

In [5]:
#classification task has a problem caled "class balancing"

check the shape of data


In [6]:
df.shape

(1000, 21)

check the value count of the class

In [7]:
df['class'].value_counts()

class
good    700
bad     300
Name: count, dtype: int64

In [8]:
#to balance the data class
from sklearn.utils import resample
df_good=df[df['class']=='good']
df_bad=df[df['class']=='bad']
df_sam=resample(df_bad,n_samples=700)
df=pd.concat([df_good,df_sam],ignore_index=True)
df=df.sample(frac=1)
df['class'].value_counts()


class
good    700
bad     700
Name: count, dtype: int64

In [9]:
encoder = LabelEncoder()
categorical_data = df.select_dtypes(include=['object','category'])
for x in categorical_data:
    df[x] = encoder.fit_transform(df[x])
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
369,3,36.0,3,9,4686.0,2,0,2.0,3,2,...,2,32.0,1,0,1.0,0,1.0,1,1,1
294,3,14.0,2,4,802.0,2,0,4.0,3,2,...,0,27.0,1,1,2.0,3,1.0,0,1,1
730,3,18.0,3,4,6761.0,4,0,2.0,3,2,...,0,68.0,1,2,2.0,1,1.0,0,1,0
1057,1,24.0,3,4,1442.0,2,1,4.0,0,2,...,0,23.0,1,2,2.0,1,1.0,0,1,0
252,0,24.0,3,6,3758.0,1,4,1.0,0,2,...,2,23.0,1,2,1.0,2,1.0,0,1,1


In [10]:
x=df.drop('class',axis=1)
y=df['class']
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=1)

machine learning models

In [11]:
model1=LogisticRegression(solver='newton-cholesky')
model1.fit(xtrain,ytrain)

prediction

In [12]:
pred1=model1.predict(xtest)
print('\nClassification Report',classification_report(ytest,pred1))
print('\nAccuracy Report\n',accuracy_score(ytest,pred1))
print('\nPrecision Report\n',precision_score(ytest,pred1))
print('\nRecall Report\n',recall_score(ytest,pred1))
print('\nF1 score Report\n',f1_score(ytest,pred1))
print('\nConfusion metrix Report\n',confusion_matrix(ytest,pred1))



Classification Report               precision    recall  f1-score   support

           0       0.70      0.67      0.69       142
           1       0.68      0.71      0.69       138

    accuracy                           0.69       280
   macro avg       0.69      0.69      0.69       280
weighted avg       0.69      0.69      0.69       280


Accuracy Report
 0.6892857142857143

Precision Report
 0.6758620689655173

Recall Report
 0.7101449275362319

F1 score Report
 0.6925795053003534

Confusion metrix Report
 [[95 47]
 [40 98]]


In [13]:
model2=DecisionTreeClassifier()
model2.fit(xtrain,ytrain)

In [14]:
pred2=model2.predict(xtest)
print('\nClassification Report',classification_report(ytest,pred2))
print('\nAccuracy Report\n',accuracy_score(ytest,pred2))
print('\nPrecision Report\n',precision_score(ytest,pred2))
print('\nRecall Report\n',recall_score(ytest,pred2))
print('\nF1 score Report\n',f1_score(ytest,pred2))
print('\nConfusion metrix Report\n',confusion_matrix(ytest,pred2))


Classification Report               precision    recall  f1-score   support

           0       0.77      0.94      0.85       142
           1       0.92      0.71      0.80       138

    accuracy                           0.83       280
   macro avg       0.85      0.83      0.83       280
weighted avg       0.85      0.83      0.83       280


Accuracy Report
 0.8285714285714286

Precision Report
 0.9245283018867925

Recall Report
 0.7101449275362319

F1 score Report
 0.8032786885245902

Confusion metrix Report
 [[134   8]
 [ 40  98]]


In [15]:
model3=RandomForestClassifier()
model3.fit(xtrain,ytrain)

In [16]:
pred3=model3.predict(xtest)
print('\nClassification Report',classification_report(ytest,pred3))
print('\nAccuracy Report\n',accuracy_score(ytest,pred3))
print('\nPrecision Report\n',precision_score(ytest,pred3))
print('\nRecall Report\n',recall_score(ytest,pred3))
print('\nF1 score Report\n',f1_score(ytest,pred3))
print('\nConfusion metrix Report\n',confusion_matrix(ytest,pred3))


Classification Report               precision    recall  f1-score   support

           0       0.91      0.94      0.92       142
           1       0.93      0.91      0.92       138

    accuracy                           0.92       280
   macro avg       0.92      0.92      0.92       280
weighted avg       0.92      0.92      0.92       280


Accuracy Report
 0.9214285714285714

Precision Report
 0.9328358208955224

Recall Report
 0.9057971014492754

F1 score Report
 0.9191176470588235

Confusion metrix Report
 [[133   9]
 [ 13 125]]


deployment

In [17]:
x.iloc[[0]]

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
369,3,36.0,3,9,4686.0,2,0,2.0,3,2,2.0,2,32.0,1,0,1.0,0,1.0,1,1


In [18]:
model3.predict([[]])



ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required by RandomForestClassifier.