In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# import missingno as mns (pip install missingno)

In [4]:
df = pd.read_csv('credit_customers.csv')

In [5]:
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad


In [6]:
df.isnull().sum()

checking_status           0
duration                  0
credit_history            0
purpose                   0
credit_amount             0
savings_status            0
employment                0
installment_commitment    0
personal_status           0
other_parties             0
residence_since           0
property_magnitude        0
age                       0
other_payment_plans       0
housing                   0
existing_credits          0
job                       0
num_dependents            0
own_telephone             0
foreign_worker            0
class                     0
dtype: int64

In [7]:
missing_column_values = df.isnull().sum()
missing_columns_per = (df.isnull().sum()/ len(df)) * 100
total_missing_values = pd.concat([missing_column_values, missing_columns_per], axis = 1, keys = ['missing Values', 'Percentage'])
total_missing_values = total_missing_values.sort_values('Percentage', ascending= False)
total_missing_values.head(20)

Unnamed: 0,missing Values,Percentage
checking_status,0,0.0
property_magnitude,0,0.0
foreign_worker,0,0.0
own_telephone,0,0.0
num_dependents,0,0.0
job,0,0.0
existing_credits,0,0.0
housing,0,0.0
other_payment_plans,0,0.0
age,0,0.0


In [8]:
df.shape

(1000, 21)

In [9]:
df['class'].value_counts()

class
good    700
bad     300
Name: count, dtype: int64

In [10]:
from sklearn.utils import resample
df_good = df[df['class'] == 'good']
df_bad = df[df['class'] == 'bad']
df_sam = resample(df_bad, n_samples=700)
df = pd.concat([df_good,df_sam], ignore_index = True)
df = df.sample(frac = 1)
df['class'].value_counts()

class
bad     700
good    700
Name: count, dtype: int64

In [11]:
encoder = LabelEncoder()
categorical_data = df.select_dtypes(include = ['object', 'category'])
for x in categorical_data:
    df[x] = encoder.fit_transform(df[x])
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
1176,0,24.0,2,3,2064.0,2,4,3.0,0,2,...,1,34.0,1,1,1.0,0,1.0,1,1,0
1225,3,24.0,2,4,2538.0,2,3,4.0,3,2,...,0,47.0,1,1,2.0,3,2.0,0,1,0
879,2,21.0,1,2,2319.0,2,2,2.0,1,2,...,0,33.0,1,2,1.0,1,1.0,0,1,0
327,3,6.0,3,6,518.0,2,0,3.0,0,2,...,3,29.0,1,1,1.0,1,1.0,0,1,1
1043,1,12.0,3,3,7865.0,2,3,4.0,3,2,...,2,53.0,1,0,1.0,0,1.0,1,1,0


In [12]:
x = df.drop('class', axis = 1)
y = df['class']
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size = 0.2, random_state = 1)

In [13]:
model1 = LogisticRegression(solver= 'newton-cholesky')
model1.fit(xtrain,ytrain)

In [14]:
pred1 = model1.predict(xtest)
print('\nclasification report\n', classification_report(ytest,pred1))
print('\nAccuracy Report\n', accuracy_score(ytest,pred1))
print('\nPrecision Report\n', precision_score(ytest,pred1))
print('\nRecall  Report\n', recall_score(ytest,pred1)) 
print('\nF1 Score Report\n', f1_score(ytest,pred1))
print('\nConfusion Matrix Report\n', confusion_matrix(ytest,pred1))



clasification report
               precision    recall  f1-score   support

           0       0.63      0.65      0.64       136
           1       0.66      0.64      0.65       144

    accuracy                           0.64       280
   macro avg       0.64      0.64      0.64       280
weighted avg       0.64      0.64      0.64       280


Accuracy Report
 0.6428571428571429

Precision Report
 0.6571428571428571

Recall  Report
 0.6388888888888888

F1 Score Report
 0.647887323943662

Confusion Matrix Report
 [[88 48]
 [52 92]]


In [15]:
model2 = RandomForestClassifier()
model2.fit(xtrain,ytrain)


In [16]:
pred2 = model2.predict(xtest)
print('\nclasification report\n', classification_report(ytest,pred2))
print('\nAccuracy Report\n', accuracy_score(ytest,pred2))
print('\nPrecision Report\n', precision_score(ytest,pred2))
print('\nRecall  Report\n', recall_score(ytest,pred2)) 
print('\nF1 Score Report\n', f1_score(ytest,pred2))
print('\nConfusion Matrix Report\n', confusion_matrix(ytest,pred2))



clasification report
               precision    recall  f1-score   support

           0       0.83      0.92      0.87       136
           1       0.91      0.82      0.86       144

    accuracy                           0.87       280
   macro avg       0.87      0.87      0.87       280
weighted avg       0.87      0.87      0.87       280


Accuracy Report
 0.8678571428571429

Precision Report
 0.9147286821705426

Recall  Report
 0.8194444444444444

F1 Score Report
 0.8644688644688645

Confusion Matrix Report
 [[125  11]
 [ 26 118]]


In [17]:
x.iloc[[0]]

model2.predict([[3.0, 63.0, 1.0, 6.0, 3342.0, 4.0, 3.0, 4.0, 3.0, 2.0,2.0, 0.0, 51.0, 1.0, 1.0, 1.0, 1.0, 1.0,1.0,1.0]])



array([1])

In [18]:
df = pd.read_csv('horse.csv')
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101.0,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300.0,0.0,0,no
1,yes,adult,534817.0,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208.0,0.0,0,no
2,no,adult,530334.0,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0.0,0.0,0,yes
3,yes,young,5290409.0,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208.0,0.0,0,yes
4,no,adult,530255.0,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300.0,0.0,0,no


In [19]:
df.isnull().sum()

surgery                    6
age                        6
hospital_number            6
rectal_temp               65
pulse                     30
respiratory_rate          64
temp_of_extremities       61
peripheral_pulse          73
mucous_membrane           52
capillary_refill_time     37
pain                      60
peristalsis               49
abdominal_distention      60
nasogastric_tube         107
nasogastric_reflux       109
nasogastric_reflux_ph    246
rectal_exam_feces        107
abdomen                  124
packed_cell_volume        35
total_protein             38
abdomo_appearance        168
abdomo_protein           201
outcome                    6
surgical_lesion            6
lesion_1                   6
lesion_2                   6
lesion_3                   0
cp_data                    0
dtype: int64

In [20]:
# TO FILL UP THE MISSING VALUES USING LOOP 
numerical_data = df.select_dtypes(include = ['int', 'float'])
categorical_data = df.select_dtypes(include = ['object', 'category'])
for x in numerical_data:
    df[x].fillna(np.mean(df[x]), inplace = True) # TO FILLUP THE NUMERICL DATA WITHT THE MEAN USING LOOP
    
for x in categorical_data:
    df[x].fillna(df[x].mode()[0], inplace = True) # TO FILL UP THE STRAIN VARIABLE USING THE MODE

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(np.mean(df[x]), inplace = True) # TO FILLUP THE NUMERICL DATA WITHT THE MEAN USING LOOP
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(df[x].mode()[0], inplace = True) # TO FILL UP THE STRAIN VARIABLE USING THE MODE


In [21]:
df.isnull().sum()

surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
outcome                  0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
cp_data                  0
dtype: int64

In [22]:
# 1. clean
# 2. run the model but there is an error so we have to rectify it first
# note that y = outcome
df['outcome'].value_counts()

outcome
lived         182
died           76
euthanized     41
Name: count, dtype: int64

In [23]:
x = df.drop('outcome', axis = 1)
y = df['outcome']
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size = 0.15, random_state = 1)

In [35]:
from sklearn.utils import resample
df_lived = df[df['outcome'] == 'lived']
df_died = df[df['outcome'] == 'died']
df_euthanized = df[df['outcome'] == 'euthanized']
df_sam = resample([df_euthanized], n_samples=182)
df_sam1 = resample([df_died], n_samples=182)
df = pd.concat([df_lived,df_sam,df_sam1], ignore_index = True)
df = df.sample(frac = 1)
df['outcome'].value_counts()

KeyError: 'outcome'

In [24]:
model12 = RandomForestClassifier()
model12.fit(xtrain,ytrain)

ValueError: could not convert string to float: 'yes'