# Data Set Information:

There are 10 predictors, all quantitative, and a binary dependent variable, indicating the presence or absence of breast cancer.
The predictors are anthropometric data and parameters which can be gathered in routine blood analysis.
Prediction models based on these predictors, if accurate, can potentially be used as a biomarker of breast cancer.


Attribute Information:

Quantitative Attributes:
Age (years)
BMI (kg/m2)
Glucose (mg/dL)
Insulin (µU/mL)
HOMA
Leptin (ng/mL)
Adiponectin (µg/mL)
Resistin (ng/mL)
MCP-1(pg/dL)

Labels:
1=Healthy controls
2=Patients

# classification

In [1]:
# import pandas as pd
import pandas as pd

In [2]:
# read the data set
df=pd.read_csv('dataR2.csv')
df.head(10)

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,1
5,49,22.854458,92,3.226,0.732087,6.8317,13.67975,10.3176,530.41,1
6,89,22.7,77,4.69,0.890787,6.964,5.589865,12.9361,1256.083,1
7,76,23.8,118,6.47,1.883201,4.311,13.25132,5.1042,280.694,1
8,73,22.0,97,3.35,0.801543,4.47,10.358725,6.28445,136.855,1
9,75,23.0,83,4.952,1.013839,17.127,11.57899,7.0913,318.302,1


In [3]:
# to find no of rows and columns
df.shape

(116, 10)

In [4]:
# to find null values
df.isnull().sum()

Age               0
BMI               0
Glucose           0
Insulin           0
HOMA              0
Leptin            0
Adiponectin       0
Resistin          0
MCP.1             0
Classification    0
dtype: int64

In [5]:
# converting the float and string values into integers using functions
from sklearn.preprocessing import LabelEncoder
def lbc():
    lbc=LabelEncoder()
    for col in df.columns:
        df[col]=lbc.fit_transform(df[col])
        
lbc()
df.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,17,33,1,3,0,14,68,37,53,0
1,47,5,19,9,13,15,26,4,57,0
2,46,30,18,33,34,50,108,45,67,0
3,35,14,5,11,7,21,41,66,104,0
4,49,11,19,19,23,4,20,56,93,0


In [6]:
# intialising the x and y values
y=df['Classification']
x=df.drop('Classification',axis=1)

In [7]:
# train and test the data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

# logistic regression

In [8]:
from sklearn.linear_model import LogisticRegression
lg=LogisticRegression()
lg.fit(x_train,y_train)

y_pred=lg.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)



0.7428571428571429

# Naive Bayes Classifier

In [9]:
from sklearn.naive_bayes import GaussianNB
g=GaussianNB()
g.fit(x_train,y_train)


y_pred=g.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.6857142857142857

# Nearest Neighbor

In [10]:
from sklearn.neighbors import KNeighborsClassifier
kn=KNeighborsClassifier()
kn.fit(x_train,y_train)

y_pred=kn.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.8

# support vector machines

In [11]:
from sklearn.svm import SVC
sv=SVC()
sv.fit(x_train,y_train)

y_pred=sv.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)



0.5142857142857142

# Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier
tree_clf=DecisionTreeClassifier()
tree_clf.fit(x_train,y_train)

y_pred=tree_clf.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.8

# Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
R=RandomForestClassifier()
R.fit(x_train,y_train)

y_pred=R.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)



0.6571428571428571

# program

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# read the dataset
df=pd.read_csv(input('enter the dataset: '))


# converting the float and string values into integers using functions
def lbc():
    lbc=LabelEncoder()
    for col in df.columns:
        df[col]=lbc.fit_transform(df[col])
lbc()

# intializing the attribute for training
i=input('enter the attribute:')
y=df[i]
x=df.drop(i,axis=1)

# test and train the give dataset
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)


# find the accuracy score for logistic regression and it is stored in variable lg_acc
lg=LogisticRegression()
lg.fit(x_train,y_train)
y_pred=lg.predict(x_test)
lg_acc=accuracy_score(y_pred,y_test)


# find the accuracy score for Naive_bayes and it is stored in variable g_acc
g=GaussianNB()
g.fit(x_train,y_train)
y_pred=g.predict(x_test)
g_acc=accuracy_score(y_pred,y_test)


# find the accuracy score for KNeighborsClassifier and it is stored in variable kn_acc
kn=KNeighborsClassifier()
kn.fit(x_train,y_train)
y_pred=kn.predict(x_test)
kn_acc=accuracy_score(y_pred,y_test)


# find the accuracy score for SVC and it is stored in variable sv_acc
sv=SVC()
sv.fit(x_train,y_train)
y_pred=sv.predict(x_test)
sv_acc=accuracy_score(y_pred,y_test)

# find the accuracy score for DecisionTreeClassifier and it is stored in variable tree_acc
tree_clf=DecisionTreeClassifier()
tree_clf.fit(x_train,y_train)
y_pred=tree_clf.predict(x_test)
tree_acc=accuracy_score(y_pred,y_test)

# find the accuracy score for RandomForestClassifier and it is stored in variable R_acc
R=RandomForestClassifier()
R.fit(x_train,y_train)
y_pred=R.predict(x_test)
R_acc=accuracy_score(y_pred,y_test)

# print the all the classifiers accuracies
print('accuracy of logicstic regression is:',lg_acc)
print('accuracy of naive_bayes_Classifier is:',g_acc)
print('accuracy of KNeighborsClassifier is:',kn_acc)
print('accuracy of SVC is:',sv_acc)
print('accuracy of DecisionTreeClassifier is:',tree_acc)
print('accuracy of RandomForestClassifier is:',R_acc)



# convert all the accuracy values into list
li=[lg_acc,g_acc,kn_acc,sv_acc,tree_acc,R_acc]
# print the list of the all accuracy values
print('list of the accuracy:',li)
# print best accuracy value
print('best accuracy value of given data is:',max(li))

enter the dataset: dataR2.csv
enter the attribute:Classification
accuracy of logicstic regression is: 0.7428571428571429
accuracy of naive_bayes_Classifier is: 0.6857142857142857
accuracy of KNeighborsClassifier is: 0.8
accuracy of SVC is: 0.5142857142857142
accuracy of DecisionTreeClassifier is: 0.7428571428571429
accuracy of RandomForestClassifier is: 0.6285714285714286
list of the accuracy: [0.7428571428571429, 0.6857142857142857, 0.8, 0.5142857142857142, 0.7428571428571429, 0.6285714285714286]
best accuracy value of given data is: 0.8


