## 1. EXPLORATORY DATA ANALYSIS

1.1 UNDERSTANDING THE DATA

In [None]:
# importing libraries
import numpy as np                                                                                                                                                                                                                                                                                                                                                                                                          
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

In [None]:
# Load the dataset from CSV into a dataframe.
# NOTE: this is an absolute path on one specific Windows machine; point it
# at your own copy of data.csv before running the notebook elsewhere.
df = pd.read_csv(filepath_or_buffer="C:\\Users\\kanan\\Downloads\\data.csv")

In [None]:
# Preview the first five rows (DataFrame.head() defaults to n=5).
df.head()

In [None]:
# Dimensions of the dataframe as a (rows, columns) tuple.
df.shape

In [None]:
# Concise summary of the dataframe: column names, non-null counts,
# dtypes and memory usage.
df.info()

In [None]:
# Column labels of the dataframe.
df.columns

In [None]:
# Number of null (NaN) entries in each column.
df.isnull().sum()

In [None]:
# Remove the 'Unnamed: 32' column from the dataframe, in place.
df.drop(columns=["Unnamed: 32"], inplace=True)

In [None]:
# Remove the 'id' column from the dataframe, in place.
df.drop(columns=['id'], inplace=True)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# numeric columns.
df.describe()

1.2 DATA VISUALIZATIONS

In [None]:
# Bar chart of how many rows fall into each 'diagnosis' class.
plt.figure(figsize=(8, 7))
sns.countplot(data=df, x="diagnosis", palette='magma')

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# At this point 'diagnosis' still holds the string labels 'B'/'M' (it is
# only mapped to 0/1 further down), and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0. Selecting the numeric columns first
# works on every pandas version and matches what older versions did
# implicitly (they silently skipped non-numeric columns).
plt.figure(figsize=(20,18))
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(), annot=True, linewidths=.5, cmap="Purples")

In [None]:
# Column labels again, to copy the feature names into the groups below.
df.columns

In [None]:
# The dataset describes the same ten measurements three times over, with the
# suffixes '_mean', '_se' and '_worst'. Build one column list per suffix,
# each starting with 'diagnosis' so it can be used as the plot hue.
_feature_stems = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]

# 'diagnosis' + the *_mean columns
m_col = ['diagnosis'] + [stem + '_mean' for stem in _feature_stems]

# 'diagnosis' + the *_se columns
s_col = ['diagnosis'] + [stem + '_se' for stem in _feature_stems]

# 'diagnosis' + the *_worst columns
w_col = ['diagnosis'] + [stem + '_worst' for stem in _feature_stems]

FOR MEAN COLUMNS

In [None]:
# Pairwise scatter plots of the *_mean features, coloured by diagnosis.
sns.pairplot(data=df.loc[:, m_col], hue='diagnosis', palette='Blues')

FOR SE COLUMNS

In [None]:
# Pairwise scatter plots of the *_se features, coloured by diagnosis.
sns.pairplot(data=df.loc[:, s_col], hue='diagnosis', palette='Greens')

FOR WORST COLUMNS

In [None]:
# Pairwise scatter plots of the *_worst features, coloured by diagnosis.
sns.pairplot(data=df.loc[:, w_col], hue='diagnosis', palette='Oranges')

## 2. DATA PREPROCESSING AND BUILDING MODELS

2.1 DATA PREPROCESSING

In [None]:
# Number of rows for each distinct value in the 'diagnosis' column.
df['diagnosis'].value_counts()

In [None]:
# Encode the diagnosis labels as integers: 'B' -> 0, 'M' -> 1.
# NOTE: Series.map() turns any value that is not a key of the mapping into
# NaN, so re-running this cell after the labels are already numeric would
# wipe the column.
label_codes = {'B': 0, 'M': 1}
df['diagnosis'] = df['diagnosis'].map(label_codes)

In [None]:
# Class counts again, to check the result of the numeric encoding.
df['diagnosis'].value_counts()

2.2 SPLITTING THE DATA INTO TRAIN AND TEST

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state fixes the shuffle so
# the split is reproducible. The features are every column except
# 'diagnosis', which is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='diagnosis'),
    df['diagnosis'],
    random_state=42,
    test_size=0.2,
)

print("Shape of training set:", X_train.shape)
print("Shape of test set:", X_test.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features to zero mean / unit variance.
# The scaler is fitted on the training data ONLY and then applied unchanged
# to the test data. Calling fit_transform on X_test would re-estimate the
# mean and variance from the test set, which leaks test information and
# puts the two sets on different scales.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

2.3 CLASSIFICATION MODELS

2.3.1 LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training data,
# then predict the labels of the test set. fit() returns the estimator
# itself, so construction and fitting can be chained.
logreg = LogisticRegression().fit(X_train, y_train)
predictions1 = logreg.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix and per-class precision / recall / F1 for the
# logistic regression predictions on the test set.
print("Confusion Matrix: \n", confusion_matrix(y_true=y_test, y_pred=predictions1))
print('\n')
print(classification_report(y_true=y_test, y_pred=predictions1))

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows the logistic regression classified correctly.
logreg_acc = accuracy_score(y_true=y_test, y_pred=predictions1)
print("Accuracy of the Logistic Regression Model is: ", logreg_acc)

2.3.2 K NEAREST NEIGHBOURS

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Estimate the error rate of each candidate k (1..41) so that the best
# n_neighbors can be read off the plot.
# The candidates are scored with 5-fold cross-validation on the training
# data only: choosing k by its error on X_test would tune the model to the
# test set and make the final test accuracy optimistically biased.
from sklearn.model_selection import cross_val_score

error_rate = []

for i in range(1,42):
    knn = KNeighborsClassifier(n_neighbors=i)
    # the default score of a classifier is accuracy, so error = 1 - accuracy
    cv_accuracy = cross_val_score(knn, X_train, y_train, cv=5)
    error_rate.append(1 - cv_accuracy.mean())

In [None]:
# Plot the error rate obtained for each candidate k (1..41).
k_values = range(1,42)
plt.figure(figsize=(12,6))
plt.plot(k_values, error_rate, linestyle="--", color='purple',
         marker='o', markerfacecolor='b', markersize=10)
plt.title('Error_Rate vs K-value')
plt.show()

In [None]:
# Final KNN model with k = 9, fitted on the training data and used to
# predict the labels of the test set.
knn = KNeighborsClassifier(n_neighbors=9).fit(X_train, y_train)
predictions2 = knn.predict(X_test)

In [None]:
# Confusion matrix and per-class precision / recall / F1 for the KNN
# predictions on the test set.
print(confusion_matrix(y_true=y_test, y_pred=predictions2))
print("\n")
print(classification_report(y_true=y_test, y_pred=predictions2))

In [None]:
# Fraction of test rows the KNN model classified correctly.
knn_model_acc = accuracy_score(y_true=y_test, y_pred=predictions2)
print("Accuracy of K Neighbors Classifier Model is: ", knn_model_acc)

2.3.3 RANDOM FORESTS

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 300 trees, fitted on the training data and used to
# predict the labels of the test set.
# random_state is fixed (same seed as the train/test split) because the
# forest draws random bootstrap samples and feature subsets; without a seed
# the reported metrics change from run to run.
rfc = RandomForestClassifier(n_estimators=300, random_state=42)
rfc.fit(X_train, y_train)
predictions4 = rfc.predict(X_test)

In [None]:
# Confusion matrix and per-class precision / recall / F1 for the random
# forest predictions on the test set.
print("Confusion Matrix: \n", confusion_matrix(y_true=y_test, y_pred=predictions4))
print("\n")
print(classification_report(y_true=y_test, y_pred=predictions4))

In [None]:
# Fraction of test rows the random forest classified correctly.
rfc_acc = accuracy_score(y_true=y_test, y_pred=predictions4)
print("Accuracy of Random Forests Model is: ", rfc_acc)

2.3.4 SUPPORT VECTOR MACHINES (SVM)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a polynomial kernel, fitted on the
# training data and used to predict the labels of the test set.
svc_model = SVC(kernel="poly").fit(X_train, y_train)
predictions5 = svc_model.predict(X_test)

In [None]:
# Confusion matrix and per-class precision / recall / F1 for the SVM
# predictions on the test set.
print("Confusion Matrix: \n", confusion_matrix(y_true=y_test, y_pred=predictions5))
print("\n")
print(classification_report(y_true=y_test, y_pred=predictions5))

In [None]:
# Fraction of test rows the SVM classified correctly.
svm_acc = accuracy_score(y_true=y_test, y_pred=predictions5)
print("Accuracy of SVM model is: ", svm_acc)

## 3. FINAL RESULTS

In [None]:
# Print the test accuracy of each model, one per line, in the order
# logistic regression, KNN, random forest, SVM.
for acc in (logreg_acc, knn_model_acc, rfc_acc, svm_acc):
    print(acc)

In [None]:
# Bar chart comparing the four models: accuracy on the x-axis, model name
# on the y-axis.
model_name = ['LogisticRegression', 'KNN', 'RandomForests', 'SVM']
model_acc = [logreg_acc, knn_model_acc, rfc_acc, svm_acc]
plt.figure(figsize=(12,6))
sns.barplot(y=model_name, x=model_acc, palette='magma')