## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing Dataset

In [None]:
# The .data file carries no header row (column names are assigned right after
# loading), so header=None keeps pandas from consuming the first record as the
# header and silently dropping one sample.
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-state/breast-cancer-wisconsin.data',
                 header=None)
df.head()

## Assigning Column Names

In [None]:
# Human-readable names for the eleven fields, in the order they appear in the file.
column_names = [
    'Id',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
df.columns = column_names
df.head()

## Info About Columns

In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Recode the target to a 0/1 label: original code 2 becomes 0 and code 4 becomes 1
# (presumably benign / malignant per the dataset description — confirm).
class_codes = {2: 0, 4: 1}
df['Class'] = df['Class'].replace(class_codes)
df.head()

In [None]:
# Count NaN values per column. '?' placeholders are only converted to NaN in a
# later cell, so they are not counted as missing here.
df.isnull().sum()

In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

## Changing Datatype for Bare Nuclei Column

In [None]:
# Turn every '?' placeholder in the frame into NaN, then cast 'Bare Nuclei'
# to a numeric dtype so it can take part in arithmetic and modelling.
df = df.replace(to_replace='?', value=np.nan)
bare_nuclei_numeric = pd.to_numeric(df['Bare Nuclei'])
df['Bare Nuclei'] = bare_nuclei_numeric
df.dtypes

In [None]:
# Count NaN values per column now that '?' placeholders have been replaced with NaN.
df.isna().sum()

In [None]:
# Mean imputation: every NaN is replaced by the mean of its own column.
column_means = df.mean()
df = df.fillna(value=column_means)
# Verify that no missing values remain.
df.isna().sum()

## Exploratory Data Analysis

In [None]:
# Drop the identifier column before correlating; errors="ignore" keeps this
# cell re-runnable once "Id" has already been removed from df.
df = df.drop(columns="Id", errors="ignore")
corr = df.corr()

# Boolean mask covering the upper triangle (diagonal included), so each
# pairwise correlation is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 6))
# Plot the heatmap with the mask applied
sns.heatmap(corr, mask=mask, cmap="YlGnBu", annot=True, fmt=".2f", ax=ax)
ax.set_title("Feature Correlation Matrix")

plt.show()

## Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Separate the predictors from the target, then hold out 20% of the rows for
# testing; the fixed random_state makes the split reproducible.
features = df.drop(columns="Class")
target = df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=10,
)

# Logistic-regression classifier with default settings.
lg = LogisticRegression()

## Model Training

In [None]:
# Fit on the training split and predict the held-out split in one chain
# (fit returns the estimator itself). The variable name keeps its original
# spelling because later cells reference it.
lg_preditcions = lg.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Print the raw confusion matrix for the logistic-regression predictions
# (rows: actual class, columns: predicted class).
print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, lg_preditcions))

In [None]:
# Annotated confusion-matrix heatmap for the logistic-regression predictions.
# NOTE: the rf_ prefixes are kept so any other cell relying on these names
# keeps working; the values describe the logistic-regression model.
# labels=[0, 1] pins the matrix to 2x2 even if one class is absent from the
# test split or the predictions, so the reshape below cannot fail.
rf_cf_matrix = confusion_matrix(y_test, lg_preditcions, labels=[0, 1])

# Cell order of the flattened matrix: [[TN, FP], [FN, TP]].
group_names_rf = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
group_counts_rf = [f"{value:0.0f}" for value in rf_cf_matrix.flatten()]
group_percentages_rf = [f"{value:.2%}" for value in rf_cf_matrix.flatten() / np.sum(rf_cf_matrix)]

# One three-line label per cell: name, absolute count, share of all test samples.
labels_rf = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names_rf, group_counts_rf, group_percentages_rf)]
labels_rf = np.asarray(labels_rf).reshape(2, 2)

ax = sns.heatmap(rf_cf_matrix, annot=labels_rf, fmt='')
ax.set(title="Confusion Matrix - Logistic Regression",
       xlabel="Predicted class", ylabel="Actual class")
# Show the plot
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted class matches the true class.
accuracy = accuracy_score(y_test, lg_preditcions)
print("Accuracy: ", accuracy)