In [None]:
#Import necessary libraries
import pandas as pd
import numpy as np

I imported the necessary libraries:

Pandas is used for data manipulation

Numpy is used to work on arrays and numerical data

In [None]:
# Mount Google Drive inside the Colab runtime so files stored there can be read
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Using the `drive` module from the google colab library, mount Google Drive to give the notebook permission to access my files

In [None]:
# Read the CSV stored on the mounted Google Drive into a DataFrame
data_path = '/content/drive/My Drive/data.csv'
cancer = pd.read_csv(data_path)

Using my chosen variable name `cancer`, I loaded the dataset from my Google Drive into the notebook

**DATA EXPLORATION**

In [None]:
# Preview the first 10 rows of the dataset
cancer.head(n=10)

Viewing the first 10 rows in my data

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes
cancer.info()

Checking a summary of the data(How many rows in each column and their datatypes and non-null counts)

In [None]:
# Show the dimensions of the DataFrame as (number of rows, number of columns)
cancer.shape

Checking the number of rows and columns in our data

In [None]:
# List the distinct labels present in the target column 'diagnosis'
cancer['diagnosis'].unique()

View the unique elements in the column diagnosis

In [None]:
# Show the data type of every column
cancer.dtypes

Checking the data types of each column

In [None]:
# Count the missing values in each column
cancer.isna().sum()

Check for nulls in the dataset and remove through data cleaning

In [None]:
# Count the rows that are exact duplicates of an earlier row
cancer.duplicated().sum()

Check whether the data has duplicate rows; in this case there were none

In [None]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
cancer.describe()

View a summary of the statistics of the data

In [None]:
# Split the column names into two lists: those stored with the 'object' dtype
# (treated as categorical) and all remaining columns.
categorical_columns = [name for name in cancer.columns if cancer[name].dtype == 'object']
non_categorical_columns = [name for name in cancer.columns if not cancer[name].dtype == 'object']

print("Categorical columns")
print(categorical_columns)
print("Non_categorical columns")
print(non_categorical_columns)

Grouping our data to check for categorical and non categorical dtypes

**DATA CLEANING**

Since there are nulls in column "Unnamed: 32", clean the data by replacing the nulls with 0

In [None]:
# Count the missing values in the 'Unnamed: 32' column
cancer['Unnamed: 32'].isna().sum()

In [None]:
# Replace the missing values in 'Unnamed: 32' with 0, then confirm none remain
unnamed_col = 'Unnamed: 32'
cancer[unnamed_col] = cancer[unnamed_col].fillna(0)
cancer[unnamed_col].isnull().sum()

In [None]:
#Remove outliers
#list of columns to check outliers
columns = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
    'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean',
    'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se',
    'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
    'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst',
    'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32',
]

def _within_limits(column, threshold=3):
  """Return a boolean Series marking the rows of `cancer` whose value in
  `column` lies within `threshold` standard deviations of the column mean
  (both limits inclusive)."""
  mean = cancer[column].mean()
  std = cancer[column].std()
  lower_limit = mean - threshold * std
  upper_limit = mean + threshold * std
  return (cancer[column] >= lower_limit) & (cancer[column] <= upper_limit)

#create a function to clean outliers
def clean_outliers(column):
  """Return the rows of `cancer` that are not outliers in the single given
  `column`, using the mean +/- 3 standard deviations rule."""
  return cancer[_within_limits(column)]

# Combine the per-column masks so a row is kept only if it is within limits in
# EVERY checked column. Re-assigning the result of clean_outliers(column) on
# each iteration would keep only the filter of the last column in the list.
# A constant column (std == 0, e.g. 'Unnamed: 32' after filling with 0) keeps
# every row, so it does not affect the combined mask.
keep_rows = pd.Series(True, index=cancer.index)
for column in columns:
  keep_rows &= _within_limits(column)

new_cancer = cancer[keep_rows]
new_cancer.shape

In [None]:
# Show the shape of the original DataFrame, for comparison with new_cancer.shape
cancer.shape

Compare this shape with the shape of the filtered data above: the data has no outliers at our threshold only if the two shapes are the same

**DATA PREPROCESSING**
Feature engineering

Transform raw data from our data to features usable in machine learning.

Encode the relevant data to be used

In [None]:
#Import libraries for encoding
from sklearn.preprocessing import LabelEncoder

In [None]:
#Encoding the relevant categorical columns
#encoding diagnosis
cancer['diagnosis'].unique()
cancer['diagnosis'] = cancer['diagnosis'].map({'M':0,'B':1})
print(cancer['diagnosis'].unique())

**MODEL BUILDING**

Since predicting diagnosis is a binary classification problem, we can use logistic regression, random forest or decision trees

**Logistic regression**

In [None]:
#Import libraries for training and testing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
#Giving x and y variables
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Besides the target, drop columns that are not measurements:
# 'Unnamed: 32' is the placeholder column that was filled with 0 above, and
# 'id' is presumably a record identifier (TODO confirm against the data).
# errors='ignore' makes the drop a no-op for any of these that are absent.
non_feature_columns = ['diagnosis', 'id', 'Unnamed: 32']
x = cancer.drop(columns=non_feature_columns, errors='ignore')
y = cancer['diagnosis']
#train and test the data (80% train / 20% test, fixed seed for a reproducible split)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=50)
#give the model a variable
# Standardize the features before the logistic regression: the columns are on
# very different scales, which hampers convergence of the solver. The scaler is
# part of the pipeline so it is fitted on the training data only.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
#fit the model
model.fit(x_train,y_train)
#make predictions
y_pred = model.predict(x_test)
#Get the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



*   Train the model on 80% of the data and test it on the remaining 20%
*   Random_state is the seed used when shuffling the data before splitting it. Fixing it makes the split reproducible; a larger value does not make the model better, and in this case the value used was 50
*   Give the model a variable
*   Fit the model and make a prediction of the outcome you need
*   Get the accuracy of the prediction

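Our model accuracy was 34%, which is worse than always predicting the most common class (at least 50% in a binary problem). This means we can't use this logistic regression model for prediction as it stands and have to use an alternative model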

**Random forest**

In [None]:
#Libraries for random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
#Giving x and y variables
# Besides the target, drop columns that are not measurements:
# 'Unnamed: 32' is the placeholder column that was filled with 0 above, and
# 'id' is presumably a record identifier (TODO confirm against the data).
# errors='ignore' makes the drop a no-op for any of these that are absent.
non_feature_columns = ['diagnosis', 'id', 'Unnamed: 32']
x = cancer.drop(columns=non_feature_columns, errors='ignore')
y = cancer['diagnosis']
#train and test the data (80% train / 20% test, fixed seed for a reproducible split)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=50)
#give the model a variable
# Shallow forest (max_depth=3) with a fixed seed so the result is reproducible
model = RandomForestClassifier(criterion = 'gini', max_depth = 3,random_state=0)
#fit the model
model.fit(x_train,y_train)
#make predictions
y_pred = model.predict(x_test)
#Get the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Random Forest Classifier was the most accurate model with an accuracy of 95% and now we can tune it to find a better performance of the same model.

**Model Tuning**

In [None]:
#Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
#get the param grid: every combination of these values is evaluated
param_grid = {
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3]
}
#create a model variable
# Same fixed settings and seed as the baseline forest, so that differences in
# cross-validation score come from the searched parameters and the selected
# best_params are the same on every run.
mod = RandomForestClassifier(criterion='gini',max_depth=3,random_state=0)
#perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(mod, param_grid, cv=5)
#fit the grid search
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_
print(best_params)

* Import the GridSearchCV class from the sklearn model selection module, which will help in finding the best parameters to use

* Make a dictionary of the parameters and the candidate values that will be searched to find the best combination
* cv stands for cross-validation, in which the training data is split into 5 folds and each combination of hyperparameters is trained and validated on them
* Fit the grid search to the data.
* After the grid search, use the attribute best_params_ to get the best parameters in the parameter grid

In [None]:
from sklearn.metrics import roc_auc_score
#Original fit and prediction (baseline model, before tuning)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
# Probability of class 1 is what ROC AUC needs, not the hard labels
y_proba = model.predict_proba(x_test)[:, 1]

#name the retrained model
# Rebuild the forest with the same fixed settings that were used during the
# grid search (criterion, max_depth, seed) plus the selected parameters, so the
# evaluated model is the configuration that was actually tuned. Values in
# best_params take precedence if a key appears in both.
best_model_params = {'criterion': 'gini', 'max_depth': 3, 'random_state': 0, **best_params}
best_model = RandomForestClassifier(**best_model_params)
best_model.fit(x_train,y_train)
y_pred_best_labels = best_model.predict(x_test)
y_pred_best = best_model.predict_proba(x_test)[:, 1]

# Compute the SAME two metrics for both models so that before/after tuning can
# be compared like for like. The results are stored under new names so the
# imported roc_auc_score function is not overwritten by a float.
accuracy = accuracy_score(y_test,y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
best_accuracy = accuracy_score(y_test, y_pred_best_labels)
best_roc_auc = roc_auc_score(y_test, y_pred_best)

print(f"Baseline accuracy score: {accuracy}")
print(f"Baseline ROC AUC SCORE: {roc_auc}")
print(f"Tuned accuracy score: {best_accuracy}")
print(f"Tuned ROC AUC SCORE: {best_roc_auc}")

* Import ROC AUC Score to evaluate performance of the new model
* Fit the new model to the data
* Make the prediction of the new outcome
* Get the accuracy and roc_auc_score and compare the model performance before and after tuning the model

This is the best model fit for our data since it gave us a model performance of 99.5% after tuning