## All packages and libraries installed as first step

In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

## Generating dataset for cancer diagnostics randomly
##### creating 300 data records for cancer diagnosis with specific features defined. Dataset to include missing values 

In [73]:
# 1. Data Generation for 300 datasets,
# Define features and their data types
features = {
    'Age': np.random.randint(30, 80, size=300),
    'TumorSize': np.random.uniform(0.5, 10, size=300),
    'LymphNodes': np.random.randint(0, 10, size=300),
    'HormoneTherapy': np.random.choice(['Yes', 'No'], size=300),
    'Chemotherapy': np.random.choice(['Yes', 'No'], size=300),
    'Radiotherapy': np.random.choice(['Yes', 'No'], size=300),
    'CancerStage': np.random.choice(['Stage 1', 'Stage 2', 'Stage 3', 'Stage 4'], size=300)
}

In [63]:
# Create DataFrame with the selected features for the machine learning Model
df = pd.DataFrame(features)

# Introduce missing values (randomly)to practice data processing usin for loop to iterate missing values randomly
missing_cols = ['TumorSize', 'LymphNodes']
for col in missing_cols:
    missing_indices = np.random.choice(df.index, size=int(0.05 * len(df)), replace=False)
    df.loc[missing_indices, col] = np.nan

In [5]:
#dataset display 
df

Unnamed: 0,Age,TumorSize,LymphNodes,HormoneTherapy,Chemotherapy,Radiotherapy,CancerStage
0,66,6.140731,2.0,No,No,No,Stage 2
1,66,4.216513,3.0,No,Yes,No,Stage 1
2,68,5.760422,6.0,No,Yes,Yes,Stage 1
3,43,8.053018,1.0,Yes,Yes,Yes,Stage 4
4,64,8.257426,9.0,No,Yes,No,Stage 2
...,...,...,...,...,...,...,...
295,52,2.281350,7.0,No,Yes,No,Stage 1
296,78,0.779809,7.0,Yes,Yes,No,Stage 3
297,47,3.500330,4.0,No,Yes,No,Stage 4
298,55,3.784246,7.0,No,Yes,Yes,Stage 3


In [6]:
#examining data shape
df.shape

(300, 7)

In [7]:
df.describe()

Unnamed: 0,Age,TumorSize,LymphNodes
count,300.0,285.0,285.0
mean,53.983333,5.000863,4.154386
std,13.957283,2.630361,2.937858
min,30.0,0.529321,0.0
25%,43.0,2.800434,2.0
50%,55.0,4.94223,4.0
75%,66.0,7.12948,7.0
max,79.0,9.970035,9.0


In [64]:
# Create target variable for diagnositics (simplified for illustration),
# defining when a diagnositc imply cancer based on the stages status
df['Cancer'] = np.where(
    (df['CancerStage'] == 'Stage 3') | (df['CancerStage'] == 'Stage 4'), 1, 0
)

# 2. Data Preprocessing for machine learning operations,training and test dataset split
# Spliting data into 'x', and 'y', neccessary to drop target veriable in test data splits 

X = df.drop('Cancer', axis=1)
y = df['Cancer']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Processing datasets for ML, grouping categorical and numeric dataset
### Applying sklearn imputer, pipline and OneHotEncoder.

In [65]:
# Define preprocessing pipelines and converting categorical variables into a format that can be provided to machine learning 
#algorithms to improve prediction. It involves creating new binary columns for each unique category in a feature. 
#Each column represents one unique category, and a value of 1 or 0 indicates 
#the presence or absence of that category

numeric_features = ['Age', 'TumorSize', 'LymphNodes']
categorical_features = ['HormoneTherapy', 'Chemotherapy', 'Radiotherapy']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [66]:
# 3. Classification Models
# Logistic Regression
logreg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

In [67]:
# 4. Model Training and Evaluation
models = [logreg_pipeline]
model_names = ['Logistic Regression']

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"F1-score: {f1_score(y_test, y_pred)}")
    print(f"AUC: {roc_auc_score(y_test, y_pred)}")


Logistic Regression Results:
Accuracy: 0.5333333333333333
Precision: 0.5454545454545454
Recall: 0.9090909090909091
F1-score: 0.6818181818181818
AUC: 0.49158249158249157


In [70]:
#Creating a Parameter Grid for Hyperparameter Tuning in Logistic Regression
param_grid = [
    {'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__max_iter'  : [100, 200, 500],
     'classifier__solver': ['liblinear', 'sag', 'saga']
}
]
# using GridSearchCV technique to find the best possible combination of hyperparameters to improve the performance of the model -
grid_search = GridSearchCV(estimator=logreg_pipeline, param_grid=param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


# Evaluating the optimized model for performance improvement
y_pred_best = best_model.predict(X_test)
print("\nOptimized Log Reg Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_rf)}")
print(f"Precision: {precision_score(y_test, y_pred_best_rf)}")
print(f"Recall: {recall_score(y_test, y_pred_best_rf)}")
print(f"F1-score: {f1_score(y_test, y_pred_best_rf)}")
print(f"AUC: {roc_auc_score(y_test, y_pred_best_rf)}")




Optimized Log Reg Results:
Accuracy: 0.55
Precision: 0.55
Recall: 1.0
F1-score: 0.7096774193548387
AUC: 0.5


#### The accuracy of the model improve to 55% from 53%, and recall improved to 100% from 99%