In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# Step 2: Load the Dataset
# Point `file_path` at the CSV to analyse; it is read into the DataFrame `df`.
file_path = 'cancer-dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Show a heading followed by the first five rows as a quick sanity check.
print("Dataset Overview:", df.head(), sep="\n")

Dataset Overview:
   Age  Gender        BMI  Smoking  GeneticRisk  PhysicalActivity  \
0   58       1  16.085313        0            1          8.146251   
1   71       0  30.828784        0            1          9.361630   
2   48       1  38.785084        0            2          5.135179   
3   34       0  30.040296        0            0          9.502792   
4   62       1  35.479721        0            0          5.356890   

   AlcoholIntake  CancerHistory  Diagnosis  
0       4.148219              1          1  
1       3.519683              0          0  
2       4.728368              0          1  
3       2.044636              0          0  
4       3.309849              0          1  


In [4]:
# Step 3: Data Preprocessing
# Report the number of missing values per column.
missing_per_column = df.isna().sum()
print("\nMissing Values in the Dataset:", missing_per_column, sep="\n")

# Fit a label encoder on 'Gender' and write the encoded codes back into df.
# NOTE(review): the dataset preview shows Gender already stored as 0/1, so the
# encoder is fitted on integer codes rather than on text labels -- confirm
# before feeding it strings later on.
label_encoder = LabelEncoder().fit(df['Gender'])
df['Gender'] = label_encoder.transform(df['Gender'])

# Separate the target column from the feature columns.
y = df['Diagnosis']  # Target variable
X = df.drop('Diagnosis', axis=1)  # Features: every column except the target

# Standardise every feature column; the scaler is fitted on all rows of X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


Missing Values in the Dataset:
Age                 0
Gender              0
BMI                 0
Smoking             0
GeneticRisk         0
PhysicalActivity    0
AlcoholIntake       0
CancerHistory       0
Diagnosis           0
dtype: int64


In [5]:
# Step 4: Split the Data
# Split the *unscaled* feature frame X (not X_scaled): X_scaled was standardised
# with the mean/std of the whole dataset, so splitting it would let statistics
# of the test rows leak into preprocessing.
# Same 80/20 stratified split and seed as before, so the row assignment is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Refit the scaler on the training rows only and reuse those statistics for the
# test rows. `scaler` is rebound on purpose so that any later cell calling
# scaler.transform(...) uses the train-only statistics the model was fitted with.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [6]:
# Step 5: Train the Logistic Regression Model
# Build the classifier with library defaults, then fit it on the training split.
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=y_train)

In [7]:
# Step 6: Evaluate the Model
# Predict labels for the held-out test split.
y_pred = log_reg.predict(X_test)

# Compute all three metrics first, then print them in a fixed order.
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

# Overall fraction of correct predictions, rounded to two decimals.
print(f"\nModel Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 table.
print("\nClassification Report:", report_text, sep="\n")

# Confusion matrix as returned by confusion_matrix(y_test, y_pred).
print("\nConfusion Matrix:", conf_mat, sep="\n")


Model Accuracy: 0.84

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.88      0.88       189
           1       0.80      0.77      0.79       111

    accuracy                           0.84       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.84      0.84      0.84       300


Confusion Matrix:
[[167  22]
 [ 25  86]]


In [9]:
# Step 7: Predict for New User Input
# Reads one patient's feature values from stdin, encodes and scales them with
# the already-fitted `label_encoder` and `scaler`, and prints the class that
# `log_reg` predicts for that single row.
print("\n--- Cancer Detection Prediction ---")
print("Enter the following details to predict cancer diagnosis:")

# Collect user input. Keys are inserted in the same order as the training
# feature columns (every dataset column except 'Diagnosis').
user_input = {}
user_input['Age'] = float(input("Age: "))
# Gender is stored as integer codes (0/1) in the dataset and the encoder was
# fitted on those codes, so passing a 'Male'/'Female' string to
# label_encoder.transform raises ValueError (unseen label). Ask for the code.
user_input['Gender'] = int(input("Gender (0 or 1, as coded in the dataset): "))
user_input['BMI'] = float(input("BMI: "))
user_input['Smoking'] = int(input("Smoking Status (0 = No, 1 = Yes): "))
user_input['GeneticRisk'] = int(input("Genetic Risk Level (0 = Low, 1 = Medium, 2 = High): "))
# PhysicalActivity and AlcoholIntake are continuous columns in the dataset
# (e.g. 8.146251 and 4.148219), so parse them as floats; int() would reject
# an entry such as "8.5", and a 0/1 flag does not match the training data.
user_input['PhysicalActivity'] = float(input("Physical Activity Hours/Week: "))
user_input['AlcoholIntake'] = float(input("Alcohol Intake (numeric amount, same scale as the dataset): "))
user_input['CancerHistory'] = int(input("Family Cancer History (0 = No, 1 = Yes): "))

# Encode Gender with the fitted encoder; a code the encoder never saw raises
# ValueError here instead of silently producing a prediction.
user_input['Gender'] = label_encoder.transform([user_input['Gender']])[0]

# Convert input into a one-row DataFrame and apply the fitted scaler.
user_data = pd.DataFrame([user_input])
user_data_scaled = scaler.transform(user_data)

# Make prediction (array with a single element for the single input row).
prediction = log_reg.predict(user_data_scaled)

# Output Prediction Result
if prediction[0] == 1:
    print("\nPrediction: The patient is at risk of cancer (Positive Diagnosis).")
else:
    print("\nPrediction: The patient is NOT at risk of cancer (Negative Diagnosis).")


--- Cancer Detection Prediction ---
Enter the following details to predict cancer diagnosis:

Prediction: The patient is NOT at risk of cancer (Negative Diagnosis).
