In [5]:
import pickle

import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Load dataset from the Excel file
file_path = './content/cancer patient data sets.xlsx'
df = pd.read_excel(file_path)

# Encode the two categorical columns as integer codes. Each column gets its own
# encoder so the Gender mapping is not overwritten when Level is fitted.
gender_encoder = LabelEncoder()
df['Gender'] = gender_encoder.fit_transform(df['Gender'])
# NOTE(review): LabelEncoder produces 0-based codes, while the interactive
# prompt asks for gender as 1/2 and passes the raw answer to the scaler --
# confirm the two encodings agree.

# `label_encoder` holds the Level mapping; it is used later to turn predicted
# codes back into level names.
label_encoder = LabelEncoder()
df['Level'] = label_encoder.fit_transform(df['Level'])

X = df.drop(columns=['Patient Id', 'Level'])  # Drop non-predictive columns
y = df['Level']

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# held-out rows influence the mean/variance applied to the training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # statistics come from training rows only
X_test = scaler.transform(X_test)
# Full scaled matrix, kept so any cell that still reads `X_scaled` keeps working.
X_scaled = scaler.transform(X)

In [8]:
# Fit a random forest on the training split (fixed seed so reruns match).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out split.
# NOTE(review): the recorded output reports 100% accuracy -- worth checking
# for duplicated rows or leakage before trusting this number.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 100.00%


In [11]:
# Serialize the fitted classifier and write the bytes to the pickle file
with open('cancer_prediction_model.pkl', 'wb') as file:
    file.write(pickle.dumps(model))

In [12]:
# Prompts asked by cancer_prediction(), one per model input feature.
# The answers are passed to the scaler and model positionally, so the order
# below has to follow the column order of the training features
# (presumably the dataset's own column order -- confirm against X.columns).
_PROFILE_QUESTIONS = [
    "What is your age?",
    "What is your gender? (1: Female, 2: Male)",
]

_RISK_FACTOR_QUESTIONS = [
    "Rate the air pollution in your area (1 to 10):",
    "Rate your alcohol consumption (1 to 10):",
    "Rate the severity of your dust allergy (1 to 10):",
    "Rate your exposure to occupational hazards (1 to 10):",
    "Rate your genetic risk of cancer (1 to 10):",
    "Rate the severity of your chronic lung disease (1 to 10):",
    "Rate how balanced your diet is (1 to 10):",
    "Rate your obesity level (1 to 10):",
    "Rate the level of smoking (1 to 10):",
    "Rate the passive smoker risk (1 to 10):",
]

_SYMPTOM_QUESTIONS = [
    "Rate your chest pain severity (1 to 10):",
    "Rate your coughing blood level (1 to 10):",
    "Rate your fatigue level (1 to 10):",
    "Rate your weight loss severity (1 to 10):",
    "Rate your shortness of breath severity (1 to 10):",
    "Rate your wheezing level (1 to 10):",
    "Rate the swallowing difficulty (1 to 10):",
    "Rate the clubbing of your finger nails (1 to 10):",
    "Rate the frequency of colds (1 to 10):",
    "Rate your dry cough level (1 to 10):",
    "Rate your snoring severity (1 to 10):",
]

questions = [*_PROFILE_QUESTIONS, *_RISK_FACTOR_QUESTIONS, *_SYMPTOM_QUESTIONS]


def _ask_number(question):
    """Prompt with `question` until the reply parses as a float; return that float."""
    while True:
        reply = input(question)
        try:
            return float(reply)
        except ValueError:
            # Re-ask instead of aborting the whole questionnaire on one typo.
            print("Invalid input. Please enter a number.")


def cancer_prediction():
    """Ask the questionnaire interactively and predict the cancer risk level.

    Loads the classifier from 'cancer_prediction_model.pkl' in the current
    working directory, asks every prompt in the module-level `questions` list,
    scales the answers with the module-level `scaler`, and decodes the
    predicted class with the module-level `label_encoder`. Those three names
    are not stored in the pickle, so the cells defining them must have been
    run in the current kernel.

    Returns:
        The decoded risk level for the entered answers, or None (after
        printing "No model detected") when the model file is missing or
        contains None.
    """
    # NOTE: pickle.load executes code embedded in the file -- only load model
    # files produced by this notebook, never ones from an untrusted source.
    try:
        with open('cancer_prediction_model.pkl', 'rb') as file:
            model = pickle.load(file)
    except FileNotFoundError:
        print("No model detected")
        return None

    if model is None:
        print("No model detected")
        return None

    print("\nPlease answer the following questions:")
    # One numeric answer per question, in question order.
    # NOTE(review): answers are used as typed; the gender answer (1/2) is not
    # passed through the encoding applied to the Gender column at training
    # time -- confirm the two agree.
    user_input = [_ask_number(question) for question in questions]

    # Preprocess user input: a single-row batch scaled like the training features
    user_input_scaled = scaler.transform([user_input])

    # Predict the encoded class and map it back to its original label
    prediction = model.predict(user_input_scaled)
    predicted_level = label_encoder.inverse_transform(prediction)

    print(f"\nPredicted Cancer Risk Level: {predicted_level[0]}")

    return predicted_level[0]