In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
import os
import numpy as nump
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict

warnings.filterwarnings('ignore')

df = pd.read_csv('pancreatic.csv')

columns_to_drop = ['sample_id', 'patient_cohort', 'sample_origin', 'stage', 'benign_sample_diagnosis'] 
df.drop(columns=columns_to_drop, inplace=True)
df.dropna(axis=1, inplace=True) #this removes the column with null values
# df.dropna(axis=0, inplace=True) #this removes the row with null values
df['sex'].replace({'M': 0, 'F': 1}, inplace=True)
df
df_train, df_test = train_test_split(df, test_size=0.2)
john = df_train.loc[:, ["age","sex","creatinine","LYVE1","REG1B","TFF1"]]
john
df_train, df_test = train_test_split(df, test_size=0.2)
X_train, y_train, X_test, y_test = df_train.loc[:, ["age","sex","creatinine","LYVE1","REG1B","TFF1"]], df_train.loc[:, ["diagnosis"]], df_test.loc[:, ["age","sex","creatinine","LYVE1","REG1B","TFF1"]], df_test.loc[:, ["diagnosis"]]
y_test

from sklearn.neural_network import MLPClassifier
model = MLPClassifier()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracies = cross_val_score(model, X_train, y_train , cv = 5, scoring = "accuracy") #we can change the cv to split into as many chunks as we want
nump.mean(accuracies)
print(f"MLP Classifier cross val accuracy on predicting stage of pancreatic cancer :{nump.mean(accuracies)*100}")
print(f"MLP accuracy :{accuracy*100}")
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings

warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

df = pd.read_excel('ovarian.xlsx')

# Drop columns and handle missing values
columns_to_drop = ['SUBJECT_ID']
df.drop(columns=columns_to_drop, inplace=True)
df.dropna(axis=1, inplace=True)

# Split data into training and testing sets
df_train, df_test = train_test_split(df, test_size=0.2)

# Separate features and target variable
X_train, y_train = df_train.drop(columns=["TYPE"]), df_train["TYPE"]
X_test, y_test = df_test.drop(columns=["TYPE"]), df_test["TYPE"]

# Define the MLPClassifier model
model = MLPClassifier()

# Define hyperparameters grid for tuning
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive'],
}

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Predict on test set with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)

MLP Classifier cross val accuracy on predicting stage of pancreatic cancer :54.65621500559911
MLP accuracy :47.45762711864407
Best Parameters: {'activation': 'relu', 'hidden_layer_sizes': (100, 100), 'learning_rate': 'adaptive', 'solver': 'sgd'}
Best Score: 0.7456493506493506
Test Set Accuracy: 0.7714285714285715


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Step 1: Data Preprocessing
data = pd.read_csv('pancreatic.csv')

# Drop the specified columns
columns_to_drop = ['sample_id', 'patient_cohort', 'sample_origin', 'stage', 'benign_sample_diagnosis']
data.drop(columns=columns_to_drop, inplace=True)

# Handle missing values if any
data.dropna(inplace=True)

# Encoding age using LabelEncoder
age_encoder = LabelEncoder()
data['age_encoded'] = age_encoder.fit_transform(data['age'])

# Encoding sex using OneHotEncoder
if len(data) > 0:  # Check if there are still samples remaining
    sex_encoder = OneHotEncoder(drop='first', sparse=False)
    sex_encoded = sex_encoder.fit_transform(data[['sex']])
    data[['sex_encoded']] = sex_encoded

    # Split data into features (X) and target variable (y)
    X = data[['age_encoded', 'sex_encoded', 'plasma_CA19_9', 'creatinine', 'LYVE1', 'REG1B', 'TFF1', 'REG1A']]
    y = data['diagnosis']

    # Step 3: Split Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 4: Model Selection and Hyperparameter Tuning (Grid Search)
    param_grid = {
        'n_estimators': [50, 100, 150],  # Number of trees in random forest
        'max_depth': [None, 10, 20, 30],  # Maximum number of levels in tree
        'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
        'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required at each leaf node
    }

    RF_model = RandomForestClassifier()
    grid_search = GridSearchCV(estimator=RF_model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Best parameters found
    print("Best parameters found:")
    print(grid_search.best_params_)

    # Step 5: Model Training with Best Parameters
    best_RF_model = grid_search.best_estimator_
    best_RF_model.fit(X_train, y_train)

    # Step 6: Model Evaluation
    y_pred = best_RF_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Random Forest accuracy :{accuracy*100}")

    report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(report)

else:
    print("Not enough data after preprocessing.")


Best parameters found:
{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Random Forest accuracy :78.57142857142857
Classification Report:
              precision    recall  f1-score   support

           1       0.75      0.60      0.67         5
           2       0.67      0.29      0.40         7
           3       0.80      0.93      0.86        30

    accuracy                           0.79        42
   macro avg       0.74      0.61      0.64        42
weighted avg       0.77      0.79      0.76        42



In [7]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
import os
import numpy as nump
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

# Assuming you have imported your model as `model`
# from sklearn or any other library
# from sklearn import load_model
# model = load_model('your_model.h5')

# Sample data provided
sample_data = [
    [33, 0.07, 1.33, 3.81, 2.50, 99.7, 83.8, 0.02, 0.40, 4.08, 0, 0.98, 0.31, 5.89, 138.2, 0.90, 195, 4.55, 14.50, 276.1],
    [50, 0.01, 0.20, 3.29, 2.44, 98.5, 47.0, 0.07, 1.60, 7.21, 1, 0.88, 0.24, 5.40, 142.5, 1.10, 284, 4.64, 12.10, 211.5],
    [28, 0.02, 0.40, 4.26, 2.55, 104.5, 57.0, 0.07, 1.50, 4.70, 0, 0.91, 0.16, 3.50, 142.6, 1.09, 279, 4.44, 12.40, 298.1],
    # Add more sample data as needed
]

# Load your model here

# Function to ask questions and get user inputs
def ask_ovarian_questions():
    age = int(input("Enter your age: "))
    baso_count = float(input("Enter your BASO# count: "))
    baso_percentage = float(input("Enter your BASO%: "))
    bun = float(input("Enter your BUN: "))
    ca = float(input("Enter your Calcium level: "))
    cl = float(input("Enter your Chloride level: "))
    crea = float(input("Enter your Creatinine level: "))
    eo_count = float(input("Enter your EO# count: "))
    eo_percentage = float(input("Enter your EO%: "))
    glu = float(input("Enter your Glucose level: "))
    # Continue with more questions based on your dataset

    # Return user inputs as a list
    return [age, baso_count, baso_percentage, bun, ca, cl, crea, eo_count, eo_percentage, glu]

#function to ask pancriatic questions
def ask_pancriatic_question():
    age_encoder = LabelEncoder()
    sex_encoder = OneHotEncoder()
    
    #['age_encoded', 'sex_encoded', 'plasma_CA19_9', 'creatinine', 'LYVE1', 'REG1B', 'TFF1', 'REG1A']
    age = int(input("Enter you age: "))
    age = np.array([age])  # Convert age to a 1-dimensional array
    age = age_encoder.fit_transform(age)
    sex = input("Enter your sex: M or F")
    sex = np.array([sex]).reshape(-1, 1)  # Reshape to a 2D array
    sex_encoder = OneHotEncoder(drop='first', sparse=False)
    sex_encoded = sex_encoder.fit_transform(sex)
    plasma_CA19_9 = float(input("Enter your plasma_CA19_9 level: "))
    creatine = float(input("Enter your creatine level: "))
    lyve1 = float(input("Enter your LYVE1 level: "))
    reg1b = float(input("Enter your REG1B level: "))
    tff1 = float(input("Enter your TFF1 level: "))
    reg1a = float(input("Enter your REG1A level: "))
    
    return [age, sex_encoded, plasma_CA19_9, creatine, lyve1, reg1b, tff1, reg1a]
# Function to predict cancer based on user inputs
def predict_ovarian_cancer(user_inputs):
    # Convert user inputs to numpy array
    user_inputs = np.array(user_inputs)
    # Reshape user inputs if needed based on your model input shape
    user_inputs = user_inputs.reshape(1, -1)
    predicted_probability = best_model.predict_proba(user_inputs)
    # Use your pre-imported model to make predictions
    # predicted_probability = model.predict(user_inputs)
    return predicted_probability


def predict_pancriatic_cancer(user_inputs):
    # Convert user inputs to numpy array
    user_inputs = np.array(user_inputs)
    # Reshape user inputs if needed based on your model input shape
    user_inputs = user_inputs.reshape(1, -1)
    predicted_probability = best_RF_model.predict_proba(user_inputs)
    # Use your pre-imported model to make predictions
    # predicted_probability = model.predict(user_inputs)
    return predicted_probability


# Main function to run the chatbot
def main():
    print("Welcome to the Cancer Prediction Chatbot!")
    print("Please answer the following questions:")
    print("Which form of cancer are you trying to predict: 1 (Ovarian) or 2 (Pancriatic)")
    cancer_type = input()
    if cancer_type == "1":
        user_inputs = ask_ovarian_questions()
        # Predict cancer based on user inputs
        predicted_probability = predict_ovarian_cancer(user_inputs)
    elif cancer_type == "2":    
        user_inputs = ask_pancriatic_question()
        predicted_probability = predict_pancriatic_cancer(user_inputs)
    # Output prediction
    if predicted_probability > 0.5:
        print("Based on your inputs, there is a high probability that you have ovarian cancer.")
    else:
        print("Based on your inputs, there is a low probability that you have ovarian cancer.")

# Run the main function
if __name__ == "__main__":
    main()


Welcome to the Cancer Prediction Chatbot!
Please answer the following questions:
Which form of cancer are you trying to predict: 1 (Ovarian) or 2 (Pancriatic)
2
Enter you age: 23
Enter your sex: M or FM
Enter your plasma_CA19_9 level: 1
Enter your creatine level: 1
Enter your LYVE1 level: 1
Enter your REG1B level: 1
Enter your TFF1 level: 1
Enter your REG1A level: 1


ValueError: setting an array element with a sequence.