In [1]:
%%writefile Final_Project.py
# Importing Libraries
import streamlit as st
from streamlit_option_menu import option_menu
from PIL import Image
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, validation_curve, learning_curve
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import seaborn as sns
import pandas as pd
import numpy as np

# SETTING PAGE CONFIGURATIONS
# Browser-tab icon for the app, loaded from a remote URL.
icon = "https://seeklogo.com/images/B/breast-cancer-care-logo-A97235C6B9-seeklogo.com.png"
page_settings = {
    "page_title": "Breast Cancer Prediction",
    "page_icon": icon,
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**page_settings)

# Banner: a small image followed by a one-line message, rendered as raw HTML.
icon_url = "https://image.pngaaa.com/253/5401253-middle.png"
title = "It is a fact that prevention is better than cure, and the same is true for Cancer"
banner_html = f'<div><img src="{icon_url}" alt="Icon" style="height: 100px; margin-right: 10px;">{title}</div>'
st.markdown(banner_html, unsafe_allow_html=True)

# Main page heading
st.title("Breast Cancer Prediction")

# Reading Data and Knowing the Primaries
df = pd.read_csv(r'cancer.csv')

# Check for missing values in the DataFrame
print(df.isnull().sum())

# Verify the unique values in the 'diagnosis' column
print(df['diagnosis'].unique())

# Data Preprocessing
# Separate features and target ('id' is dropped so it is not used as a feature)
X = df.drop(['id', 'diagnosis'], axis=1)
y = df['diagnosis']

# Check the number of features in the original data
print("Number of features in original data:", X.shape[1])

# Split the data into training and testing sets (done once; the fixed
# random_state keeps the split identical across Streamlit reruns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Scaling: fit on the training split only, then apply the same scaler to the
# test split so no test-set statistics are used during fitting
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Check the number of features in the training data after scaling
print("Number of features in training data after scaling:", X_train_scaled.shape[1])

# Check the number of features in the testing data after scaling
print("Number of features in testing data after scaling:", X_test_scaled.shape[1])

# Imbalance check and Balancing: oversample every class except the majority,
# on the training split only. random_state makes the resampled set (and every
# model trained on it) reproducible from one rerun to the next.
ros = RandomOverSampler(sampling_strategy="not majority", random_state=0)
X_res, y_res = ros.fit_resample(X_train_scaled, y_train)
        

def _encode_diagnosis(labels):
    """Return a new integer array with 'M' mapped to 1 and 'B' mapped to 0.

    The input is copied first, so the caller's object is never modified.
    Values that are already integers pass through unchanged.
    """
    encoded = np.array(labels, dtype=object)
    encoded[encoded == 'M'] = 1
    encoded[encoded == 'B'] = 0
    return encoded.astype(int)


def ensemble_model(X_res, y_res, X_test_scaled, y_test):
    """Train a Random Forest and a Gradient Boosting model and score their vote.

    Parameters
    ----------
    X_res, y_res
        Training features and labels passed to both classifiers' ``fit``.
    X_test_scaled
        Test features passed to both classifiers' ``predict``.
    y_test
        True test labels ('M'/'B' or already 0/1). Not modified.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix, classification_report)`` of the
        ensemble prediction against ``y_test``, with labels encoded as
        1 = 'M' and 0 = 'B'.
    """
    # Ensembling with Random Forest
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_classifier.fit(X_res, y_res)
    y_pred_rf = _encode_diagnosis(rf_classifier.predict(X_test_scaled))

    # Ensembling with Gradient Boosting
    gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=0)
    gb_classifier.fit(X_res, y_res)
    y_pred_gb = _encode_diagnosis(gb_classifier.predict(X_test_scaled))

    # Ensemble prediction using voting. With only two voters a disagreement
    # averages to 0.5, which np.round (round-half-to-even) turns into 0, so
    # the ensemble predicts 1 only when both models predict 1.
    y_pred_ensemble = np.round((y_pred_rf + y_pred_gb) / 2).astype(int)

    # Encode the true labels on a copy so the caller's y_test is left intact
    y_true = _encode_diagnosis(y_test)

    # Evaluate the Ensemble model
    accuracy_ensemble = accuracy_score(y_true, y_pred_ensemble)
    conf_matrix_ensemble = confusion_matrix(y_true, y_pred_ensemble)
    classification_report_ensemble = classification_report(y_true, y_pred_ensemble)

    print(f'Ensemble Accuracy: {accuracy_ensemble}')
    print(f'Ensemble Confusion Matrix:\n{conf_matrix_ensemble}')
    print(f'Ensemble Classification Report:\n{classification_report_ensemble}')

    return accuracy_ensemble, conf_matrix_ensemble, classification_report_ensemble

# Sidebar navigation: `selected` holds the label of the section to render
# OPTION MENUS
with st.sidebar:
    selected = option_menu(
        "Menu",
        ["Home", "Exploratory Data Analysis (EDA)", "Data Preprocessing",
         "Validation Curve", "Learning Curve", "Other Metrics"],
        default_index=0,
        orientation="vertical",
        styles={
            # "center" is the valid CSS keyword; the British spelling "centre"
            # is not recognised by browsers and the rule would be dropped.
            "nav-link": {"font-size": "20px", "text-align": "center", "margin": "0px",
                         "--hover-color": "#FF69B4"},
            "icon": {"font-size": "20px"},
            "container": {"max-width": "4000px"},
            "nav-link-selected": {"background-color": "#FF69B4"},
        },
    )

# Home
if selected == "Home":
    # Title Image
    # NOTE(review): absolute, machine-specific path; the app only runs where
    # this file exists. Consider shipping the image next to the script.
    icon = Image.open('C:/Users/ADMIN/output/Pictures/GITHUB screenshots/breast cancer.PNG')
    st.image(icon, caption=' Breast Cancer ', use_column_width=True)
    # Adjacent string literals are concatenated verbatim, so every fragment
    # after the first starts with a space to keep the sentences apart.
    st.write("With being the most common type of cancer in women, breast cancer accounts for 14% of cancers in Indian women."
             " It is reported that with every four minutes, an Indian woman is diagnosed with breast cancer."
             " A 2018 report of Breast Cancer statistics recorded 1,62,468 new registered cases and 87,090 reported deaths."
             " As the most common cancer type in Indian women, women in their early thirties till fifties are at considerable risk to develop breast cancer, and the incidence risk increases till its peak by the time they reach 50-64 years of age."
             " Breast cancer is a treatable disease and chances of survival are higher if it’s detected in time."
             " The survival rates of breast cancer in India are low because the detection takes place late."
             " The only way to do so is by being aware of how it can be detected and early diagnosis can be done. ")
    st.write("# Prevention is a step away from cure. Be aware, be prepared.")

# EDA Section
elif selected == "Exploratory Data Analysis (EDA)":
        st.set_option('deprecation.showPyplotGlobalUse', False)
        icon= Image.open('C:/Users/ADMIN/output/Pictures/GITHUB screenshots/breast cancer.PNG')
        st.image(icon, caption=' Breast Cancer ', use_column_width=True)
        
        # Custom color palette for the pair plot
        custom_palette = {'B': 'purple', 'M': '#FF6EFF'}
        pairplot=sns.pairplot(df, hue = 'diagnosis', vars = ['radius_mean','texture_mean', 'area_mean', 'perimeter_mean', 'smoothness_mean'], palette=custom_palette, hue_order=['B', 'M'])
        # Set the title on top of the pair plot
        pairplot.fig.suptitle("Pairplot", y=1.02, fontsize=20, color='black')
        st.pyplot()
       
        # Distribution of Diagnosis
        #Reduce the size of the plot
        plt.figure(figsize=(6, 4), dpi=100)
        sns.countplot(data=df, x='diagnosis',palette=custom_palette, hue_order=['B', 'M'])
        plt.title("Distribution of Diagnosis", fontsize=20)
        st.pyplot()

        # Label encode the "diagnosis" column to convert 'M' and 'B' to numeric values
        label_encoder = LabelEncoder()
        df['diagnosis'] = label_encoder.fit_transform(df['diagnosis'])

        # Correlation Heatmap
        plt.figure(figsize=(10, 8), dpi=100)
        sns.heatmap(df.corr(), annot=True, cmap='RdPu', fmt=".2f")
        plt.title("Correlation Heatmap", fontsize=20)
        st.pyplot()
        
# Data Preprocessing    
elif selected == "Data Preprocessing":
        st.set_option('deprecation.showPyplotGlobalUse', False)
        icon= Image.open('C:/Users/ADMIN/output/Pictures/GITHUB screenshots/breast cancer.PNG')
        st.image(icon, caption=' Breast Cancer ', width=900)
        
        # Separate features and target
        X = df.drop(['id', 'diagnosis'], axis=1)
        y = df['diagnosis']
        
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(y)

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

        # Scaling the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Check the unique values in y_train
        #st.write(np.unique(y_train))

        # Convert y_train into a pandas Series
        y_train_series = pd.Series(y_train)

        # Map 'M' to 'Malignant' and 'B' to 'Benign'
        y_train_series = y_train_series.map({1: 'Malignant', 0: 'Benign'})

        # Check the value counts in y_train
        value_counts=y_train_series.value_counts()
        
        st.subheader("Distribution of Diagnosis in Training Set")

        # Visualize the distribution of classes in the training set using a pie chart
        # Custom color palette for the pair plot
        custom_palette = {'B': 'purple', 'M': '#FF6EFF'}
        plt.figure(figsize=(2, 2), dpi=100)
        plt.pie(value_counts,labels=value_counts.index,autopct = '%.2f', colors=custom_palette.values(),)
        st.pyplot()

# Validation Curve Section
elif selected == "Validation Curve":
        st.set_option('deprecation.showPyplotGlobalUse', False)
        icon= Image.open('C:/Users/ADMIN/output/Pictures/GITHUB screenshots/breast cancer.PNG')
        st.image(icon, caption=' Breast Cancer ', width=900)
        # Model Building
        classifier = SVC(kernel="rbf")
        classifier.fit(X_res, y_res)
    
        # Cross Validation
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)
        param_range = np.logspace(-3, 3, 7)
        train_scores, val_scores = validation_curve(classifier, X_res, y_res, param_name="C", param_range=param_range, cv=cv)

        # validation curve plot
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        val_scores_mean = np.mean(val_scores, axis=1)
        val_scores_std = np.std(val_scores, axis=1)
        
        st.subheader("Validation Curve")
        
        plt.figure(figsize=(6, 4))
        plt.plot(param_range, train_scores_mean, label='Training score', color='purple')
        plt.plot(param_range, val_scores_mean, label='Validation score', color='#FF6EFF')
        plt.fill_between(x=param_range, y1=train_scores_mean - train_scores_std, y2=train_scores_mean + train_scores_std, alpha=0.2)
        plt.fill_between(x=param_range, y1=val_scores_mean - val_scores_std, y2=val_scores_mean + val_scores_std, alpha=0.2)

        plt.xscale('log')
        plt.ylim(0.5, 1.1)
        plt.xlabel('Parameter C')
        plt.ylabel('Accuracy')
        plt.legend(loc='best')
        plt.grid(True)
        st.pyplot()
        
        if np.max(val_scores_mean) == 1.0:
            st.write("Overfitting")
        elif np.min(val_scores_mean) <= 0.5:
            st.write("Underfitting")
        else:
            st.write("No significant overfitting or underfitting")

# Learning Curve Section
elif selected == "Learning Curve": 
        st.set_option('deprecation.showPyplotGlobalUse', False)
        icon= Image.open('C:/Users/ADMIN/output/Pictures/GITHUB screenshots/breast cancer.PNG')
        st.image(icon, caption=' Breast Cancer ', width=900)
        # Model Building
        classifier = SVC(kernel="rbf")
        classifier.fit(X_res, y_res)
    
        # Learning Curve
        kfold = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)
        train_sizes, train_scores, val_scores = learning_curve(classifier, X_res, y_res, cv=kfold, train_sizes=np.linspace(0.1, 1.0, 10))

        # Learning Curve Plot
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        val_scores_mean = np.mean(val_scores, axis=1)
        val_scores_std = np.std(val_scores, axis=1)
        
        st.subheader("Learning Curve")
        
        plt.figure(figsize=(6, 4))
        plt.plot(train_sizes, train_scores_mean, label='Training score', color='purple')
        plt.plot(train_sizes, val_scores_mean, label='Validation score', color='#FF6EFF')
        plt.fill_between(x = train_sizes, y1 = train_scores_mean - train_scores_std, y2 = train_scores_mean + train_scores_std, alpha=0.2)
        plt.fill_between(x = train_sizes, y1 = val_scores_mean - val_scores_std, y2 = val_scores_mean + val_scores_std, alpha=0.2)

        plt.xlabel('Training Set Size')
        plt.ylabel('Accuracy')
        plt.ylim(0.5, 1.1)
        plt.title('Learning Curve', fontsize=30)
        plt.legend(loc='best')
        plt.grid(True)
        st.pyplot()

        if np.max(val_scores_mean) == 1.0:
            st.write("Overfitting")
        elif np.min(val_scores_mean) <= 0.5:
            st.write("Underfitting")
        else:
            st.write("No significant overfitting or underfitting")

# Other Metrics Section
elif selected == "Other Metrics":
        icon= Image.open('C:/Users/ADMIN/output/Pictures/GITHUB screenshots/breast cancer.PNG')
        st.image(icon, caption=' Breast Cancer ', width=900)
        # Model Building
        classifier = SVC(kernel="rbf")
        classifier.fit(X_res, y_res)
        
        # Other Accuracy Metrics After Prediction
        y_pred = classifier.predict(X_test_scaled)
        
        st.subheader("Other Metrics")

        st.write(f'\nAccuracy Score: {accuracy_score(y_test, y_pred)}', end = f"\n{'_'*60}\n\n")
        st.write(f'Confusion Matrix:\n\n {confusion_matrix(y_test, y_pred)}', end = f"\n{'_'*60}\n\n")
        st.write(f'Classification Report:\n\n {classification_report(y_test, y_pred)}')

        # Calculate Precision, Recall, and Specificity
        precision = precision_score(y_test, y_pred, pos_label='M')
        recall = recall_score(y_test, y_pred, pos_label='M')
        specificity = recall_score(y_test, y_pred, pos_label='B')

        st.write(f'Precision: {precision}')
        st.write(f'Recall (Sensitivity): {recall}')
        st.write(f'Specificity: {specificity}')

        # Calculate AUC ROC
        y_probs = classifier.decision_function(X_test_scaled)
        roc_auc = roc_auc_score(y_test, y_probs)

        st.write(f'AUC ROC: {roc_auc}')
                 
        # Ensemble_model
        # Call the ensemble_model() function to evaluate the ensemble model
        accuracy_ensemble, conf_matrix_ensemble, classification_report_ensemble = ensemble_model(X_res, y_res, X_test_scaled, y_test)

        # Display Ensemble model results
        st.subheader("Ensemble Model Metrics")
        st.write(f"Ensemble Accuracy: {accuracy_ensemble}")
        st.write("Ensemble Confusion Matrix:")
        st.write(conf_matrix_ensemble)
        st.write("Ensemble Classification Report:")
        st.write(classification_report_ensemble)         

       





Overwriting Final_Project.py


In [2]:
%run Final_Project.py

2023-07-28 12:46:07.002 
  command:

    streamlit run Final_Project.py [ARGUMENTS]
2023-07-28 12:46:07.027 Session state does not function when running a script without `streamlit run`


id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64
['M' 'B']
Number of features in original data: 30
Number of