In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv(r'C:\Users\dell\Desktop\MyDocs\Docs\MK\cancer issue.csv')
df = data.copy()
df.head()

Unnamed: 0,PatientID,Age,Gender,Race/Ethnicity,BMI,SmokingStatus,FamilyHistory,CancerType,Stage,TumorSize,TreatmentType,TreatmentResponse,SurvivalMonths,Recurrence,GeneticMarker,HospitalRegion
0,1,80,Female,Other,23.3,Smoker,Yes,Breast,II,1.7,Combination Therapy,No Response,103,Yes,,South
1,2,76,Male,Caucasian,22.4,Former Smoker,Yes,Colon,IV,4.7,Surgery,No Response,14,Yes,BRCA1,West
2,3,69,Male,Asian,21.5,Smoker,Yes,Breast,III,8.3,Combination Therapy,Complete Remission,61,Yes,BRCA1,West
3,4,77,Male,Asian,30.4,Former Smoker,Yes,Prostate,II,1.7,Radiation,Partial Remission,64,No,KRAS,South
4,5,89,Male,Caucasian,20.9,Smoker,Yes,Lung,IV,7.4,Radiation,No Response,82,Yes,KRAS,South


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17686 entries, 0 to 17685
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PatientID          17686 non-null  int64  
 1   Age                17686 non-null  int64  
 2   Gender             17686 non-null  object 
 3   Race/Ethnicity     17686 non-null  object 
 4   BMI                17686 non-null  float64
 5   SmokingStatus      17686 non-null  object 
 6   FamilyHistory      17686 non-null  object 
 7   CancerType         17686 non-null  object 
 8   Stage              17686 non-null  object 
 9   TumorSize          17686 non-null  float64
 10  TreatmentType      17686 non-null  object 
 11  TreatmentResponse  17686 non-null  object 
 12  SurvivalMonths     17686 non-null  int64  
 13  Recurrence         17686 non-null  object 
 14  GeneticMarker      13360 non-null  object 
 15  HospitalRegion     17686 non-null  object 
dtypes: float64(2), int64(3

In [4]:
df.isnull().sum()

PatientID               0
Age                     0
Gender                  0
Race/Ethnicity          0
BMI                     0
SmokingStatus           0
FamilyHistory           0
CancerType              0
Stage                   0
TumorSize               0
TreatmentType           0
TreatmentResponse       0
SurvivalMonths          0
Recurrence              0
GeneticMarker        4326
HospitalRegion          0
dtype: int64

In [5]:
df[df['GeneticMarker'].isnull()].groupby(['CancerType', 'Stage']).size()

CancerType  Stage
Breast      I        196
            II       177
            III      177
            IV       152
Colon       I        191
            II       197
            III      189
            IV       172
Leukemia    I        192
            II       170
            III      158
            IV       168
Lung        I        186
            II       172
            III      209
            IV       183
Prostate    I        174
            II       208
            III      166
            IV       173
Skin        I        187
            II       165
            III      188
            IV       176
dtype: int64

In [6]:
df['GeneticMarker'] = df.groupby(['CancerType', 'Stage'])['GeneticMarker'].transform(
    lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Unknown')
)

In [7]:
df['GeneticMarker'].isnull().sum()

0

In [8]:
df = df.drop(columns=['PatientID'])

In [9]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,17686.0,53.758396,21.079473,18.0,35.0,54.0,72.0,90.0
BMI,17686.0,29.253805,6.203575,18.5,23.9,29.2,34.6,40.0
TumorSize,17686.0,5.499751,2.603107,1.0,3.3,5.5,7.7,10.0
SurvivalMonths,17686.0,60.387821,34.794859,1.0,30.0,60.0,91.0,120.0


In [10]:
cancer_type_counts = df['CancerType'].value_counts().reset_index()
cancer_type_counts.columns = ['CancerType', 'Count']  

fig = px.bar(
    cancer_type_counts,
    x='CancerType',
    y='Count',
    labels={'CancerType': 'Cancer Type', 'Count': 'Count'},
    title='Distribution of Cancer Types',
    color='CancerType',
    color_discrete_sequence=px.colors.qualitative.T10
)
fig.update_layout(xaxis_title='Cancer Type', yaxis_title='Count')
fig.show(renderer='iframe')

In [11]:
fig = px.histogram(
    df,
    x='Stage',
    color='CancerType',
    barmode='group',
    title='Cancer Stages by Type',
    labels={'Stage': 'Stage', 'count': 'Count'},
    color_discrete_sequence=px.colors.qualitative.T10
)
fig.update_layout(xaxis_title='Stage', yaxis_title='Count')
fig.show(renderer='iframe')

In [12]:
fig = px.box(
    df,
    x='CancerType',
    y='BMI',
    title='BMI Distribution by Cancer Type',
    labels={'CancerType': 'Cancer Type', 'BMI': 'BMI'},
    color='CancerType',
    color_discrete_sequence=px.colors.qualitative.T10
)
fig.update_layout(xaxis_title='Cancer Type', yaxis_title='BMI')
fig.show(renderer='iframe')

In [13]:
# TreatmentType ve CancerType sütunlarına göre gruplandırıp sayım yap
treatment_counts = df.groupby(['CancerType', 'TreatmentType']).size().reset_index(name='Count')

fig = px.bar(
    treatment_counts,
    x='CancerType',
    y='Count',
    color='TreatmentType',
    title='Treatment Types by Cancer Type',
    labels={'CancerType': 'Cancer Type', 'Count': 'Count', 'TreatmentType': 'Treatment Type'},
    barmode='stack',
    color_discrete_sequence=px.colors.qualitative.T10
)
fig.update_layout(xaxis_title='Cancer Type', yaxis_title='Count')
fig.show(renderer='iframe')

In [14]:
fig = px.box(
    df,
    x='CancerType',
    y='Age',
    title='Age Distribution by Cancer Type',
    labels={'CancerType': 'Cancer Type', 'Age': 'Age'},
    color='CancerType',
    color_discrete_sequence=px.colors.qualitative.T10
)
fig.update_layout(xaxis_title='Cancer Type', yaxis_title='Age')
fig.show(renderer='iframe')

In [15]:
fig = px.scatter(
    df,
    x='Age',
    y='BMI',
    title='BMI vs Age',
    labels={'Age': 'Age', 'BMI': 'Body Mass Index (BMI)'},
    color='CancerType',
    color_discrete_sequence=px.colors.qualitative.T10
)
fig.update_layout(xaxis_title='Age', yaxis_title='BMI')
fig.show(renderer='iframe')

In [16]:
fig = px.violin(
    df,
    x='Stage',
    y='GeneticMarker',
    box=True,
    title='Genetic Marker Distribution by Stage',
    labels={'Stage': 'Stage', 'GeneticMarker': 'Genetic Marker'},
    color='Stage',
    color_discrete_sequence=px.colors.qualitative.T10
)
fig.update_layout(xaxis_title='Stage', yaxis_title='Genetic Marker')
fig.show(renderer='iframe')

In [17]:
label_encoders = {}


categorical_columns = df.select_dtypes(include=['object']).columns


for column in categorical_columns:
   
    original_values = df[column].unique()
    
 
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])
    
    
    encoded_values = df[column].unique()
    
    decoded_values = label_encoders[column].inverse_transform(encoded_values)
    

    print(f"\n{column} Encoding and Decoding:")
    for enc, dec in zip(encoded_values, decoded_values):
        print(f"{enc} -> {dec}")


Gender Encoding and Decoding:
0 -> Female
1 -> Male

Race/Ethnicity Encoding and Decoding:
4 -> Other
2 -> Caucasian
1 -> Asian
3 -> Hispanic
0 -> African American

SmokingStatus Encoding and Decoding:
2 -> Smoker
0 -> Former Smoker
1 -> Non-Smoker

FamilyHistory Encoding and Decoding:
1 -> Yes
0 -> No

CancerType Encoding and Decoding:
0 -> Breast
1 -> Colon
4 -> Prostate
3 -> Lung
2 -> Leukemia
5 -> Skin

Stage Encoding and Decoding:
1 -> II
3 -> IV
2 -> III
0 -> I

TreatmentType Encoding and Decoding:
1 -> Combination Therapy
3 -> Surgery
2 -> Radiation
0 -> Chemotherapy

TreatmentResponse Encoding and Decoding:
1 -> No Response
0 -> Complete Remission
2 -> Partial Remission

Recurrence Encoding and Decoding:
1 -> Yes
0 -> No

GeneticMarker Encoding and Decoding:
2 -> KRAS
0 -> BRCA1
1 -> EGFR

HospitalRegion Encoding and Decoding:
2 -> South
3 -> West
1 -> North
0 -> East


In [18]:
X = df.drop(['Recurrence'], axis=1)  
y = df['Recurrence']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [20]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)


rf_model.fit(X_train, y_train)


y_pred = rf_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.5037693177534867
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.56      0.53      2615
           1       0.51      0.45      0.48      2691

    accuracy                           0.50      5306
   macro avg       0.50      0.50      0.50      5306
weighted avg       0.50      0.50      0.50      5306

Confusion Matrix:
[[1459 1156]
 [1477 1214]]


In [21]:
lr_model = LogisticRegression(max_iter=1000, random_state=42)


lr_model.fit(X_train, y_train)


y_pred_lr = lr_model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:")
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))

Accuracy: 0.49660761402186204
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.61      0.55      2615
           1       0.50      0.38      0.43      2691

    accuracy                           0.50      5306
   macro avg       0.50      0.50      0.49      5306
weighted avg       0.50      0.50      0.49      5306

Confusion Matrix:
[[1608 1007]
 [1664 1027]]


In [22]:
dt_model = DecisionTreeClassifier(random_state=42)


dt_model.fit(X_train, y_train)


y_pred_dt = dt_model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))

Accuracy: 0.5049001130795326
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.51      0.50      2615
           1       0.51      0.50      0.51      2691

    accuracy                           0.50      5306
   macro avg       0.50      0.50      0.50      5306
weighted avg       0.51      0.50      0.50      5306

Confusion Matrix:
[[1338 1277]
 [1350 1341]]


In [23]:
knn_model = KNeighborsClassifier()


knn_model.fit(X_train, y_train)


y_pred_knn = knn_model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Classification Report:")
print(classification_report(y_test, y_pred_knn))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))

Accuracy: 0.5058424425179042
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.50      0.50      2615
           1       0.51      0.51      0.51      2691

    accuracy                           0.51      5306
   macro avg       0.51      0.51      0.51      5306
weighted avg       0.51      0.51      0.51      5306

Confusion Matrix:
[[1320 1295]
 [1327 1364]]


In [24]:
models = ['Random Forest', 'Logistic Regression', 'Decision Tree','KNN']
accuracies = [accuracy_score(y_test, rf_model.predict(X_test)),
              accuracy_score(y_test, lr_model.predict(X_test)),
              accuracy_score(y_test, dt_model.predict(X_test)),
              accuracy_score(y_test, knn_model.predict(X_test)),
             ]


model_comparison = pd.DataFrame({
    'Model': models,
    'Accuracy': accuracies
})

print(model_comparison)

                 Model  Accuracy
0        Random Forest  0.503769
1  Logistic Regression  0.496608
2        Decision Tree  0.504900
3                  KNN  0.505842
