# Cancer

## IMPORTING PACKAGES 📦📦

In [118]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

## DATA OVERVIEWS 🧐📊

In [119]:
data_df = pd.read_csv('data.csv')

In [120]:
data_df.head(2)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES


In [121]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [122]:
column_names = data_df.columns
column_names

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'LUNG_CANCER'],
      dtype='object')

In [123]:
data_df.describe()

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
count,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0
mean,62.673139,1.563107,1.569579,1.498382,1.501618,1.504854,1.673139,1.556634,1.556634,1.556634,1.579288,1.640777,1.469256,1.556634
std,8.210301,0.496806,0.495938,0.500808,0.500808,0.500787,0.469827,0.497588,0.497588,0.497588,0.494474,0.480551,0.499863,0.497588
min,21.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,57.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,62.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0
75%,69.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
max,87.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


## ANALYZING AND TRANSFORMING THE DATA 🔄🔍

In [124]:
data_df.drop(columns=["ANXIETY"],  inplace=True)
data_df.head(2)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,2,2,2,1,1,1,2,2,2,YES


In [125]:
# rename LUNG_CANCER to target column with yes to 1 and no to 0
data_df.rename(columns={"LUNG_CANCER": "target"}, inplace=True)
data_df["target"] = data_df["target"].map({"YES": 1, "NO": 0})
data_df.head(2)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,target
0,M,69,1,2,1,1,2,1,2,2,2,2,2,2,1
1,M,74,2,1,1,2,2,2,1,1,1,2,2,2,1


In [126]:
# converting gender male to 1 and female to 0
data_df["GENDER"] = data_df["GENDER"].map({"M": 1, "F": 0})
data_df["SMOKING"] = data_df["SMOKING"].map({1: 0, 2: 1})
data_df["YELLOW_FINGERS"] = data_df["YELLOW_FINGERS"].map({1: 0, 2: 1})
data_df["PEER_PRESSURE"] = data_df["PEER_PRESSURE"].map({1: 0, 2: 1})

data_df["CHRONIC DISEASE"] = data_df["CHRONIC DISEASE"].map({1: 0, 2: 1})
data_df["FATIGUE"] = data_df["FATIGUE"].map({1: 0, 2: 1})
data_df["ALLERGY"] = data_df["ALLERGY"].map({1: 0, 2: 1})
data_df["WHEEZING"] = data_df["WHEEZING"].map({1: 0, 2: 1})

data_df["ALCOHOL CONSUMING"] = data_df["ALCOHOL CONSUMING"].map({1: 0, 2: 1})
data_df["COUGHING"] = data_df["COUGHING"].map({1: 0, 2: 1})
data_df["SHORTNESS OF BREATH"] = data_df["SHORTNESS OF BREATH"].map({
                                                                    1: 0, 2: 1})
data_df["SWALLOWING DIFFICULTY"] = data_df["SWALLOWING DIFFICULTY"].map({
                                                                        1: 0, 2: 1})
data_df["CHEST PAIN"] = data_df["CHEST PAIN"].map({1: 0, 2: 1})
data_df.head(2)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,target
0,1,69,0,1,0,0,1,0,1,1,1,1,1,1,1
1,1,74,1,0,0,1,1,1,0,0,0,1,1,1,1


## MODELS 🤖📉

In [127]:
x = data_df.drop(columns=['target'])
y = data_df['target']

In [128]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

In [129]:
xgb_classifier = XGBClassifier()
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

In [130]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.967741935483871


In [131]:
# classification_report
print("classification_report")
print(classification_report(y_test, y_pred))

classification_report
              precision    recall  f1-score   support

           0       0.50      0.50      0.50         2
           1       0.98      0.98      0.98        60

    accuracy                           0.97        62
   macro avg       0.74      0.74      0.74        62
weighted avg       0.97      0.97      0.97        62



## FUNCTIONS 🔍

In [132]:
data_df.columns

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'PEER_PRESSURE',
       'CHRONIC DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'target'],
      dtype='object')

In [133]:
# make new csv file with the new data
data_df.to_csv('new_data.csv', index=False)
print("new_data.csv created")

new_data.csv created


In [134]:
data= pd.read_csv('new_data.csv')
data.head(2)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,target
0,1,69,0,1,0,0,1,0,1,1,1,1,1,1,1
1,1,74,1,0,0,1,1,1,0,0,0,1,1,1,1


In [135]:
# showing number of unique values in each column with values what it is
for column in data.columns:
    print(column, data[column].unique())

GENDER [1 0]
AGE [69 74 59 63 75 52 51 68 53 61 72 60 58 48 57 44 64 21 65 55 62 56 67 77
 70 54 49 73 47 71 66 76 78 81 79 38 39 87 46]
SMOKING [0 1]
YELLOW_FINGERS [1 0]
PEER_PRESSURE [0 1]
CHRONIC DISEASE [0 1]
FATIGUE [1 0]
ALLERGY [0 1]
WHEEZING [1 0]
ALCOHOL CONSUMING [1 0]
COUGHING [1 0]
SHORTNESS OF BREATH [1 0]
SWALLOWING DIFFICULTY [1 0]
CHEST PAIN [1 0]
target [1 0]


In [136]:
x = data.drop(columns=['target'])
y = data['target']

In [137]:
model = XGBClassifier()
model.fit(x, y)
print("model created")

model created


In [138]:
def predict_new_data(data_of_new_patient):
    data_of_new_patient = np.array(data_of_new_patient).reshape(1, -1)
    prediction = model.predict(data_of_new_patient)
    return prediction[0]

In [139]:
# Example dummy data
data_of_new_patient = [1, 21, 2, 2, 1, 2, 2, 0, 2, 1, 0, 2, 1, 1]

# Call the predict_new_data function
prediction = predict_new_data(data_of_new_patient)

# Print the prediction
print("Prediction:", prediction)

Prediction: 1
