In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from imblearn.over_sampling import SMOTE

In [2]:
data=pd.read_csv('survey_lung cancer.csv')

In [3]:
data

Unnamed: 0,GENDER,AGE,TOBACCO_SMOKING,YELLOW_FINGERS,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,2,1,2,2,1,1,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,F,56,1,1,2,2,2,1,1,2,2,2,2,1,YES
305,M,70,2,1,1,1,2,2,2,2,2,2,1,2,YES
306,M,58,2,1,1,1,1,2,2,2,2,1,1,2,YES
307,M,67,2,1,1,1,2,2,1,2,2,2,1,2,YES


In [4]:
#Checking for Duplicates
data.duplicated().sum()

33

In [5]:
#Removing Duplicates
data=data.drop_duplicates()


In [6]:
data.isnull().sum()

GENDER                   0
AGE                      0
TOBACCO_SMOKING          0
YELLOW_FINGERS           0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

In [7]:
data=data.drop(['PEER_PRESSURE'],axis=1)

In [8]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
data['GENDER']=le.fit_transform(data['GENDER'])
data['TOBACCO_SMOKING']=le.fit_transform(data['TOBACCO_SMOKING'])
data['YELLOW_FINGERS']=le.fit_transform(data['YELLOW_FINGERS'])
data['CHRONIC DISEASE']=le.fit_transform(data['CHRONIC DISEASE'])
data['FATIGUE ']=le.fit_transform(data['FATIGUE '])
data['ALLERGY ']=le.fit_transform(data['ALLERGY '])
data['WHEEZING']=le.fit_transform(data['WHEEZING'])
data['ALCOHOL CONSUMING']=le.fit_transform(data['ALCOHOL CONSUMING'])
data['COUGHING']=le.fit_transform(data['COUGHING'])
data['SHORTNESS OF BREATH']=le.fit_transform(data['SHORTNESS OF BREATH'])
data['SWALLOWING DIFFICULTY']=le.fit_transform(data['SWALLOWING DIFFICULTY'])
data['CHEST PAIN']=le.fit_transform(data['CHEST PAIN'])
data['LUNG_CANCER']=le.fit_transform(data['LUNG_CANCER'])

In [9]:
data

Unnamed: 0,GENDER,AGE,TOBACCO_SMOKING,YELLOW_FINGERS,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,0,1,0,1,0,1,1,1,1,1,1,1
1,1,74,1,0,1,1,1,0,0,0,1,1,1,1
2,0,59,0,0,0,1,0,1,0,1,1,0,1,0
3,1,63,1,1,0,0,0,0,1,0,0,1,1,0
4,0,63,0,1,0,0,0,1,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,0,59,0,1,0,0,1,1,0,1,0,1,0,1
280,0,59,1,0,1,1,1,0,0,0,1,0,0,0
281,1,55,1,0,0,1,1,0,0,0,1,0,1,0
282,1,46,0,1,0,0,0,0,0,0,0,1,1,0


In [10]:
data['LUNG_CANCER'].value_counts()

1    238
0     38
Name: LUNG_CANCER, dtype: int64

In [11]:
#splitting the data into features and targets
X=data.drop('LUNG_CANCER',axis=1)
Y=data['LUNG_CANCER']

In [12]:
#Handling the imbalanced dataset
smote=SMOTE(sampling_strategy='minority',random_state=10)
X,Y=smote.fit_resample(X,Y)

In [13]:
data['LUNG_CANCER'].value_counts()

1    238
0     38
Name: LUNG_CANCER, dtype: int64

In [14]:
data

Unnamed: 0,GENDER,AGE,TOBACCO_SMOKING,YELLOW_FINGERS,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,0,1,0,1,0,1,1,1,1,1,1,1
1,1,74,1,0,1,1,1,0,0,0,1,1,1,1
2,0,59,0,0,0,1,0,1,0,1,1,0,1,0
3,1,63,1,1,0,0,0,0,1,0,0,1,1,0
4,0,63,0,1,0,0,0,1,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,0,59,0,1,0,0,1,1,0,1,0,1,0,1
280,0,59,1,0,1,1,1,0,0,0,1,0,0,0
281,1,55,1,0,0,1,1,0,0,0,1,0,1,0
282,1,46,0,1,0,0,0,0,0,0,0,1,1,0


In [15]:
#splitting the data into train and test
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [16]:
#scaling the data
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [17]:
#train the model
clf=RandomForestClassifier(n_estimators=100,random_state=42)

In [18]:
model=clf.fit(X,Y)

In [19]:
#predicting the target using test data
Y_pred=model.predict(X_test)



In [20]:
class_report=classification_report(Y_test,Y_pred)
print('classification report:',class_report)

classification report:               precision    recall  f1-score   support

           0       0.98      0.96      0.97        52
           1       0.96      0.98      0.97        44

    accuracy                           0.97        96
   macro avg       0.97      0.97      0.97        96
weighted avg       0.97      0.97      0.97        96



In [21]:
# Evaluate the model using accuracy score
acc = accuracy_score(Y_test, Y_pred)
print("Accuracy:", acc)

Accuracy: 0.96875


In [22]:
#using Decisiontree
DT=DecisionTreeClassifier()
DT.fit(X_train,Y_train)

DecisionTreeClassifier()

In [23]:
Y_pred_DT=DT.predict(X_test)

In [24]:
cl_report=classification_report(Y_pred_DT,Y_test)
print(cl_report)

              precision    recall  f1-score   support

           0       0.92      0.94      0.93        51
           1       0.93      0.91      0.92        45

    accuracy                           0.93        96
   macro avg       0.93      0.93      0.93        96
weighted avg       0.93      0.93      0.93        96



In [25]:
# save model
import pickle
with open('modelll.pkl','wb') as f:
    pickle.dump(model,f)