### Lung Cancer

In [34]:
# import modules
import pandas as pd

# train-test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,LabelEncoder

# models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC  # suppport vector machine
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [35]:
df= pd.read_csv('../datasets/lung_cancer.csv')

df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [36]:
df.shape

(309, 16)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [38]:
df.isnull().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

In [39]:
# Encoding
le = LabelEncoder()
df['GENDER_le']=le.fit_transform(df['GENDER'])
df['target']=le.fit_transform(df['LUNG_CANCER'])

df.head()
#M-1,F-0
#YES-1,No-0

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER,GENDER_le,target
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES,1,1
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES,1,1
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO,0,0
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO,1,0
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO,0,0


In [42]:
input_df = df.drop(['GENDER','LUNG_CANCER','target'],axis=1)
input_df.head()

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,GENDER_le
0,69,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,74,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,59,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,63,2,2,2,1,1,1,1,1,2,1,1,2,2,1
4,63,1,2,1,1,1,1,1,2,1,2,2,1,1,0


In [44]:
target_df= df['target']
target_df.head()

0    1
1    1
2    0
3    0
4    0
Name: target, dtype: int32

ML Training

In [45]:
X_train, X_test, y_train, y_test = train_test_split(input_df, target_df, test_size=0.33)

In [48]:
# various model
# various models
models = [('Logistic Reg.',LogisticRegression()),
          ('SVM',SVC()),
          ('Decision Tree',DecisionTreeClassifier()),
          ('Random Forest',RandomForestClassifier())]

In [49]:
results=[]

for name,model in models:

    model.fit(X_train,y_train)
    score = model.score(X_test,y_test)
    results.append((name,score))

results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[('Logistic Reg.', 0.8823529411764706),
 ('SVM', 0.8235294117647058),
 ('Decision Tree', 0.8725490196078431),
 ('Random Forest', 0.8823529411764706)]

In [None]:
# Do Hyperparameter training later

In [51]:
import joblib
model = RandomForestClassifier().fit(X_train,y_train)
joblib.dump(model,'lung_cancer')

['lung_cancer']

In [53]:
mj = joblib.load('lung_cancer')
mj.score(X_test,y_test)

0.8921568627450981

In [54]:
input_df.head()

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,GENDER_le
0,69,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,74,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,59,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,63,2,2,2,1,1,1,1,1,2,1,1,2,2,1
4,63,1,2,1,1,1,1,1,2,1,2,2,1,1,0


In [57]:
AGE=70
SMOKING=1
YELLOW_FINGERS=1
ANXIETY=2
PEER_PRESSURE=1
CHRONIC=1
FATIGUE=2
ALLERGY=2
WHEEZING=1
ALCOHOL=1
COUGH=2
BREATH=1
SWALLOW=1
CHEST=2
GENDER=2

res =[AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC,FATIGUE,ALLERGY,WHEEZING,ALCOHOL,COUGH,BREATH,SWALLOW,CHEST,GENDER]
ans = mj.predict([res])[0]

print("YES") if ans==1 else print("NO")

YES


