In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,recall_score, precision_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.linear_model import LogisticRegression

In [2]:
#data

df = pd.read_csv("/kaggle/input/lung-cancer/survey lung cancer.csv")
df =  df.replace((2,1),(1,0))
df = df.replace(("M","F"), (1,0))
df = df.replace(("YES","NO"), (1,0))


#standartdizing 

mx_age_scal = df["AGE"].max()

df["AGE"] = df["AGE"] / mx_age_scal






In [3]:
df[df["LUNG_CANCER"] == 0].shape[0] #without cancer

39

## Building train and test data set

In [4]:
#Let's build balanced test data set (with both classes)

df = df.sort_values(by="LUNG_CANCER").reset_index()


In [5]:
df = df[list(df.columns[2:])]

In [6]:
# test_set = pd.concat((df[:15] ,df[-15:]), axis=0)

# train_set = df[15:-15]



# #preparing for learning

# y_train = np.array(train_set["LUNG_CANCER"])
# y_test = np.array(test_set["LUNG_CANCER"])

# X_train = np.array(train_set.drop("LUNG_CANCER", axis=1))
# X_test = np.array(test_set.drop("LUNG_CANCER", axis=1))

In [7]:
X = np.array(df.drop("LUNG_CANCER", axis=1))

Y = np.array(df["LUNG_CANCER"])

X_train, X_test, y_train, y_test =  train_test_split(X,Y, test_size=0.4, random_state=39)




y_test

array([1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1])

# Building a Model

In [8]:
def evaluator(model, x_test, y_test):
    
    
    pred = model.predict(x_test)
    
    acrs = accuracy_score( y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    
    print("Model for test - ", str(model))
    print("Accuracy = ", acrs)
    print("recall = ", recall)
    print("precision = ", precision)

### 1. Logistic Regression 

In [9]:
logreg = LogisticRegression(penalty="l2")
logreg.fit(X_train,y_train)

evaluator(logreg, X_test, y_test)

Model for test -  LogisticRegression()
Accuracy =  0.8951612903225806
recall =  0.9523809523809523
precision =  0.9259259259259259


## 2. K-NN

In [10]:

knn_classifier = KNeighborsClassifier(n_neighbors=11, weights="distance")
knn_classifier.fit(X_train, y_train)

evaluator(knn_classifier, X_test, y_test)

Model for test -  KNeighborsClassifier(n_neighbors=11, weights='distance')
Accuracy =  0.8951612903225806
recall =  0.9523809523809523
precision =  0.9259259259259259


## 3. Decision Tree and Random forest

In [11]:

np.random.seed(83212) #reproduce results

dec_tree = DecisionTreeClassifier(

            criterion="gini", 
            splitter="random",
            
            class_weight={ 1: 1.2, 0: 3}
)

dec_tree.fit(X_train, y_train)

evaluator(dec_tree, X_test, y_test)

Model for test -  DecisionTreeClassifier(class_weight={0: 3, 1: 1.2}, splitter='random')
Accuracy =  0.8790322580645161
recall =  0.9428571428571428
precision =  0.9166666666666666


In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 3, 5, 10, 20,30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

grid_search = GridSearchCV(DecisionTreeClassifier(class_weight={1: 1.2, 0: 3}), param_grid, cv=5, scoring='recall')
grid_search.fit(X_train, y_train)

best_tree = grid_search.best_estimator_

grid_search.best_score_

0.9575757575757576

In [13]:
evaluator(best_tree, X_test,y_test)

Model for test -  DecisionTreeClassifier(class_weight={0: 3, 1: 1.2}, criterion='entropy',
                       max_depth=20, min_samples_split=10, splitter='random')
Accuracy =  0.8790322580645161
recall =  0.9142857142857143
precision =  0.9411764705882353
