In [1]:
#import library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from queue import PriorityQueue
from collections import defaultdict
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Data preprocessing
# Load the processed survey dataset and one-hot encode it with pd.get_dummies
data = pd.get_dummies(
    pd.read_csv('/kaggle/input/survey-lung-cancer/survey lung cancer.csv')
)

# 'LUNG_CANCER_YES' is the target; both LUNG_CANCER_* indicator columns are
# excluded from the feature set
lung = data['LUNG_CANCER_YES']
results_remove = data.drop(columns=['LUNG_CANCER_NO', 'LUNG_CANCER_YES'])

# Split the dataset into training and testing sets (30% held out, fixed seed)
train_X, test_X, train_Y, test_Y = train_test_split(
    results_remove,
    lung,
    test_size=0.3,
    random_state=42,
)

In [3]:
# Information entropy: the impurity measure used to score candidate splits
def Entropy(L):
    """Return the Shannon entropy, in bits, of the labels in ``L``.

    ``L`` is any sized array-like accepted by ``np.unique``. The relative
    frequency of each distinct label is used as its probability.
    """
    _, label_counts = np.unique(L, return_counts=True)
    probabilities = label_counts / len(L)
    return -(probabilities * np.log2(probabilities)).sum()

In [4]:
# Information gain of a split: parent entropy minus the size-weighted entropy
# of the child collections
def Information_Gain(set1, subsets):
    """Return the information gain of splitting ``set1`` into ``subsets``.

    Parameters
    ----------
    set1 : array-like
        Labels of the parent collection.
    subsets : iterable of array-like
        Label collections of the children produced by the split.

    Returns
    -------
    float
        ``Entropy(set1)`` minus the sum over all children of
        ``len(child) / total * Entropy(child)``, where ``total`` is the
        combined size of the children. When the children hold no labels at
        all, the parent entropy is returned unchanged.
    """
    # Materialize once: the children are traversed twice (sizes, then entropies)
    subsets = list(subsets)
    parent1 = Entropy(set1)

    totalnum = sum(len(subset) for subset in subsets)
    if totalnum == 0:
        # No labels in any child: nothing to subtract, and no division by zero
        return parent1

    subsets_E = 0
    for subset in subsets:
        if len(subset) == 0:
            # An empty child has zero weight and contributes nothing
            continue
        weight = len(subset) / totalnum
        subsets_E += weight * Entropy(subset)
    return parent1 - subsets_E

In [5]:
# Information gain of a single feature, used as the score for feature selection
def heuristic(X, Y, feature):
    """Score ``feature`` by the information gain it yields on the labels ``Y``.

    The labels are grouped by each distinct value of ``X[feature]``; the
    result is ``Entropy(Y)`` minus the size-weighted entropy of those groups.
    """
    label_entropy = Entropy(Y)
    column = X[feature]

    conditional_entropy = 0
    for level in column.unique():
        labels_at_level = Y[column == level]
        conditional_entropy += len(labels_at_level) / len(Y) * Entropy(labels_at_level)

    return label_entropy - conditional_entropy

In [6]:
# Greedy best-first feature selection driven by the information-gain heuristic
# Selection of the most valuable features for the prediction of the target variable
def feature_search(X, Y, max_NUM=10):
    """Select up to ``max_NUM`` columns of ``X`` with the highest heuristic score.

    Parameters
    ----------
    X : DataFrame-like
        Feature table; every entry of ``X.columns`` is a candidate.
    Y : array-like
        Target labels passed to ``heuristic`` together with ``X``.
    max_NUM : int, default 10
        Upper bound on the number of features returned.

    Returns
    -------
    select_features : list
        Chosen column names, best score first. Columns with equal scores
        keep their order in ``X.columns``.
    hscores : dict
        Score of every candidate column, keyed by column name in column
        order. Empty when no selection round can run (no columns, or
        ``max_NUM`` not greater than zero).

    Notes
    -----
    ``heuristic(X, Y, feature)`` does not depend on the features picked so
    far, so each column is scored exactly once rather than once per selection
    round; the greedy pick order is then simply the descending score order.
    """
    select_features = []
    # Store heuristic scores for each feature
    hscores = {}
    candidates = list(X.columns)

    # No selection round would run, so nothing is scored either
    if not candidates or not (len(select_features) < max_NUM):
        return select_features, hscores

    for feature in candidates:
        hscores[feature] = heuristic(X, Y, feature)

    # A score has to beat -inf to be selectable. This also filters out NaN
    # scores, which would otherwise make the sort order undefined.
    eligible = [feature for feature in candidates if hscores[feature] > -np.inf]

    # sorted() is stable, including with reverse=True, so ties keep column order
    ranked = sorted(eligible, key=hscores.get, reverse=True)

    for feature in ranked:
        if not (len(select_features) < max_NUM):
            break
        select_features.append(feature)
    return select_features, hscores

# Score the training-split features and keep the best ones (max_NUM defaults to 10);
# hscores holds the score of every candidate column
select_features, hscores = feature_search(train_X, train_Y)

In [7]:
# Report the score of every candidate feature, then the selected subset
print("\nHeuristic Scores of all features:")
for feature in hscores:
    score = hscores[feature]
    print(f"Feature: {feature}, Heuristic Score: {score}")

print("Selected Features:", select_features)


Heuristic Scores of all features:
Feature: AGE, Heuristic Score: 0.12428794613327071
Feature: SMOKING, Heuristic Score: 0.0035433753074698426
Feature: YELLOW_FINGERS, Heuristic Score: 0.025725018307391934
Feature: ANXIETY, Heuristic Score: 0.013166615362250944
Feature: PEER_PRESSURE, Heuristic Score: 0.018017177185530642
Feature: CHRONIC DISEASE, Heuristic Score: 0.015498586756286237
Feature: FATIGUE , Heuristic Score: 0.029308568277442548
Feature: ALLERGY , Heuristic Score: 0.0909855403564831
Feature: WHEEZING, Heuristic Score: 0.044045908235476605
Feature: ALCOHOL CONSUMING, Heuristic Score: 0.06471857111480639
Feature: COUGHING, Heuristic Score: 0.044088201324953946
Feature: SHORTNESS OF BREATH, Heuristic Score: 0.010875877587066984
Feature: SWALLOWING DIFFICULTY, Heuristic Score: 0.05721567388839843
Feature: CHEST PAIN, Heuristic Score: 0.020827781789933053
Feature: GENDER_F, Heuristic Score: 0.0003879059472988722
Feature: GENDER_M, Heuristic Score: 0.0003879059472988722
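Selected Features: ['AGE', 'ALLERGY ', 'ALCOHOL CONSUMING', 'SWALLOWING DIFFICULTY', 'COUGHING', 'WHEEZING', 'FATIGUE ', 'YELLOW_FINGERS', 'CHEST PAIN', 'PEER_PRESSURE']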

In [8]:
# Identification of target variables and selected characteristics
# The append is guarded so that re-running this cell cannot add the target a
# second time; a duplicate would make data['LUNG_CANCER_YES'] select two columns.
if 'LUNG_CANCER_YES' not in select_features:
    select_features.append('LUNG_CANCER_YES')
data = data[select_features]

# Define target and characteristic variables
Y = data['LUNG_CANCER_YES']
X = data.drop('LUNG_CANCER_YES', axis=1)

# Split the dataset into training and testing sets
# (same test_size and random_state as the split used for feature scoring)
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.3, random_state=42)

In [9]:
# Fit a 100-tree random forest (fixed seed) on the selected training features
RF = RandomForestClassifier(n_estimators=100, random_state=42).fit(train_X, train_Y)

# Accumulator for the impurity-based (Gini) importance, one slot per column of X
Importance = np.zeros(len(X.columns))

In [10]:
# Impurity-based feature importance, accumulated over the trees of the forest
# Each tree is independent of the other
# The accumulator is reset here so that re-running this cell on its own does
# not add onto values left behind (and already normalized) by an earlier run.
Importance = np.zeros(X.shape[1])

for tree in RF.estimators_:
    fitted = tree.tree_
    important_tree = np.zeros(X.shape[1])

    # Iterate through each node of the tree, compute its impurity decrease and accumulate
    for i in range(fitted.node_count):
        index = fitted.feature[i]
        if index == -2:
            # -2 marks a node without a split feature: nothing to credit
            continue

        left = fitted.children_left[i]
        right = fitted.children_right[i]

        # Impurity decrease of the split: weighted impurity of the node minus
        # the weighted impurity of each existing child (negative id = no child)
        decrease = fitted.weighted_n_node_samples[i] * fitted.impurity[i]
        if left >= 0:
            decrease -= fitted.weighted_n_node_samples[left] * fitted.impurity[left]
        if right >= 0:
            decrease -= fitted.weighted_n_node_samples[right] * fitted.impurity[right]

        important_tree[index] += decrease

    # Normalize within the tree so every tree contributes the same total weight;
    # a tree with zero total decrease is left as all zeros
    sum_tree = np.sum(important_tree)
    if sum_tree != 0:
        important_tree /= sum_tree

    Importance += important_tree

# Rescale the accumulated importances so that they sum to 1
# (left untouched when the total is zero)
sum_importance = Importance.sum()
if sum_importance != 0:
    Importance /= sum_importance

# Print the normalized importance next to each selected feature name
print("Feature Importance:")
for position, feature in enumerate(X.columns):
    impo = Importance[position]
    print(f"Feature: {feature}, Importance: {impo}")

Feature Importance:
Feature: AGE, Importance: 0.316032201744364
Feature: ALLERGY , Importance: 0.10632711871157229
Feature: ALCOHOL CONSUMING, Importance: 0.09178541933725666
Feature: SWALLOWING DIFFICULTY, Importance: 0.08657481382996672
Feature: COUGHING, Importance: 0.06227888855763088
Feature: WHEEZING, Importance: 0.05097122031815375
Feature: FATIGUE , Importance: 0.08591277408417752
Feature: YELLOW_FINGERS, Importance: 0.06931090403884421
Feature: CHEST PAIN, Importance: 0.05736205547578245
Feature: PEER_PRESSURE, Importance: 0.07344460390225134


In [11]:
# Evaluate the fitted forest on the held-out test split
predict1 = RF.predict(test_X)
ACC = accuracy_score(y_true=test_Y, y_pred=predict1)
print(f"Accuracy: {ACC}")

# Per-class precision / recall / F1 report (zero_division=0 is passed through)
PRE = classification_report(y_true=test_Y, y_pred=predict1, zero_division=0)
print("Classification Report:", PRE, sep="\n")

Accuracy: 0.956989247311828
Classification Report:
              precision    recall  f1-score   support

       False       0.71      0.71      0.71         7
        True       0.98      0.98      0.98        86

    accuracy                           0.96        93
   macro avg       0.85      0.85      0.85        93
weighted avg       0.96      0.96      0.96        93

