# 1. Pre Processing

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [3]:
# Load the dataset
data = pd.read_csv('/Users/khalidhameed/Downloads/survey lung cancer.csv')

In [4]:
# Step 1: Check for missing values
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64


In [5]:
# Step 2: Encoding categorical variables
# Initialize LabelEncoder
label_encoder = LabelEncoder()

In [6]:
# Encode 'GENDER' and 'LUNG_CANCER' columns
data['GENDER'] = label_encoder.fit_transform(data['GENDER'])
data['LUNG_CANCER'] = label_encoder.fit_transform(data['LUNG_CANCER'])

In [7]:
# Step 3: Scaling numerical features
# Initialize StandardScaler
scaler = StandardScaler()

In [8]:

# Scale the 'AGE' column
data['AGE'] = scaler.fit_transform(data[['AGE']])

In [9]:
# Display the first few rows of the processed data
print("Processed Data Sample:\n", data.head())

Processed Data Sample:
    GENDER       AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0       1  0.771850        1               2        2              1   
1       1  1.381829        2               1        1              1   
2       0 -0.448107        1               1        1              2   
3       1  0.039876        2               2        2              1   
4       0  0.039876        1               2        1              1   

   CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  COUGHING  \
0                1         2         1         2                  2         2   
1                2         2         2         1                  1         1   
2                1         2         1         2                  1         2   
3                1         1         1         1                  2         1   
4                1         1         1         2                  1         2   

   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAI

# 2. Apply any unsupervised learning techniques

In [10]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score

### K-Means Clustering

In [11]:
# Step 1: K-Means Clustering
# Define K-Means with 3 clusters (arbitrary for now; can be adjusted based on validation)
kmeans = KMeans(n_clusters=6, random_state=0)
data['KMeans_Cluster'] = kmeans.fit_predict(data.drop(columns=['LUNG_CANCER']))

### Hierarchical Clustering

In [12]:
# Step 2: Hierarchical Clustering
# Define Agglomerative Clustering with 3 clusters
hierarchical = AgglomerativeClustering(n_clusters=6)
data['Hierarchical_Cluster'] = hierarchical.fit_predict(data.drop(columns=['LUNG_CANCER']))

In [13]:
# Step 3: Clustering Validation
# Calculate Silhouette Scores
silhouette_kmeans = silhouette_score(data.drop(columns=['KMeans_Cluster', 'Hierarchical_Cluster', 'LUNG_CANCER']), data['KMeans_Cluster'])
silhouette_hierarchical = silhouette_score(data.drop(columns=['KMeans_Cluster', 'Hierarchical_Cluster', 'LUNG_CANCER']), data['Hierarchical_Cluster'])

In [14]:
# Calculate Davies-Bouldin Scores
davies_bouldin_kmeans = davies_bouldin_score(data.drop(columns=['KMeans_Cluster', 'Hierarchical_Cluster', 'LUNG_CANCER']), data['KMeans_Cluster'])
davies_bouldin_hierarchical = davies_bouldin_score(data.drop(columns=['KMeans_Cluster', 'Hierarchical_Cluster', 'LUNG_CANCER']), data['Hierarchical_Cluster'])


In [15]:
# Display results
print("Clustering Validation Results:")
print(f"Silhouette Score (K-Means): {silhouette_kmeans}")
print(f"Silhouette Score (Hierarchical): {silhouette_hierarchical}")
print(f"Davies-Bouldin Index (K-Means): {davies_bouldin_kmeans}")
print(f"Davies-Bouldin Index (Hierarchical): {davies_bouldin_hierarchical}")

Clustering Validation Results:
Silhouette Score (K-Means): 0.13213241195703554
Silhouette Score (Hierarchical): 0.1254105322587935
Davies-Bouldin Index (K-Means): 1.9635057163176286
Davies-Bouldin Index (Hierarchical): 2.0098360991400503


# 3. Feature Selection Techniques

In [16]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2

In [17]:

# Separate features and target variable
X = data.drop(columns=['LUNG_CANCER', 'KMeans_Cluster', 'Hierarchical_Cluster'])
y = data['LUNG_CANCER']

### Applying PCA

In [18]:
# Step 1: Apply PCA
# PCA requires scaled data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [19]:
# Define PCA with the goal of retaining 95% of the variance
pca = PCA(n_components=0.95, random_state=0)
X_pca = pca.fit_transform(X_scaled)

In [20]:
# Get the number of components selected by PCA to reach 95% variance
pca_components = X_pca.shape[1]
print(f"PCA Number of Components: {pca_components}")

PCA Number of Components: 13


### Applying Chi-Square

In [21]:
# Step 2: Apply Chi-Square Test
# Use MinMaxScaler to ensure non-negative values for Chi-Square
X_non_negative = MinMaxScaler().fit_transform(X)

In [22]:
# Apply Chi-Square to select top 5 features
chi2_selector = SelectKBest(chi2, k=5)
X_chi2 = chi2_selector.fit_transform(X_non_negative, y)

In [23]:
# Get selected feature names
chi2_selected_features = X.columns[chi2_selector.get_support(indices=True)]
print("Chi-Square Selected Features:", chi2_selected_features.tolist())

Chi-Square Selected Features: ['ALLERGY ', 'WHEEZING', 'ALCOHOL CONSUMING', 'COUGHING', 'SWALLOWING DIFFICULTY']


# 4. supervised learning classifiers

In [24]:
pip install xgboost


Note: you may need to restart the kernel to use updated packages.


In [27]:
# Define feature set and target variable
X_supervised = data.drop(columns=['LUNG_CANCER'])  # Drop the target column to create X
y = data['LUNG_CANCER']  # Define the target variable

# Impute missing values with mean (for numerical) and mode (for categorical)
X_imputed = X_supervised.apply(lambda col: col.fillna(col.mean()) if col.dtype != 'object' else col.fillna(col.mode()[0]))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.3, random_state=0)


### Logistic regression 

In [28]:
log_reg = LogisticRegression(max_iter=1000, random_state=0)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
print("\nLogistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("Precision:", precision_score(y_test, y_pred_log_reg))
print("Recall:", recall_score(y_test, y_pred_log_reg))
print("F1-Score:", f1_score(y_test, y_pred_log_reg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_test, y_pred_log_reg))


Logistic Regression
Accuracy: 0.8924731182795699
Precision: 0.8953488372093024
Recall: 0.9871794871794872
F1-Score: 0.9390243902439024
Confusion Matrix:
 [[ 6  9]
 [ 1 77]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.40      0.55        15
           1       0.90      0.99      0.94        78

    accuracy                           0.89        93
   macro avg       0.88      0.69      0.74        93
weighted avg       0.89      0.89      0.88        93



### Random Forest 

In [29]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("\nRandom Forest")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1-Score:", f1_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest
Accuracy: 0.8494623655913979
Precision: 0.8809523809523809
Recall: 0.9487179487179487
F1-Score: 0.9135802469135802
Confusion Matrix:
 [[ 5 10]
 [ 4 74]]
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.33      0.42        15
           1       0.88      0.95      0.91        78

    accuracy                           0.85        93
   macro avg       0.72      0.64      0.67        93
weighted avg       0.83      0.85      0.83        93



### Support Vector Machine

In [30]:
svm = SVC(probability=True, random_state=0)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("\nSupport Vector Machine (SVM)")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Precision:", precision_score(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))
print("F1-Score:", f1_score(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))


Support Vector Machine (SVM)
Accuracy: 0.8494623655913979
Precision: 0.8478260869565217
Recall: 1.0
F1-Score: 0.9176470588235294
Confusion Matrix:
 [[ 1 14]
 [ 0 78]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.07      0.12        15
           1       0.85      1.00      0.92        78

    accuracy                           0.85        93
   macro avg       0.92      0.53      0.52        93
weighted avg       0.87      0.85      0.79        93



### K-Nearest Neighbors

In [31]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
print("\nK-Nearest Neighbors (KNN)")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Precision:", precision_score(y_test, y_pred_knn))
print("Recall:", recall_score(y_test, y_pred_knn))
print("F1-Score:", f1_score(y_test, y_pred_knn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))


K-Nearest Neighbors (KNN)
Accuracy: 0.8279569892473119
Precision: 0.8875
Recall: 0.9102564102564102
F1-Score: 0.8987341772151899
Confusion Matrix:
 [[ 6  9]
 [ 7 71]]
Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.40      0.43        15
           1       0.89      0.91      0.90        78

    accuracy                           0.83        93
   macro avg       0.67      0.66      0.66        93
weighted avg       0.82      0.83      0.82        93



### XGBoost

In [32]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=0)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("\nXGBoost")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Precision:", precision_score(y_test, y_pred_xgb))
print("Recall:", recall_score(y_test, y_pred_xgb))
print("F1-Score:", f1_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))


XGBoost
Accuracy: 0.8709677419354839
Precision: 0.9024390243902439
Recall: 0.9487179487179487
F1-Score: 0.925
Confusion Matrix:
 [[ 7  8]
 [ 4 74]]
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.47      0.54        15
           1       0.90      0.95      0.93        78

    accuracy                           0.87        93
   macro avg       0.77      0.71      0.73        93
weighted avg       0.86      0.87      0.86        93



Parameters: { "use_label_encoder" } are not used.

