# Feature Engineering
- Age Category
- BMI Category - Underweight, Normal, Overweight, Obese
- Pollution Risk Score - Location and Air Pollution Level
- Smoking Status Encoding
- Interaction Features
- Location Encoding

In [1]:
# import the libraries needed
import pandas as pd
import numpy as np

In [2]:
# Read the raw COPD dataset into the working frame
raw_csv_path = "../../data/copd_data.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Inspect a random sample of 8 rows; the fixed seed keeps this preview
# identical on every Restart & Run All instead of changing per execution.
df.sample(n=8, random_state=42)

Unnamed: 0,Age,Gender,Smoking_Status,Duration_Of_Smoking,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis
1260,36,Male,Never,0,0,0,1,19.59,Nepalgunj,240,0,0
869,31,Male,Former,6,0,0,1,25.6,Lalitpur,240,0,0
1219,66,Male,Current,49,1,0,0,29.52,Butwal,83,0,1
823,33,Female,Former,8,1,1,1,22.49,Chitwan,165,0,0
682,38,Male,Never,0,0,1,1,23.57,Kathmandu,143,0,0
1764,38,Female,Former,13,0,1,1,21.65,Butwal,166,1,0
261,51,Male,Never,0,1,0,1,25.66,Biratnagar,57,1,0
85,58,Female,Former,33,0,0,0,27.75,Pokhara,95,0,1


In [4]:
# Age categories: decade-wide, right-closed bands (29, 39], (39, 49], ... (79, 89].
# Ages outside 30-89 fall in no band and come out as NaN.
df["Age_Category"] = pd.cut(df["Age"], bins=[29, 39, 49, 59, 69, 79, 89], labels=["30-39", "40-49", "50-59", "60-69", "70-79", "80-89"])

# BMI categories on the standard adult cut-offs:
#   Underweight < 18.5 <= Normal < 25 <= Overweight < 30 <= Obese
# right=False makes every band include its lower edge, so a BMI of exactly
# 18.5 is Normal and exactly 25 is Overweight. The open-ended top band keeps
# BMI values above 35 labelled as Obese instead of silently turning into NaN.
df["BMI_Categories"] = pd.cut(
    df["BMI"],
    bins=[0, 18.5, 25, 30, np.inf],
    right=False,
    labels=["Underweight", "Normal", "Overweight", "Obese"],
)

In [5]:
# Preview the first five rows, now including Age_Category and BMI_Categories
df.head(n=5)

Unnamed: 0,Age,Gender,Smoking_Status,Duration_Of_Smoking,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_Categories
0,49,Female,Current,32,0,1,0,19.75,Butwal,98,1,1,40-49,Normal
1,47,Female,Never,0,0,0,1,19.42,Nepalgunj,206,1,0,40-49,Normal
2,75,Male,Former,50,0,1,1,22.97,Hetauda,229,1,1,70-79,Normal
3,40,Female,Former,15,1,0,1,26.44,Butwal,209,1,0,40-49,Overweight
4,36,Female,Never,0,1,0,1,27.64,Dharan,239,1,0,30-39,Overweight


In [6]:
# Pollution risk score: binary flag, 1 where Air_Pollution_Level is above 150, else 0
exceeds_threshold = df["Air_Pollution_Level"] > 150
df["Pollution_Risk_Score"] = np.where(exceeds_threshold, 1, 0)

In [7]:
# Encode smoking status on an ordinal 0 / 0.5 / 1 scale in a new column.
# Labels missing from the mapping come out as NaN.
smoking_codes = {"Current": 1, "Former": 0.5, "Never": 0}
df["Smoking_Status_encoded"] = df["Smoking_Status"].map(smoking_codes)

# Label-encode gender in place (Male -> 1, Female -> 0).
# The column is overwritten, so mapping it a second time would send the 0/1
# codes through the string mapping and turn every value into NaN. The dtype
# guard makes the cell safe to re-run: an already numeric column is left alone.
gender_codes = {"Male": 1, "Female": 0}
if not pd.api.types.is_numeric_dtype(df["Gender"]):
    df["Gender"] = df["Gender"].map(gender_codes)

In [8]:
# Preview the first five rows to check the encoded Gender and the new score columns
df.head(n=5)

Unnamed: 0,Age,Gender,Smoking_Status,Duration_Of_Smoking,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_Categories,Pollution_Risk_Score,Smoking_Status_encoded
0,49,0,Current,32,0,1,0,19.75,Butwal,98,1,1,40-49,Normal,0,1.0
1,47,0,Never,0,0,0,1,19.42,Nepalgunj,206,1,0,40-49,Normal,1,0.0
2,75,1,Former,50,0,1,1,22.97,Hetauda,229,1,1,70-79,Normal,1,0.5
3,40,0,Former,15,1,0,1,26.44,Butwal,209,1,0,40-49,Overweight,1,0.5
4,36,0,Never,0,1,0,1,27.64,Dharan,239,1,0,30-39,Overweight,1,0.0


In [9]:
# Interaction feature: encoded smoking status multiplied by the air pollution level
smoking_weight = df["Smoking_Status_encoded"]
df["Smoking_Pollution_Interaction"] = smoking_weight.mul(df["Air_Pollution_Level"])

In [10]:
# One-hot encode Location; drop_first leaves out the indicator for the first level.
# The original Location column is replaced by the indicator columns.
one_hot_columns = ["Location"]
df = pd.get_dummies(data=df, columns=one_hot_columns, drop_first=True)

## Machine Learning Data

In [11]:
# Summarise the engineered frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 24 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   Age                               2000 non-null   int64   
 1   Gender                            2000 non-null   int64   
 2   Smoking_Status                    2000 non-null   object  
 3   Duration_Of_Smoking               2000 non-null   int64   
 4   Biomass_Fuel_Exposure             2000 non-null   int64   
 5   Occupational_Exposure             2000 non-null   int64   
 6   Family_History_COPD               2000 non-null   int64   
 7   BMI                               2000 non-null   float64 
 8   Air_Pollution_Level               2000 non-null   int64   
 9   Respiratory_Infections_Childhood  2000 non-null   int64   
 10  COPD_Diagnosis                    2000 non-null   int64   
 11  Age_Category                      2000 non-null   catego

In [12]:
# Remove the raw smoking label and the two binned category columns.
# Note: Age_Category and BMI_Categories therefore do not reach the saved file.
columns_to_drop = ["Smoking_Status", "Age_Category", "BMI_Categories"]
df = df.drop(columns_to_drop, axis=1)

In [13]:
# Write the engineered frame to disk without the row index
engineered_csv_path = "../../data/engineered_copd_data.csv"
df.to_csv(engineered_csv_path, index=False)