In [42]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [43]:
# Read the raw CSV and discard its "id" column in a single expression.
lung_cancer_df = pd.read_csv("../data/raw/Lung_Cancer.csv").drop(columns="id")

In [44]:
# Summarise the frame: column names, non-null counts and dtypes.
lung_cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890000 entries, 0 to 889999
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   age                 890000 non-null  float64
 1   gender              890000 non-null  object 
 2   country             890000 non-null  object 
 3   diagnosis_date      890000 non-null  object 
 4   cancer_stage        890000 non-null  object 
 5   family_history      890000 non-null  object 
 6   smoking_status      890000 non-null  object 
 7   bmi                 890000 non-null  float64
 8   cholesterol_level   890000 non-null  int64  
 9   hypertension        890000 non-null  int64  
 10  asthma              890000 non-null  int64  
 11  cirrhosis           890000 non-null  int64  
 12  other_cancer        890000 non-null  int64  
 13  treatment_type      890000 non-null  object 
 14  end_treatment_date  890000 non-null  object 
 15  survived            890000 non-nul

In [45]:
# Preview the first five rows of the frame.
lung_cancer_df.head()

Unnamed: 0,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,64.0,Male,Sweden,2016-04-05,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,50.0,Female,Netherlands,2023-04-20,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,65.0,Female,Hungary,2023-04-05,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,51.0,Female,Belgium,2016-02-05,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,37.0,Male,Luxembourg,2023-11-29,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


In [46]:
# Column groups referenced by the preprocessing cells of this notebook.

# Columns loaded with a numeric dtype (float64 / int64).
numeric_features = [
    "age", "bmi", "cholesterol_level",
    "hypertension", "asthma", "cirrhosis", "other_cancer",
]

# String-valued columns.
categoriocal_features = [
    "gender", "country", "cancer_stage",
    "family_history", "smoking_status", "treatment_type",
]

# Date columns.
date_features = ["diagnosis_date", "end_treatment_date"]

# Target column, kept as a one-element list so it can index the frame directly.
label = ["survived"]

# Processing dates

In [47]:
# Remove every column listed in date_features to avoid data leakage.
lung_cancer_df = lung_cancer_df.drop(columns=date_features)

# Exploring and encoding categorical features

In [48]:
# For each categorical column, print its value counts, then the number of
# distinct values, then an empty line as a separator.
for column_name in categoriocal_features:
    column = lung_cancer_df[column_name]
    print(column.value_counts(), len(column.unique()), "", sep="\n")

gender
Male      445134
Female    444866
Name: count, dtype: int64
2

country
Malta             33367
Ireland           33243
Portugal          33208
France            33199
Sweden            33161
Croatia           33138
Greece            33052
Spain             33042
Netherlands       33040
Denmark           33024
Slovenia          33005
Belgium           32986
Hungary           32981
Romania           32963
Poland            32949
Italy             32948
Germany           32940
Estonia           32893
Czech Republic    32885
Lithuania         32856
Slovakia          32853
Austria           32832
Finland           32798
Luxembourg        32794
Cyprus            32719
Latvia            32565
Bulgaria          32559
Name: count, dtype: int64
27

cancer_stage
Stage III    222594
Stage IV     222527
Stage I      222516
Stage II     222363
Name: count, dtype: int64
4

family_history
No     445181
Yes    444819
Name: count, dtype: int64
2

smoking_status
Passive Smoker    223170
Never Smok

In [49]:
# One-hot encode the categorical columns into int8 indicator columns.
# drop="first" leaves out the first category of every feature
# (useful to test Logistic Regression).
# NOTE(review): with handle_unknown="ignore", a category not seen during fit
# would presumably be encoded as all zeros, i.e. the same as the dropped
# category. Confirm this is acceptable if the encoder is reused on new data.
one_hot_encoder = OneHotEncoder(
    drop="first",
    dtype="int8",
    sparse_output=False,
    handle_unknown="ignore",
)
lung_ohe_cat_matrix = one_hot_encoder.fit_transform(lung_cancer_df[categoriocal_features])
ohe_name_cols = one_hot_encoder.get_feature_names_out()

# Swap the original string columns for their encoded counterparts.
lung_cancer_df = lung_cancer_df.drop(columns=categoriocal_features)
lung_cancer_df[ohe_name_cols] = lung_ohe_cat_matrix

# Exploring numeric variables

In [50]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
lung_cancer_df[numeric_features].describe()

Unnamed: 0,age,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer
count,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0
mean,55.007008,30.494172,233.633916,0.750024,0.46974,0.225956,0.088157
std,9.994485,8.368539,43.432278,0.432999,0.499084,0.418211,0.283524
min,4.0,16.0,150.0,0.0,0.0,0.0,0.0
25%,48.0,23.3,196.0,1.0,0.0,0.0,0.0
50%,55.0,30.5,242.0,1.0,0.0,0.0,0.0
75%,62.0,37.7,271.0,1.0,1.0,0.0,0.0
max,104.0,45.0,300.0,1.0,1.0,1.0,1.0


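The variables "hypertension", "asthma", "cirrhosis" and "other_cancer" are binary: each one takes only the values 0 and 1, so they are treated as categorical flags rather than as continuous numeric features.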

In [51]:
# Binary 0/1 flags: they are taken out of the list of features that get scaled.
aditional_cat = ["hypertension", "asthma", "cirrhosis", "other_cancer"]

# Filter the list instead of calling list.remove() inside a throwaway
# comprehension: remove() raises ValueError once the names are already gone,
# so the previous form failed when the cell was executed a second time.
# This form yields the same list on every run.
numeric_features = [feat for feat in numeric_features if feat not in aditional_cat]

numeric_features

['age', 'bmi', 'cholesterol_level']

In [52]:
# Store the binary flag columns as int8; all other columns keep their dtype.
lung_cancer_df = lung_cancer_df.astype(dict.fromkeys(aditional_cat, "int8"))

In [53]:
# Standardise the columns listed in numeric_features with a StandardScaler.
# NOTE(review): the scaler is fitted on every row of the dataset; if a
# train/test split is made downstream, confirm that this is intended.
normalizer = StandardScaler().fit(lung_cancer_df[numeric_features])
lung_cancer_df[numeric_features] = normalizer.transform(lung_cancer_df[numeric_features])

# Changing data type of label

In [54]:
# Cast the target column(s) named in `label` to int8.
lung_cancer_df = lung_cancer_df.astype(dict.fromkeys(label, "int8"))

# Saving

In [55]:
# Collect every column name except the target, so that the target can be
# placed in front of the features afterwards.
all_columns = [*lung_cancer_df.columns]
all_columns.remove(label[0])

In [56]:
# Reorder the columns: target first, then every feature column.
lung_cancer_df = lung_cancer_df[[*label, *all_columns]]

In [57]:
# Write the processed frame to a Parquet file; index=False leaves the index out of the file.
lung_cancer_df.to_parquet("../data/processed/lung_cancer.parquet", index=False)