#### 1. Import Libraries : Import necessary Python libraries for data manipulation, visualization, and clustering

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler

#### 2. Load Data : Load the dataset into a pandas DataFrame.
#### 3. Data Dictionary : Load a data dictionary that describes the dataset

In [2]:
# Survey measurements, read from the CSV export.
data = pd.read_csv(filepath_or_buffer='ML3 data.csv')
# Data dictionary: variable names with their description and component.
cd = pd.read_excel(io='Data_Description.xlsx')

In [3]:
# Preview the first two rows of the survey data.
data.head(n=2)

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,ar16,ar17,ar18,ar19,ar20,ar21,ar22,ar23,ar24,ar25
0,73557,8,2,1,69,,4,4,1,,...,1.470207,4.864849,3.636711,-9.174492,-9.35341,5.153847,1.294983,-1.015693,-8.671692,5.423061
1,73558,8,2,1,54,,3,3,1,,...,-7.363791,-2.566326,0.776046,-2.951925,-0.538597,3.77029,6.711471,3.521875,-1.765809,-5.383746


In [4]:
# Preview the first two entries of the data dictionary.
cd.head(n=2)

Unnamed: 0,Variable Name,Variable Description,Data File Name,Data File Description,Begin Year,EndYear,Component,Use Constraints
0,SEQN,Respondent sequence number.,DEMO_H,Demographic Variables and Sample Weights,2013,2014,Demographics,
1,SDDSRVYR,Data release cycle,DEMO_H,Demographic Variables and Sample Weights,2013,2014,Demographics,


#### 4. Initial Data Exploration : List the components named in the data dictionary, collect the dietary variable names, and inspect the size and dtypes of the DataFrame.

In [5]:
# Distinct component labels used by the data dictionary.
cd.loc[:, 'Component'].unique()

array(['Demographics', 'Dietary', 'Laboratory'], dtype=object)

In [6]:
# Variable names the dictionary tags as 'Dietary' (a Series that keeps cd's index).
is_dietary = cd['Component'] == 'Dietary'
dieteryColumns = cd.loc[is_dietary, "Variable Name"]

In [7]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called bare; wrapping it in print() appended a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9813 entries, 0 to 9812
Columns: 662 entries, SEQN to ar25
dtypes: float64(641), int64(21)
memory usage: 49.6 MB
None


#### 5. Data Cleaning : Fill missing values in the dietary columns with zeros, on the assumption that these columns hold questionnaire answers.

In [8]:
# Zero-fill the dietary variables in a single assignment.
# The previous per-column `data[col].fillna(0, inplace=True)` was chained
# in-place assignment: pandas deprecates it and, with Copy-on-Write enabled,
# it leaves `data` unchanged.
# dict.fromkeys de-duplicates the names while preserving their order.
dietary_cols = list(dict.fromkeys(dieteryColumns))
data[dietary_cols] = data[dietary_cols].fillna(0)

#### 6. Demographic Data Cleaning : Fill missing values in demographic columns with zeros.

In [9]:
# Variable names the dictionary tags as 'Demographics'.
demographicsColumns = cd.loc[cd['Component'] == 'Demographics', "Variable Name"]

# Zero-fill those columns in a single assignment.
# The previous per-column `data[col].fillna(0, inplace=True)` was chained
# in-place assignment: pandas deprecates it and, with Copy-on-Write enabled,
# it leaves `data` unchanged.
# dict.fromkeys de-duplicates the names while preserving their order.
demographic_cols = list(dict.fromkeys(demographicsColumns))
data[demographic_cols] = data[demographic_cols].fillna(0)

#### 7. Laboratory Data Cleaning : Fill missing values in laboratory columns with zeros.

In [10]:
# Variable names the dictionary tags as 'Laboratory'.
labColumns = cd.loc[cd['Component'] == 'Laboratory', "Variable Name"]

# Zero-fill those columns in a single assignment.
# The previous per-column `data[col].fillna(0, inplace=True)` was chained
# in-place assignment: pandas deprecates it and, with Copy-on-Write enabled,
# it leaves `data` unchanged.
# dict.fromkeys de-duplicates the names while preserving their order.
lab_cols = list(dict.fromkeys(labColumns))
data[lab_cols] = data[lab_cols].fillna(0)

#### 8. Final Missing Value Handling : Fill any remaining missing values in the dataset with zeros.

In [11]:
# Catch-all: replace every NaN still left anywhere in the frame with 0,
# including columns that the data dictionary does not describe.
data.fillna(value=0, inplace=True)

#### 9. Feature Selection : Keep the columns described in the data dictionary (dietary, demographic, and laboratory variables) as clustering features and drop the remaining columns.

In [12]:
# Sanity check: show dictionary rows whose component is none of the three
# known ones (the displayed result is expected to be empty).
# One mask replaces the three chained filters, which applied masks built on
# the full frame to already-filtered frames and made pandas reindex the
# boolean key with a UserWarning.
known_components = ['Dietary', 'Demographics', 'Laboratory']
cd[~cd['Component'].isin(known_components)]

Unnamed: 0,Variable Name,Variable Description,Data File Name,Data File Description,Begin Year,EndYear,Component,Use Constraints


In [13]:
# (rows, columns) of the survey data, then of the data dictionary, one per line.
print(data.shape, cd.shape, sep='\n')

(9813, 662)
(636, 8)


In [14]:
# Stack the three per-component name Series end to end; the original index
# labels from the data dictionary are kept.
name_groups = [dieteryColumns, demographicsColumns, labColumns]
reqColumns = pd.concat(objs=name_groups)
print(reqColumns)

47       WTDRD1
48       WTDR2D
49     DR1DRSTZ
50     DR1EXMER
51        DRABF
         ...   
631    URDUURLC
632     URXPREG
633      URXUAS
634      LBDB12
635    LBDB12SI
Name: Variable Name, Length: 636, dtype: object


In [18]:
# Normalise the dictionary's variable names: drop missing entries, cast to str
# and trim surrounding whitespace.
candidate_names = reqColumns.dropna().astype(str).str.strip()

# Keep only the names that are actual columns of `data`.
features = [name for name in candidate_names.tolist() if name in data.columns]

# Complement: the columns of `data` that are not in `features`.
new_data = data.drop(columns=features)

In [19]:
# Preview the columns that were left out of the feature set.
new_data.head(n=2)

Unnamed: 0,URXUCR.x,WTSAF2YR.x,WTSA2YR.x,URXUCR.y,WTSB2YR.x,PHAFSTHR.x,PHAFSTMN.x,LBDRPCR.x,LBDRHP.x,LBDRLP.x,...,ar16,ar17,ar18,ar19,ar20,ar21,ar22,ar23,ar24,ar25
0,39.0,0.0,0.0,0.0,0.0,3.0,47.0,0.0,0.0,0.0,...,1.470207,4.864849,3.636711,-9.174492,-9.35341,5.153847,1.294983,-1.015693,-8.671692,5.423061
1,50.0,0.0,0.0,0.0,0.0,3.0,14.0,1.0,1.0,1.0,...,-7.363791,-2.566326,0.776046,-2.951925,-0.538597,3.77029,6.711471,3.521875,-1.765809,-5.383746


In [20]:
# Restrict `data` to the selected feature columns, in the order of `features`.
data = data.loc[:, features]

In [21]:
# (rows, columns) after feature selection.
print(f"{data.shape}")

(9813, 636)
(636, 8)


#### 10. Data Transformation : Scale the selected features to the [0, 1] range using MinMaxScaler.

In [26]:
# Min-max scale every selected column; the fitted scaler stays available as
# `scaler` and the scaled values are stored in `data_scaled`.
# NOTE(review): `data` still includes SEQN, which the data dictionary describes
# as the respondent sequence number. Confirm that an identifier should be
# scaled and used as a feature.
scaler = MinMaxScaler()
data_scaled = scaler.fit(data).transform(data)