# Data preprocessing

In [41]:
import pandas as pd
import numpy as np

In [42]:
df_raw: pd.DataFrame = pd.read_sas('/content/LLCP2023.XPT', iterator=False, chunksize=None)
df_raw.head()

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,DROCDY4_,_RFBING6,_DRNKWK2,_RFDRHV8,_FLSHOT7,_PNEUMO3,_AIDTST4,_RFSEAT2,_RFSEAT3,_DRNKDRV
0,1.0,1.0,b'03012023',b'03',b'01',b'2023',1100.0,b'2023000001',2023000000.0,1.0,...,5.397605e-79,1.0,5.397605e-79,1.0,2.0,2.0,2.0,1.0,1.0,9.0
1,1.0,1.0,b'01062023',b'01',b'06',b'2023',1100.0,b'2023000002',2023000000.0,1.0,...,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0,2.0,1.0,1.0,9.0
2,1.0,1.0,b'03082023',b'03',b'08',b'2023',1100.0,b'2023000003',2023000000.0,1.0,...,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0,2.0,1.0,1.0,9.0
3,1.0,1.0,b'03062023',b'03',b'06',b'2023',1100.0,b'2023000004',2023000000.0,1.0,...,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0,1.0,1.0,1.0,9.0
4,1.0,1.0,b'01062023',b'01',b'06',b'2023',1100.0,b'2023000005',2023000000.0,1.0,...,7.0,1.0,47.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0


In [43]:
df_raw.shape

(433323, 350)

Dataset has 433 323 observations and 350 columns in total

## Data cleaning
The dataset contains multiple columns that are excessive for our task

### Dropping not needed columns:

In [44]:
columns_to_drop = [
    '_STATE', # State FIPS Code
    'FMONTH', # File Month
    'IDATE', # Interview Date
    'IMONTH', # Interview Month
    'IDAY', # Interview Day
    'IYEAR', # Interview Year
    'DISPCODE', # Final Disposition (interview completed fully of partially)
    'SEQNO', # Annual Sequence Number
    '_PSU', # Primary Sampling Unit (Equal to Annual Sequence Number)
    'CTELENM1', # Correct telephone number?
    'PVTRESD1', # Is this a private residence?
    'COLGHOUS', # Do you live in college housing?
    'STATERE1', # Do you currently live in  ____(state)____?
    'CELPHON1', # Is this a cell telephone?
    'LADULT1', # Are you 18 years of age or older? [NA: 344,978, Yes: 88,212, No: 133]
    'NUMADULT', # how many members of your household, including yourself, are 18 years of age or older? [NA: 344,993]
    'RESPSLC1', # The person in your household that I need to speak with...
    'LANDSEX2',
    'LNDSXBRT',
    'SAFETIME', # Is this a safe time to talk with you?
    'CTELNUM1',
    'CELLFON5', # Is this a cell phone?
    'CADULT1', # Are you 18 years of age or older?
    'CELLSEX2',
    'CELSXBRT',
    'PVTRESD3',
    'CCLGHOUS',
    'CSTATE1',
    'LANDLINE',
    'HHADULT',
    'PRIMINS1', # What is the current source of your primary health insurance? (irrelevant)
    'NUMHHOL4', # Number of landline telephones
    'NUMPHON4',
    'FLSHTMY3', # During what month and year did you receive your most recent flu vaccine
    '_STSTR',
    '_STRWT',
    '_DUALUSE',
    '_SEX', # Very close to SEXVAR variable
    '_METSTAT',
    '_URBSTAT',
    'MSCODE',
    '_WT2RAKE',
    '_LLCPWT2',
    '_RAWRAKE',
    '_CLLCPWT',
    '_DUALCOR',
    '_LLCPWT',
    'PADUR1_',
    'PADUR2_',
    'PAMIN13_',
    'PAMIN23_',
    'PA3MIN_',
    'PAVIG13_',
    'PAVIG23_',
    'PA3VIGM_',
    'HTIN4',
    'HTM4',
    'WTKG3',
    '_BMI5'
]
df_raw.drop(columns=columns_to_drop, inplace=True)

In [45]:
df_raw.shape

(433323, 291)

59 columns were dropped

### Drop columns that consist of NAs mostly:
Limit of allowed NA percentage is set to less that 60%.

In [46]:
cols_na_counts: pd.Series = df_raw.isna().sum()
cols_prim_na: pd.Series = cols_na_counts[cols_na_counts >= df_raw.shape[0] * 0.6]
cols_prim_na

Unnamed: 0,0
ASTHNOW,368957
DIABAGE4,373537
PREGNANT,357115
FALLINJ5,355584
SMOKDAY2,274684
...,...
CASTHNO2,428574
_CRACE1,382113
CAGEG,391398
_FLSHOT7,262500


In [47]:
df_raw.drop(columns=cols_prim_na.index, inplace=True)
df_raw.shape

(433323, 141)

150 columns were dropped

### Drop columns with too much skewness in data:

In [48]:
# Made exceptions for columns in category "Chronic Health Conditions" or related that might be of use in the project
cols_to_exclude = [
    'CVDINFR4', # Ever Diagnosed with Heart Attack
    'CVDCRHD4', # Ever Diagnosed with Angina or Coronary Heart Disease
    'CVDSTRK3', # Ever Diagnosed with a Stroke
    'ASTHMA3', # Ever Told Had Asthma
    'CHCSCNC1', # (Ever told) (you had) skin cancer that is not melanoma?
    'CHCOCNC1', # (Ever told) (you had) melanoma or any other types of cancer?
    'CHCCOPD3', # (Ever told) (you had) C.O.P.D. (chronic obstructive pulmonary disease), emphysema or chronic bronchitis?
    'ADDEPEV3', # (Ever told) you had a depressive disorder
    'CHCKDNY2', # Ever told you have kidney disease?
    'HAVARTH4', # Told Had Arthritis
    'DIABETE4', # (Ever told) you had diabetes
    'EXRACT12', # What type of physical activity or exercise did you spend the most time doing during the past month?
    'DRNKANY6', # Drink any alcoholic beverages in past 30 days
    'EXERANY2'
]

In [49]:
# Also exclude calculated variables, will decide for each of them separately
var_skewness = df_raw.loc[:, ~df_raw.columns.str.startswith('_')].drop(columns=cols_to_exclude).skew()
df_raw.drop(columns=var_skewness[abs(var_skewness) > 1].index, inplace=True)

In [50]:
df_raw.columns

Index(['SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'EXERANY2',
       'EXRACT12', 'EXRACT22', 'STRENGTH', 'BPHIGH6', 'CVDINFR4', 'CVDCRHD4',
       'CVDSTRK3', 'ASTHMA3', 'CHCSCNC1', 'CHCOCNC1', 'CHCCOPD3', 'ADDEPEV3',
       'CHCKDNY2', 'HAVARTH4', 'DIABETE4', 'EDUCA', 'EMPLOY1', 'ALCDAY4',
       'SDLONELY', 'SDHSTRE1', '_IMPRACE', '_CHISPNC', '_RFHLTH', '_PHYS14D',
       '_MENT14D', '_HLTHPL1', '_HCVU653', '_TOTINDA', 'METVL12_', 'METVL22_',
       'ACTIN13_', 'ACTIN23_', '_MINAC12', '_MINAC22', '_PACAT3', '_PAINDX3',
       '_PA150R4', '_PA300R4', '_PA30023', '_PASTRNG', '_PAREC3', '_PASTAE3',
       '_RFHYPE6', '_CHOLCH3', '_RFCHOL3', '_MICHD', '_LTASTH1', '_CASTHM1',
       '_ASTHMS1', '_DRDXAR2', '_MRACE1', '_HISPANC', '_RACE', '_RACEG21',
       '_RACEGR3', '_RACEPRV', '_AGEG5YR', '_AGE65YR', '_AGE80', '_AGE_G',
       '_BMI5CAT', '_RFBMI5', '_CHLDCNT', '_EDUCAG', '_INCOMG1', '_SMOKER3',
       '_RFSMOK3', '_CURECI2', 'DRNKANY6', '_RFBING6', '_DRNKWK2', '_RFDRHV8'

Choose between calculated and original columns, drop not needed ones:

In [51]:
cols_with_replacement = [
    '_RFHLTH',
    '_PASTRNG',
    '_PAREC3',
    '_PASTAE3',
    'BPHIGH6',
    'CVDINFR4',
    'CVDCRHD4',
    'ASTHMA3',
    '_LTASTH1',
    '_CASTHM1',
    'HAVARTH4',
    'EDUCA',
    'ALCDAY4',
    '_CHISPNC',
    '_RFHLTH',
    '_HCVU653',
    'METVL12_',
    'METVL22_',
    '_PAREC3',
    '_PASTAE3',
    '_HISPANC',
    '_RACE',
    '_RACEG21',
    '_RACEGR3',
    '_RACEPRV',
    '_AGE65YR',
    '_AGE80',
    '_AGE_G',
    '_RFBMI5',
    '_RFSMOK3',
    '_CURECI2',
    '_RFSEAT2'
]
df_raw.drop(columns=cols_with_replacement, inplace=True)

In [52]:
df_raw.columns

Index(['SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'EXERANY2',
       'EXRACT12', 'EXRACT22', 'STRENGTH', 'CVDSTRK3', 'CHCSCNC1', 'CHCOCNC1',
       'CHCCOPD3', 'ADDEPEV3', 'CHCKDNY2', 'DIABETE4', 'EMPLOY1', 'SDLONELY',
       'SDHSTRE1', '_IMPRACE', '_PHYS14D', '_MENT14D', '_HLTHPL1', '_TOTINDA',
       'ACTIN13_', 'ACTIN23_', '_MINAC12', '_MINAC22', '_PACAT3', '_PAINDX3',
       '_PA150R4', '_PA300R4', '_PA30023', '_RFHYPE6', '_CHOLCH3', '_RFCHOL3',
       '_MICHD', '_ASTHMS1', '_DRDXAR2', '_MRACE1', '_AGEG5YR', '_BMI5CAT',
       '_CHLDCNT', '_EDUCAG', '_INCOMG1', '_SMOKER3', 'DRNKANY6', '_RFBING6',
       '_DRNKWK2', '_RFDRHV8', '_AIDTST4', '_RFSEAT3', '_DRNKDRV'],
      dtype='object')

Remaining columns and their descriptions:

- SEXVAR: Sex of Respondent
- _AGEG5YR: "Reported age in five-year age categories calculated variable"
- _IMPRACE: "Imputed race/ethnicity value"
- _MRACE1: "Calculated non-Hispanic Race including multiracial"
- _EDUCAG: "Computed level of education completed categories"
- EMPLOY1: "Employment Status"
- _INCOMG1: "Computed income categories"
- _HLTHPL1: "Have any health insurance"


Health
- GENHLTH: General health status
- PHYSHLTH: Number of Days Physical Health Not Good
- MENTHLTH: Number of Days Mental Health Not Good
- POORHLTH: "During the past 30 days, for about how many days did poor physical or mental health keep you from doing your usual activities, such as self-care, work, or recreation?"

Diseases
- CVDSTRK3: "Ever Diagnosed with a Stroke"
- CHCSCNC1: "(Ever told) (you had) skin cancer that is not melanoma?"
- CHCOCNC1: "(Ever told) (you had)  melanoma or any other types of cancer?"
- CHCCOPD3: "Ever told you had C.O.P.D. emphysema or chronic bronchitis?"
- CHCKDNY2: "Ever told you have kidney disease?"
- DIABETE4: "(Ever told) you had diabetes"
- _RFHYPE6: "High Blood Pressure Calculated Variable"
- _CHOLCH3: "Cholesterol Checked Calculated Variable"
- _RFCHOL3: "High Cholesterol Calculated Variable"
- _MICHD: "Ever had CHD or MI"
- _ASTHMS1: "Computed Asthma Status"
- _DRDXAR2: "Respondents diagnosed with arthritis"
- _BMI5CAT: "Computed body mass index categories"

Mental Health
- ADDEPEV3: "(Ever told) you had a depressive disorder"
- _MENT14D: "Computed Mental Health Status"
- SDLONELY: "How often do you feel lonely?"
- SDHSTRE1: "How often have you felt this kind of stress?"


Physical Activity
- EXERANY2: "Exercise in Past 30 Days"
- EXRACT12: "What type of physical activity or exercise did you spend the most time doing during the past month?"
- EXRACT22: "What other type of physical activity gave you the next most exercise during the past month?"
- STRENGTH: "How many times did you do physical activities or exercises to STRENGTHEN your muscles?"
- _PHYS14D: "Computed Physical Health Status"
- _TOTINDA: "Leisure Time Physical Activity Calculated Variable"
- ACTIN13_: "Estimated Activity Intensity for First Activity"
- ACTIN23_: "Estimated Activity Intensity for Second Activity"
- _MINAC12: "Minutes of Physical Activity per week for First Activity"
- _MINAC22: "Minutes of Physical Activity per week for Second Activity"
- _PACAT3: "Physical Activity Categories"
- _PAINDX3: "Physical Activity Index"
- _PA150R4: "150 Minute Physical Activity Calculated Variable"
- _PA300R4: "300 Minute Physical Activity Calculated Variable"
- _PA30023: "300 Minute Physical Activity 2-Level Calculated Variable"

Alcohol
- DRNKANY6: "Drink any alcoholic beverages in past 30 days"
- _RFBING6: "Binge Drinking Calculated Variable"
- _DRNKWK2: "Computed number of drinks of alcohol beverages per week"
- _DRNKDRV: "Drinking and Driving"
- _RFDRHV8: "Heavy Alcohol Consumption  Calculated Variable"

Other
- _SMOKER3: "Computed Smoking Status"
- _RFSEAT3: "Always Wear Seat Belts"
- _AIDTST4: "Ever been tested for HIV calculated variable"
- _CHLDCNT: "Computed number of children in household"





### Leave only columns that are related to the research topic
All remaining columns were analyzed and carefully filtered for data duplication

In [53]:
df = df_raw[['_AGEG5YR', '_ASTHMS1', '_BMI5CAT', '_CHLDCNT', '_DRDXAR2', '_EDUCAG', '_IMPRACE', '_INCOMG1', '_MENT14D', '_MICHD', '_PHYS14D', '_RFCHOL3', '_RFHYPE6', '_SMOKER3', 'ADDEPEV3', 'CHCCOPD3', 'CHCKDNY2', 'CHCOCNC1', 'CHCSCNC1', 'CVDSTRK3', 'DIABETE4', 'GENHLTH', 'POORHLTH', 'SDHSTRE1', 'SDLONELY', 'SEXVAR', '_DRNKWK2', '_TOTINDA']].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433323 entries, 0 to 433322
Data columns (total 28 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   _AGEG5YR  433323 non-null  float64
 1   _ASTHMS1  433323 non-null  float64
 2   _BMI5CAT  392788 non-null  float64
 3   _CHLDCNT  433323 non-null  float64
 4   _DRDXAR2  430763 non-null  float64
 5   _EDUCAG   433323 non-null  float64
 6   _IMPRACE  433323 non-null  float64
 7   _INCOMG1  433323 non-null  float64
 8   _MENT14D  433323 non-null  float64
 9   _MICHD    428738 non-null  float64
 10  _PHYS14D  433323 non-null  float64
 11  _RFCHOL3  381512 non-null  float64
 12  _RFHYPE6  433323 non-null  float64
 13  _SMOKER3  433323 non-null  float64
 14  ADDEPEV3  433320 non-null  float64
 15  CHCCOPD3  433320 non-null  float64
 16  CHCKDNY2  433320 non-null  float64
 17  CHCOCNC1  433320 non-null  float64
 18  CHCSCNC1  433320 non-null  float64
 19  CVDSTRK3  433319 non-null  float64
 20  DIAB

### Drop rows with NA

In [54]:
df.dropna(inplace=True)
df.shape

(105257, 28)

## Data formatting

### Replace initial values with meaningful categories

In [55]:
df['_DRNKWK21'] = df['_DRNKWK2'].apply(lambda x: 'No' if x < 0.99 else 'Yes')
df.drop(['_DRNKWK2'], axis=1, inplace=True)

In [56]:
def label_Temp(row):
   if row['POORHLTH'] == 88:
      return 'Zero days'
   if row['POORHLTH'] < 14:
      return '1-13 days'
   else : return '14+ days'

df.apply(label_Temp, axis=1)
df['POORHLTH'] = df.apply(label_Temp, axis=1)

In [57]:
sex_col = {
          1: 'Male',
          2: 'Female'
          }
df['SEXVAR'] = df['SEXVAR'].replace(sex_col)

In [58]:
hlth_col = {
          1: 'Excellent',
          2: 'Very good',
          3: 'Good',
          4: 'Fair',
          5: 'Poor'
          }
df['GENHLTH'] = df['GENHLTH'].replace(hlth_col)

In [59]:
phlth_col = {
          88: 'Zero days'
          }
df['POORHLTH'] = df['POORHLTH'].replace(phlth_col)

In [60]:
exer_col = {
          1: 'Yes',
          2: 'No'
          }
df['CVDSTRK3'] = df['CVDSTRK3'].replace(exer_col)
df['CHCCOPD3'] = df['CHCCOPD3'].replace(exer_col)
df['ADDEPEV3'] = df['ADDEPEV3'].replace(exer_col)
df['CHCKDNY2'] = df['CHCKDNY2'].replace(exer_col)
df['_MICHD'] = df['_MICHD'].replace(exer_col)
df['_DRDXAR2'] = df['_DRDXAR2'].replace(exer_col)
df['_TOTINDA'] = df['_TOTINDA'].replace(exer_col)
df['CHCSCNC1'] = df['CHCSCNC1'].replace(exer_col)
df['CHCOCNC1'] = df['CHCOCNC1'].replace(exer_col)

In [61]:
diabete_col = {
          1: 'Yes',
          2: 'Yes',
          3: 'No',
          4: 'Yes'
          }
df['DIABETE4'] = df['DIABETE4'].replace(diabete_col)

In [62]:
bmi_col = {
          1: '77',
          2: 'Normal Weight',
          3: 'Overweight',
          4: 'Obese'
          }
df['_BMI5CAT'] = df['_BMI5CAT'].replace(bmi_col)

In [63]:
sad_col = {
          1: 'Always',
          2: 'Usually',
          3: 'Sometimes',
          4: 'Rarely',
          5: 'Never'
          }
df['SDLONELY'] = df['SDLONELY'].replace(sad_col)
df['SDHSTRE1'] = df['SDHSTRE1'].replace(sad_col)

In [64]:
race_col = {
          1: 'White',
          2: 'Other',
          3: 'Other',
          4: 'Other',
          5: 'Other',
          6: 'Other'
          }
df['_IMPRACE'] = df['_IMPRACE'].replace(race_col)

In [65]:
phy_col = {
          1: 'Zero days',
          2: '1-13 days',
          3: '14+ days'
          }
df['_PHYS14D'] = df['_PHYS14D'].replace(phy_col)
df['_MENT14D'] = df['_MENT14D'].replace(phy_col)

In [66]:
hbp_col = {
          1: 'No',
          2: 'Yes'
          }
df['_RFHYPE6'] = df['_RFHYPE6'].replace(hbp_col)
df['_RFCHOL3'] = df['_RFCHOL3'].replace(hbp_col)

In [67]:
ast_col = {
          1: 'Yes',
          2: 'Yes',
          3: 'No'
          }
df['_ASTHMS1'] = df['_ASTHMS1'].replace(ast_col)

In [68]:
age_col = {
          1: 'Age 18 to 29',
          2: 'Age 18 to 29',
          3: 'Age 30 to 39',
          4: 'Age 30 to 39',
          5: 'Age 40 to 49',
          6: 'Age 40 to 49',
          7: 'Age 50 to 59',
          8: 'Age 50 to 59',
          9: 'Age 60 to 69',
          10: 'Age 60 to 69',
          11: 'Age 70+',
          12: 'Age 70+',
          13: 'Age 70+',
          14: '7'
          }
df['_AGEG5YR'] = df['_AGEG5YR'].replace(age_col)

In [69]:
chld_col = {
          1: 'No',
          2: 'Yes',
          3: 'Yes',
          4: 'Yes',
          5: 'Yes',
          6: 'Yes'
          }
df['_CHLDCNT'] = df['_CHLDCNT'].replace(chld_col)

In [70]:
edu_col = {
          1: 'No',
          2: 'High School',
          3: 'Attended College',
          4: 'College or Technical School'
          }
df['_EDUCAG'] = df['_EDUCAG'].replace(edu_col)

In [71]:
smoker_col = {
          1: 'Yes',
          2: 'Yes',
          3: 'Yes',
          4: 'No'
          }
df['_SMOKER3'] = df['_SMOKER3'].replace(smoker_col)

In [72]:
inc_col = {
          1: 'Less than $15,000',
          2: '$15,000 to $25,000',
          3: '$25,000 to $35,000',
          4: '$35,000 to $50,000',
          5: '$50,000 to $100,000',
          6: '$100,000 to $200,00',
          7: '$200,000+',
          9: '77'
          }
df['_INCOMG1'] = df['_INCOMG1'].replace(inc_col)

In [73]:
df = df[~df[['GENHLTH', 'CVDSTRK3', 'CHCSCNC1', 'CHCOCNC1', 'CHCCOPD3', 'ADDEPEV3', 'CHCKDNY2', 'DIABETE4', 'SDLONELY', 'SDHSTRE1', '_PHYS14D', '_MENT14D', '_RFHYPE6', '_RFCHOL3', '_MICHD', '_ASTHMS1', '_AGEG5YR', '_CHLDCNT', '_EDUCAG', '_SMOKER3', '_TOTINDA']].isin([7, 9, '7']).any(axis=1)]
df = df[~df[['POORHLTH', '_BMI5CAT', '_INCOMG1']].isin([77, 99, '77']).any(axis=1)]

In [74]:
df['CHCSCNC'] = np.where((df['CHCSCNC1'] == 'Yes') | (df['CHCOCNC1'] == 'Yes'), 'Yes', 'No')
df.drop(['CHCSCNC1', 'CHCOCNC1'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CHCSCNC'] = np.where((df['CHCSCNC1'] == 'Yes') | (df['CHCOCNC1'] == 'Yes'), 'Yes', 'No')


### Set all variables categorical

In [75]:
df = df.astype('category')

In [76]:
for column in df.select_dtypes(include='category').columns:
    print(df[column].astype(str).value_counts().sort_index())
    print()

_AGEG5YR
Age 18 to 29     6597
Age 30 to 39    10647
Age 40 to 49    12277
Age 50 to 59    14611
Age 60 to 69    17432
Age 70+         18883
Name: count, dtype: int64

_ASTHMS1
No     65638
Yes    14809
Name: count, dtype: int64

_BMI5CAT
Normal Weight    21291
Obese            31642
Overweight       27514
Name: count, dtype: int64

_CHLDCNT
No     58874
Yes    21573
Name: count, dtype: int64

_DRDXAR2
No     48289
Yes    32158
Name: count, dtype: int64

_EDUCAG
Attended College               22478
College or Technical School    37399
High School                    17209
No                              3361
Name: count, dtype: int64

_IMPRACE
Other    19445
White    61002
Name: count, dtype: int64

_INCOMG1
$100,000 to $200,00    17697
$15,000 to $25,000      7479
$200,000+               5891
$25,000 to $35,000      8769
$35,000 to $50,000     10829
$50,000 to $100,000    25031
Less than $15,000       4751
Name: count, dtype: int64

_MENT14D
1-13 days    37356
14+ days     17740
Zero d

In [77]:
df.shape

(80447, 27)

In [78]:
for column in df.select_dtypes(include='category').columns:
    print(df[column].astype(str).value_counts().sort_index())
    print()

_AGEG5YR
Age 18 to 29     6597
Age 30 to 39    10647
Age 40 to 49    12277
Age 50 to 59    14611
Age 60 to 69    17432
Age 70+         18883
Name: count, dtype: int64

_ASTHMS1
No     65638
Yes    14809
Name: count, dtype: int64

_BMI5CAT
Normal Weight    21291
Obese            31642
Overweight       27514
Name: count, dtype: int64

_CHLDCNT
No     58874
Yes    21573
Name: count, dtype: int64

_DRDXAR2
No     48289
Yes    32158
Name: count, dtype: int64

_EDUCAG
Attended College               22478
College or Technical School    37399
High School                    17209
No                              3361
Name: count, dtype: int64

_IMPRACE
Other    19445
White    61002
Name: count, dtype: int64

_INCOMG1
$100,000 to $200,00    17697
$15,000 to $25,000      7479
$200,000+               5891
$25,000 to $35,000      8769
$35,000 to $50,000     10829
$50,000 to $100,000    25031
Less than $15,000       4751
Name: count, dtype: int64

_MENT14D
1-13 days    37356
14+ days     17740
Zero d

## Export clean dataset to CSV

In [79]:
df.describe()

Unnamed: 0,_AGEG5YR,_ASTHMS1,_BMI5CAT,_CHLDCNT,_DRDXAR2,_EDUCAG,_IMPRACE,_INCOMG1,_MENT14D,_MICHD,...,CVDSTRK3,DIABETE4,GENHLTH,POORHLTH,SDHSTRE1,SDLONELY,SEXVAR,_TOTINDA,_DRNKWK21,CHCSCNC
count,80447,80447,80447,80447,80447,80447,80447,80447,80447,80447,...,80447,80447,80447,80447,80447,80447,80447,80447,80447,80447
unique,6,2,3,2,2,4,2,7,3,2,...,2,2,5,3,5,5,2,2,2,2
top,Age 70+,No,Obese,No,No,College or Technical School,White,"$50,000 to $100,000",1-13 days,No,...,No,No,Good,Zero days,Sometimes,Rarely,Female,Yes,Yes,No
freq,18883,65638,31642,58874,48289,37399,61002,25031,37356,72336,...,76336,64198,27418,41737,23894,27433,45650,59195,43694,65724


In [80]:
df.to_csv('LLCP2023_clean.csv', index=False)