# Load libraries, set constants

In [1]:
from pathlib import Path

import PyPDF2
import tabula
import re
import numpy as np
import pandas as pd

In [2]:
#PROCESSED_FILE_PATH_PARQUET = 'CDC_2022_BRFSS_Survey.parquet'
PROCESSED_FILE_PATH_CSV = 'CDC_2022_BRFSS_Survey.csv'

In [26]:
# Change the file path
#RAW_FILE_PATH = 'Raw_Survey_Data_2022/LLCP_2022.XPT'
RAW_FILE_PATH = '/home/georgematlis/AUTH/DWS/ADVANCED_TOPICS_IN_MACHINE_LEARNING/CDC-BRFSS/Raw_Survey_Data_2022/LLCP_2022.XPT'

PDF_CODEBOOK_FILE = 'Survey_Data_2022_Codebook/Survey_Data_2022_Codebook.pdf'

VARIABLE_LIST_PATH = Path('Variable_List_with_Descriptions.txt')

FINAL_FILE = 'CDC_2022_BRFSS_Survey_No_NaNs.csv'

## Convert SAS to PARQUET

In [20]:
def sas_converter(path: str, dest_path: str, file_format: str):
    assert file_format in ['csv', 'parquet']
    
    try:
        df = pd.read_sas(path, encoding='utf-8')
    except Exception as e:
        print('Not a valid SAS file.')
        print(e)
    else:
        if file_format == 'csv':
            df.to_csv(dest_path, index=False)
        elif file_format == 'parquet':
            df.to_parquet(dest_path, index=False)

In [24]:
#sas_converter(path=RAW_FILE_PATH, dest_path=PROCESSED_FILE_PATH_PARQUET, file_format='parquet')
sas_converter(path=RAW_FILE_PATH, dest_path=PROCESSED_FILE_PATH_CSV, file_format='csv')

# Data Pre-Processing

## Information Extraction from CDC Codebook

In [None]:
def extract_text_after_keywords(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Extract text from each page
        text = ''
        for page_number in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_number]
            text += page.extract_text()
            
    # Use regular expressions to find text after specific keywords
    label_matches = re.findall(r'Label: (.+)', text)
    sas_variable_matches = re.findall(r'SAS Variable Name: (.+)', text)
    question_matches = re.findall(r'Question: (.+)', text)
    section_matches = re.findall(r'Section Name: (.+)', text)
    
    # Extracted text after keywords
    labels = [match.strip() for match in label_matches]
    sas_variable_names = [match.strip().replace(" ","") for match in sas_variable_matches]
    questions = [match.strip() for match in question_matches]
    sections = [match.strip() for match in section_matches]

    return labels, sas_variable_names, questions, sections

# Example usage
labels, sas_variable_names, questions, sections = extract_text_after_keywords(PDF_CODEBOOK_FILE)

In [8]:
print("Labels:")
print(labels)

print("\nSAS Variable Names:")
print(sas_variable_names)

print("\nQuestions:")
print(questions)

print("\nSection Names:")
print(sections)

Labels:
['State FIPS Code', 'File Month', 'Interview Date', 'Interview Month', 'Interview Day', 'Interview Year', 'Final Disposition', 'Annual Sequence Number', 'Primary Sampling Unit', 'Correct telephone number?', 'Private Residence?', 'Do you live in college housing?', 'Resident of State', 'Cellular Telephone', 'Are you 18 years of age or older?', 'Are you male or female?', 'Number of Adults in Household', 'Are you male or female?', 'Number of Adult men in Household', 'Number of Adult women in Household', 'Respondent selection', 'Safe time to talk', 'Correct Phone Number?', 'Is this a cell phone?', 'Are you 18 years of age or older?', 'Are you male or female?', 'Do you live in a private residence?', 'Do you live in college housing?', 'Do you currently live in  ____(state)____?', 'Do you also have a landline telephone?', 'Number of Adults in Household', 'Sex of Respondent', 'General Health', 'Number of Days Physical Health Not Good', 'Number of Days Mental Health Not Good', 'Poor Phys

In [9]:
# Create a dictionary with the lists
variable_info = {
    'Label': labels,
    'Variable Name': sas_variable_names,
    'Question': questions,
    'Section': sections
}

In [10]:
var_info_df = pd.DataFrame(variable_info)

In [11]:
var_info_df.head()

Unnamed: 0,Label,Variable Name,Question,Section
0,State FIPS Code,_STATE,State FIPS Code,Record Identification
1,File Month,FMONTH,File Month,Record Identification
2,Interview Date,IDATE,Interview Date,Record Identification
3,Interview Month,IMONTH,Interview Month,Record Identification
4,Interview Day,IDAY,Interview Day,Record Identification


## Read, Overview data

In [46]:
#data = pd.read_parquet(PROCESSED_FILE_PATH_PARQUET)
data = pd.read_csv(PROCESSED_FILE_PATH_CSV)

In [6]:
# Create a dictionary with SAS variable names as keys and corresponding label/question as values
sas_variable_dict = dict(zip(sas_variable_names, zip(labels, questions)))

print(f'Number of variables in dictionary: {len(sas_variable_dict)}')

no_existing_cols = 0
# Loop through DataFrame columns
for col in data.columns:
    if col in sas_variable_dict:
        no_existing_cols += 1
        sas_variable_label, sas_variable_question = sas_variable_dict[col]
        print(f"{col}, {sas_variable_label}, {sas_variable_question}")

print(f'Number of existing columns in dataframe: {no_existing_cols}')

Number of variables in dictionary: 324
_STATE, State FIPS Code, State FIPS Code
FMONTH, File Month, File Month
IDATE, Interview Date, Interview Date
IMONTH, Interview Month, Interview Month
IDAY, Interview Day, Interview Day
IYEAR, Interview Year, Interview Year
DISPCODE, Final Disposition, Final Disposition
SEQNO, Annual Sequence Number, Annual Sequence Number
_PSU, Primary Sampling Unit, Primary Sampling Unit (Equal to Annual Sequence Number)
CTELENM1, Correct telephone number?, Is this     (phone number)     ?
PVTRESD1, Private Residence?, Is this a private residence?   [READ ONL Y IF NECESSAR Y: � By private residence, we mean someplace like a house or apartment. � ]
COLGHOUS, Do you live in college housing?, Do you live in college housing?
STATERE1, Resident of State, Do you currently live in  ____(state)____?
CELPHON1, Cellular Telephone, Is this a cell telephone?
LADULT1, Are you 18 years of age or older?, Are you 18 years of age or older?
COLGSEX1, Are you male or female?, Are 

In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Columns: 328 entries, _STATE to _AIDTST4
dtypes: float64(323), object(5)
memory usage: 1.1+ GB


The dataset has:
- 445132 samples/instances
- 328 variables
- 323 numerical variables. Categorical variables can be converted to numerical variables, e.g Yes=1, No=0
- 5 variables as strings

In [25]:
data.head()

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_SMOKGRP,_LCSREC,DRNKANY6,DROCDY4_,_RFBING6,_DRNKWK2,_RFDRHV8,_FLSHOT7,_PNEUMO3,_AIDTST4
0,1.0,1.0,2032022,2,3,2022,1100.0,2022000001,2022000000.0,1.0,...,4.0,,2.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,2.0,2.0
1,1.0,1.0,2042022,2,4,2022,1100.0,2022000002,2022000000.0,1.0,...,4.0,,2.0,5.397605e-79,1.0,5.397605e-79,1.0,2.0,2.0,2.0
2,1.0,1.0,2022022,2,2,2022,1100.0,2022000003,2022000000.0,1.0,...,4.0,,2.0,5.397605e-79,1.0,5.397605e-79,1.0,,,2.0
3,1.0,1.0,2032022,2,3,2022,1100.0,2022000004,2022000000.0,1.0,...,3.0,2.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,9.0,9.0,2.0
4,1.0,1.0,2022022,2,2,2022,1100.0,2022000005,2022000000.0,1.0,...,4.0,,1.0,10.0,1.0,140.0,1.0,,,2.0


## Extracting the list of variables

In [26]:
# These variable names are the ones used in the Kaggle dataset. 
"""
NEW_VAR_NAMES = [
    "State",
    "Sex",
    "General_Health",
    "Physical_Health_Days",
    "Mental_Health_Days",
    "Last_Checkup_Time",
    "Physical_Activities",
    "Sleep_Hours",
    "Removed_Teeth",
    "Had_Heart_Attack",
    "Had_Angina",
    "Had_Stroke",
    "Had_Asthma",
    "Had_Skin_Cancer",
    "Had_COPD",
    "Had_Depressive_Disorder",
    "Had_Kidney_Disease",
    "Had_Arthritis",
    "Had_Diabetes",
    "Deaf_Or_Hard_Of_Hearing",
    "Blind_Or_Vision_Difficulty",
    "Difficulty_Concentrating",
    "Difficulty_Walking",
    "Difficulty_Dressing_Bathing",
    "Difficulty_Errands",
    "Smoker_Status",
    "E_Cigarette_Usage",
    "Chest_Scan",
    "Race_Ethnicity_Category",
    "Age_Category",
    "Height_In_Meters",
    "Weight_In_Kilograms",
    "BMI",
    "Alcohol_Drinkers",
    "HIV_Testing",
    "Flu_Vax_Last_12",
    "Pneumo_Vax_Ever",
    "Tetanus_Last_10Tdap",
    "High_Risk_Last_Year",
    "Covid_Pos"
]
"""

In [47]:
# These variable names are identical as the ones above but, instead of using 'Had_Heart_Attck', we use 'Had_CHD_MI'.
NEW_VAR_NAMES = [
    "State",
    "Sex",
    "General_Health",
    "Physical_Health_Days",
    "Mental_Health_Days",
    "Last_Checkup_Time",
    "Physical_Activities",
    "Sleep_Hours",
    "Removed_Teeth",
    "Had_CHD_MI",
    "Had_Stroke",
    "Had_Asthma",
    "Had_Skin_Cancer",
    "Had_COPD",
    "Had_Depressive_Disorder",
    "Had_Kidney_Disease",
    "Had_Arthritis",
    "Had_Diabetes",
    "Deaf_Or_Hard_Of_Hearing",
    "Blind_Or_Vision_Difficulty",
    "Difficulty_Concentrating",
    "Difficulty_Walking",
    "Difficulty_Dressing_Bathing",
    "Difficulty_Errands",
    "Smoker_Status",
    "E_Cigarette_Usage",
    "Chest_Scan",
    "Race_Ethnicity_Category",
    "Age_Category",
    "Height_In_Meters",
    "Weight_In_Kilograms",
    "BMI",
    "Alcohol_Drinkers",
    "HIV_Testing",
    "Flu_Vax_Last_12",
    "Pneumo_Vax_Ever",
    "Tetanus_Last_10Tdap",
    "High_Risk_Last_Year",
    "Covid_Pos"
]

In [48]:
variable_list_df = pd.read_csv(VARIABLE_LIST_PATH, sep=' - ', header=None, names=['Variable', 'Description'])

  variable_list_df = pd.read_csv(VARIABLE_LIST_PATH, sep=' - ', header=None, names=['Variable', 'Description'])


In [49]:
variable_list_df

Unnamed: 0,Variable,Description
0,_STATE,State FIPS Code
1,SEXVAR,Sex of Respondent
2,GENHLTH,Would you say that in general your health is:
3,PHYSHLTH,"Now thinking about your physical health, which..."
4,MENTHLTH,"Now thinking about your mental health, which i..."
5,CHECKUP1,About how long has it been since you last visi...
6,EXERANY2,"During the past month, other than your regular..."
7,SLEPTIM1,"On average, how many hours of sleep do you get..."
8,RMVTETH4,Not including teeth lost for injury or orthodo...
9,CVDSTRK3,(Ever told) (you had) a stroke.


In [50]:
variable_list = variable_list_df['Variable'].to_numpy()

In [51]:
variable_list

array(['_STATE', 'SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'CHECKUP1',
       'EXERANY2', 'SLEPTIM1', 'RMVTETH4', 'CVDSTRK3', 'ASTHMA3',
       'CHCSCNC1', 'CHCCOPD3', 'ADDEPEV3', 'CHCKDNY2', 'HAVARTH4',
       'DIABETE4', 'DEAF', 'BLIND', 'DECIDE', 'DIFFWALK', 'DIFFDRES',
       'DIFFALON', '_SMOKER3', 'ECIGNOW2', 'LCSCTSC1', '_RACEGR4',
       '_AGEG5YR', 'HTM4', 'WTKG3', '_BMI5', '_MICHD', 'DRNKANY6',
       '_AIDTST4', 'FLUSHOT7', 'PNEUVAC4', 'TETANUS1', 'HIVRISK5',
       'COVIDPOS'], dtype=object)

In [52]:
# Check if all column names are present in the DataFrame
if all(col in data.columns for col in variable_list):
    print("All column names exist in the DataFrame.")
else:
    # Identify which column names are missing
    missing_columns = [col for col in variable_list if col not in data.columns]
    print(f"The following column(s) are missing: {missing_columns}")

All column names exist in the DataFrame.


In [53]:
data = data[variable_list]

In [56]:
data.shape

(445132, 39)

In [57]:
data.dropna().shape

(364044, 39)

In [58]:
data.columns = NEW_VAR_NAMES

In [61]:
data['Had'].value_counts()

Had_Diabetes
2.0    385539
1.0     38946
7.0      1246
9.0       757
Name: count, dtype: int64

In [21]:
STATE = {
    1: "Alabama",
    2: "Alaska",
    4: "Arizona",
    5: "Arkansas",
    6: "California",
    8: "Colorado",
    9: "Connecticut",
    10: "Delaware",
    11: "District of Columbia",
    12: "Florida",
    13: "Georgia",
    15: "Hawaii",
    16: "Idaho",
    17: "Illinois",
    18: "Indiana",
    19: "Iowa",
    20: "Kansas",
    21: "Kentucky",
    22: "Louisiana",
    23: "Maine",
    24: "Maryland",
    25: "Massachusetts",
    26: "Michigan",
    27: "Minnesota",
    28: "Mississippi",
    29: "Missouri",
    30: "Montana",
    31: "Nebraska",
    32: "Nevada",
    33: "New Hampshire",
    34: "New Jersey",
    35: "New Mexico",
    36: "New York",
    37: "North Carolina",
    38: "North Dakota",
    39: "Ohio",
    40: "Oklahoma",
    41: "Oregon",
    42: "Pennsylvania",
    44: "Rhode Island",
    45: "South Carolina",
    46: "South Dakota",
    47: "Tennessee",
    48: "Texas",
    49: "Utah",
    50: "Vermont",
    51: "Virginia",
    53: "Washington",
    54: "West Virginia",
    55: "Wisconsin",
    56: "Wyoming",
    66: "Guam",
    72: "Puerto Rico",
    78: "Virgin Islands"
}

SEX = {1: 'Male', 2: 'Female'}

GEN_HEALTH = {
    1: "Excellent",
    2: "Very good",
    3: "Good",
    4: "Fair",
    5: "Poor"
}

PHYS_MEN_HEALTH = {77: np.nan,
               88: 0,
               99: np.nan
                  }

LAST_CHECKUP = {
    1: "Within past year (anytime less than 12 months ago)",
    2: "Within past 2 years (1 year but less than 2 years ago)",
    3: "Within past 5 years (2 years but less than 5 years ago)",
    4: "5 or more years ago"
}

YES_NO_QUESTIONS = {1: 'Yes', 2: 'No'}

SLEEP_TIME = lambda x: np.where(x > 24, np.nan, x)

TEETH_REMOVED = {
    1: "1 to 5",
    2: "6 or more, but not all",
    3: "All",
    8: "None of them"
}

DIABETES = {
    1: "Yes",
    2: "Yes, but only during pregnancy (female)",
    3: "No",
    4: "No, pre-diabetes or borderline diabetes",
}

SMOKER_STATUS = {
    1: "Current smoker - now smokes every day",
    2: "Current smoker - now smokes some days",
    3: "Former smoker",
    4: "Never smoked"
}

ECIGARETTES = {
    1: "Never used e-cigarettes in my entire life",
    2: "Use them every day",
    3: "Use them some days",
    4: "Not at all (right now)"
}

RACE = {
    1: "White only, Non-Hispanic",
    2: "Black only, Non-Hispanic",
    3: "Other race only, Non-Hispanic",
    4: "Multiracial, Non-Hispanic",
    5: "Hispanic"
}

AGE_CATEGORY = {
    1: "Age 18 to 24",
    2: "Age 25 to 29",
    3: "Age 30 to 34",
    4: "Age 35 to 39",
    5: "Age 40 to 44",
    6: "Age 45 to 49",
    7: "Age 50 to 54",
    8: "Age 55 to 59",
    9: "Age 60 to 64",
    10: "Age 65 to 69",
    11: "Age 70 to 74",
    12: "Age 75 to 79",
    13: "Age 80 or older"
}

TETANUS = {
    1: "Yes, received Tdap",
    2: "Yes, received tetanus shot, but not Tdap",
    3: "Yes, received tetanus shot but not sure what type",
    4: "No, did not receive any tetanus shot in the past 10 years",
}

COVID = {
    1: "Yes",
    2: "No",
    3: "Tested positive using home test without a health professional"
}

In [22]:
data_copy = data.copy()

In [23]:
data_copy['State'] = data_copy['State'].map(STATE)
data_copy['Sex'] = data_copy['Sex'].map(SEX)
data_copy['General_Health'] = data_copy['General_Health'].map(GEN_HEALTH)
data_copy['Physical_Health_Days'] = data_copy['Physical_Health_Days'].replace(PHYS_MEN_HEALTH)
data_copy['Mental_Health_Days'] = data_copy['Mental_Health_Days'].replace(PHYS_MEN_HEALTH)
data_copy['Last_Checkup_Time'] = data_copy['Last_Checkup_Time'].map(LAST_CHECKUP)
data_copy['Physical_Activities'] = data_copy['Physical_Activities'].map(YES_NO_QUESTIONS)
data_copy['Sleep_Hours'] = data_copy['Sleep_Hours'].apply(SLEEP_TIME)
data_copy['Removed_Teeth'] = data_copy['Removed_Teeth'].map(TEETH_REMOVED)
#data_copy['Had_Heart_Attack'] = data_copy['Had_Heart_Attack'].map(YES_NO_QUESTIONS)
data_copy['Had_CHD_MI'] = data_copy['Had_CHD_MI'].map(YES_NO_QUESTIONS)
data_copy['Had_Angina'] = data_copy['Had_Angina'].map(YES_NO_QUESTIONS)
data_copy['Had_Stroke'] = data_copy['Had_Stroke'].map(YES_NO_QUESTIONS)
data_copy['Had_Asthma'] = data_copy['Had_Asthma'].map(YES_NO_QUESTIONS)
data_copy['Had_Skin_Cancer'] = data_copy['Had_Skin_Cancer'].map(YES_NO_QUESTIONS)
data_copy['Had_COPD'] = data_copy['Had_COPD'].map(YES_NO_QUESTIONS)
data_copy['Had_Depressive_Disorder'] = data_copy['Had_Depressive_Disorder'].map(YES_NO_QUESTIONS)
data_copy['Had_Kidney_Disease'] = data_copy['Had_Kidney_Disease'].map(YES_NO_QUESTIONS)
data_copy['Had_Arthritis'] = data_copy['Had_Arthritis'].map(YES_NO_QUESTIONS)
data_copy['Had_Diabetes'] = data_copy['Had_Diabetes'].map(DIABETES)
data_copy['Deaf_Or_Hard_Of_Hearing'] = data_copy['Deaf_Or_Hard_Of_Hearing'].map(YES_NO_QUESTIONS)
data_copy['Blind_Or_Vision_Difficulty'] = data_copy['Blind_Or_Vision_Difficulty'].map(YES_NO_QUESTIONS)
data_copy['Difficulty_Concentrating'] = data_copy['Difficulty_Concentrating'].map(YES_NO_QUESTIONS)
data_copy['Difficulty_Walking'] = data_copy['Difficulty_Walking'].map(YES_NO_QUESTIONS)
data_copy['Difficulty_Dressing_Bathing'] = data_copy['Difficulty_Dressing_Bathing'].map(YES_NO_QUESTIONS)
data_copy['Difficulty_Errands'] = data_copy['Difficulty_Errands'].map(YES_NO_QUESTIONS)
data_copy['Smoker_Status'] = data_copy['Smoker_Status'].map(SMOKER_STATUS)
data_copy['E_Cigarette_Usage'] = data_copy['E_Cigarette_Usage'].map(ECIGARETTES)
data_copy['Chest_Scan'] = data_copy['Chest_Scan'].map(YES_NO_QUESTIONS)
data_copy['Race_Ethnicity_Category'] = data_copy['Race_Ethnicity_Category'].map(RACE)
data_copy['Age_Category'] = data_copy['Age_Category'].map(AGE_CATEGORY)
data_copy['Height_In_Meters'] = data_copy['Height_In_Meters'] / 100
data_copy['Weight_In_Kilograms'] = data_copy['Weight_In_Kilograms'] / 100
data_copy['BMI'] = data_copy['BMI'] / 100
data_copy['Alcohol_Drinkers'] = data_copy['Alcohol_Drinkers'].map(YES_NO_QUESTIONS)
data_copy['HIV_Testing'] = data_copy['HIV_Testing'].map(YES_NO_QUESTIONS)
data_copy['Flu_Vax_Last_12'] = data_copy['Flu_Vax_Last_12'].map(YES_NO_QUESTIONS)
data_copy['Pneumo_Vax_Ever'] = data_copy['Pneumo_Vax_Ever'].map(YES_NO_QUESTIONS)
data_copy['Tetanus_Last_10Tdap'] = data_copy['Tetanus_Last_10Tdap'].map(TETANUS)
data_copy['High_Risk_Last_Year'] = data_copy['High_Risk_Last_Year'].map(YES_NO_QUESTIONS)
data_copy['Covid_Pos'] = data_copy['Covid_Pos'].map(COVID)

In [28]:
data_copy.shape

(445132, 40)

In [36]:
data.shape

(445132, 40)

In [37]:
# Create a DataFrame to display side by side
comparison_df = pd.DataFrame({
    'Original Data': data.isna().sum(),
    'New Data': data_copy.isna().sum()
})

In [38]:
# Display the comparison DataFrame
print(comparison_df)

                             Original Data  New Data
State                                    0         0
Sex                                      0         0
General_Health                           3      1198
Physical_Health_Days                     5     10927
Mental_Health_Days                       3      9067
Last_Checkup_Time                        3      8308
Physical_Activities                      2      1093
Sleep_Hours                              3      5453
Removed_Teeth                         1363     11360
Had_CHD_MI                               2      4405
Had_Angina                               2      1557
Had_Stroke                               2      1773
Had_Asthma                               2      3143
Had_Skin_Cancer                          2      2219
Had_COPD                                 7      2812
Had_Depressive_Disorder                  2      1926
Had_Kidney_Disease                       3      2633
Had_Arthritis                            3    

In [34]:
data_copy.dropna().shape[0]

0

In [35]:
data.dropna().shape[0]

364044

In [24]:
def describe_df(df: pd.DataFrame):
    print(f"The dataset contains {df.shape[1]} columns and {len(df)} rows")

    for col in df.columns:
        col_dtype = df[col].dtype
        
        print(f"\nColumn: {col} ({col_dtype})")
        if col_dtype == 'object':
            print(f"--- Percentage of NaNs: {df[col].isna().sum() / len(df[col]) * 100}")
            print(f"--- Unique values:\n {df[col].unique()}")
        else:
            print(f"--- Summary statistics:\n {df[col].describe()}")

In [25]:
describe_df(data_copy)

The dataset contains 40 columns and 445132 rows

Column: State (object)
--- Percentage of NaNs: 0.0
--- Unique values:
 ['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'
 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'
 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'
 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Guam' 'Puerto Rico'
 'Virgin Islands']

Column: Sex (object)
--- Percentage of NaNs: 0.0
--- Unique values:
 ['Female' 'Male']

Column: General_Health (object)
--- Percentage of NaNs: 0.26913365024307395
--- Unique values:
 ['Very good' 'Excellent' 'Fair' 'Poor' 'Good' nan]


In [27]:
data_copy.dropna().to_csv(FINAL_FILE, index=False)