# Qiita Skin Metadata Normalization 
#### by Rebecca Hu
rows:  22772 cols:  1094
### Added column headers include:
qiita_host_age - normalized age in years
<br> qiita_host_age_units - 'years' repeated in all rows
<br> qiita_host_sex - normalized sex as 'male' or 'female'
<br> qiita_host_ethnicity_white - True if the host identifies as white, False or Nan otherwise
<br> qiita_host_ethnicity_black_or_african_american - True if the host identifies as black or African American, False or Nan otherwise
<br> qiita_host_ethnicity_hispanic_or_latino - True if the host identifies as hispanic or latino, False or Nan otherwise
<br> qiita_host_ethnicity_asian - True if the host identifies as asian, False or Nan otherwise
<br> qiita_host_ethnicity_american_indian_or_alaska_native - True if the host identifies as Native American, False or Nan otherwise
<br> qiita_host_ethnicity_native_hawaiian_or_other_pacific_islander - True if the host identifies as native Hawaiian or Pacific Islander, False or Nan otherwise
<br> qiita_host_ethnicity_other - True if host specified 'other' as ethnicity/race, False or Nan otherwise
<br> qiita_host_ethnicity_multiracial - True if host specified 'multi' or equivalent as ethnicity/race, False or Nan otherwise
<br> qiita_host_multiracial - True if host specified multiple ethnicities/races, False or Nan otherwise
<br> qiita_host_ethnicity_combined - string values, most specific host race/ethnicity e.g. 'Japanese-Caucasian'
<br> qiita_host_weight - normalized & cleaned weight in kg
<br> qiita_host_weight_units - 'kg' repeated in all rows
<br> qiita_host_height - normalized and cleaned height in cm
<br> qiita_host_height_units - 'cm' repeated in all rows
<br> qiita_host_bmi - float values, normalized host bmi
<br> qiita_host_healthy_weight - True if host falls under healthy weight category, False or Nan otherwise
<br> qiita_host_allergy - True if host has any allergies in allergy dictionaries, False or Nan otherwise
<br> qiita_host_cancer - True if host has any cancer in cancer dictionary, False or Nan otherwise
<br> qiita_host_ibd - True if host has inflammatory bowel disease, False or Nan otherwise
<br> qiita_host_ibd_type - 'cd' if host has Crohn's Disease, 'uc' if host has ulcerative colitis, 'not specified' if host has ibd but does not specify what kind, 'not applicable' if host does not have ibd
<br> qiita_host_diabetes - True if host has diabetes, False otherwise
<br> qiita_host_diabetes_subtype - 'type1' if host has Type I diabetes, 'type2' if host has Type II diabetes, 'no type' if host has diabetes but does not specify what kind, 'not applicable' if host does not have diabetes
<br> qiita_host_disease - True if host has a miscellaneous disease in disease dictionary, False otherwise
<br> qiita_host_medication - True if host reported taking any medications, False otherwise
<br> qiita_host_healthy - False if host has any disease, allergy, or uses medication, True otherwise
#### Static Columns
qiita_sample_type - 'skin' repeated in all rows
<br> qiita_empo_1 - 'host-associated' repeated in all rows
<br> qiita_empo_2 - 'animal' repeated in all rows
<br> qiita_empo_3 - 'animal surface' repeated in all rows
<br> qiita_host_scientific_name - 'Homo sapiens' repeated in all rows
<br> qiita_host_taxid - 9606 repeated in all rows
<br> qiita_host_common_name - 'human' repeated in all rows
<br> qiita_env_feature - 'human-associated habitat' repeated in all rows

In [1]:
import pandas as pd
import numpy as np
import math

# Import CSV

In [2]:
#normalizes all true and false values to bool and all null values to nan
def clean_csv(file):
    '''
    load a tab-separated metadata table, turning yes/no style answers into
    booleans and the known "missing" placeholders into NaN

    param: path (or file-like object) of the tab-delimited file to read
    return: DataFrame whose column names come from the first line of the file
    '''
    truthy = ['true','yes','y','Yes','Y','YES', 'Self-diagnosed',
              'Diagnosed by a medical professional (doctor, physician assistant)']
    falsy = ['false','no','n','No','N','NO', 'I do not have this condition']
    missing = ['Unknown','Unspecified','no_data','not applicable', 'Not applicable',
               'Missing: not collected', 'Missing: not provided',
               'Missing: Not recorded', 'Missing: Restricted access',
               'Missing', 'Not provided', 'unspecified']

    read_options = dict(
        header=0,
        sep='\t',
        decimal=',',  # numbers in the file use a comma as the decimal mark
        true_values=truthy,
        false_values=falsy,
        na_values=missing,
        low_memory=False,
    )
    return pd.read_csv(file, **read_options)

In [3]:
#Read Skin Metadata file
# Load the skin metadata table, then record and report its dimensions
df = clean_csv('20190223_all_human_skin_metadata.txt')
num_row, num_col = df.shape
print('rows: ', num_row, 'cols: ', num_col)

rows:  7386 cols:  979


# Normalize Age

In [8]:
#function to normalize ages and compile them into a list
def normalize_age(df):
    '''
    normalize the age columns into years and compile them into a Series

    The 'age' column takes priority and is converted with the unit found in
    'age_unit' or 'age_units' (years, weeks, days or minutes). When 'age' is
    missing the function falls back, in order, to 'age_baby_days' (divided by
    365), 'age_corrected' and 'age_in_years' (both used as-is). An 'age' with
    an unrecognized unit yields NaN without consulting the fallback columns.

    param: DataFrame containing all different age columns
    return: a Series (indexed like df) containing all normalized age values,
            rounded to 3 decimals, NaN where no age could be derived
    '''
    def present(value):
        # True when the cell holds a usable number
        return not math.isnan(float(value))

    # divisor that converts each supported unit to years; checked in this order
    unit_divisors = {'years': 1, 'weeks': 52, 'days': 365, 'minutes': 525600}
    columns = ['age', 'age_unit', 'age_units',
               'age_baby_days', 'age_corrected', 'age_in_years']

    age_list = []
    # walk the needed columns in lockstep; the row count comes from df itself,
    # not from a module-level counter
    for age, unit_a, unit_b, baby_days, corrected, in_years in zip(
            *(df[name].tolist() for name in columns)):
        if present(age):
            for unit, divisor in unit_divisors.items():
                if unit_a == unit or unit_b == unit:
                    age_list.append(round(float(age) / divisor, 3))
                    break
            else:
                # age given but unit missing/unknown
                age_list.append(float('Nan'))
        elif present(baby_days):
            age_list.append(round(float(baby_days) / 365, 3))
        elif present(corrected):
            age_list.append(round(float(corrected), 3))
        elif present(in_years):
            age_list.append(round(float(in_years), 3))
        else:
            age_list.append(float('Nan'))

    return pd.Series(age_list, index=df.index)

In [9]:
# qiita_host_age column: store the Series returned by normalize_age on df
df['qiita_host_age']= normalize_age(df)

In [10]:
#qiita_host_age_units column
# Unit label for qiita_host_age, repeated num_row times
age_units = ['years']
age_units_list = age_units * num_row

df['qiita_host_age_units'] = pd.Series(age_units_list)

The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.


# Normalize Sex

In [11]:
#function to normalize sex
def normalize_sex(df):
    '''
    normalizes sex and gender into 'male' or 'female'

    The 'gender' column is preferred; 'sex' is used only when 'gender' is not
    exactly 'male' or 'female'. Any other value in both columns gives NaN.

    param: DataFrame containing the 'gender' and 'sex' columns
    return: pd.Series (indexed like df) containing all normalized sex
    '''
    accepted = ('male', 'female')
    sex_list = []
    # iterate the two columns directly; the row count comes from df itself
    for gender, sex in zip(df['gender'].tolist(), df['sex'].tolist()):
        if gender in accepted:
            sex_list.append(gender)
        elif sex in accepted:
            sex_list.append(sex)
        else:
            sex_list.append(float('Nan'))
    return pd.Series(sex_list, index=df.index)

In [12]:
# qiita_host_sex column: store the Series returned by normalize_sex on df
df['qiita_host_sex'] = normalize_sex(df)

# Normalize Ethnicity

In [13]:
# Maps each normalized ethnicity group to the substrings that identify it
# in the raw race/ethnicity columns (matching is case-sensitive).
ethnicity_dict = {
    'white': [
        'White', 'white' ,'Caucasian', 'caucasian',
    ],
    'black or african american': [
        'African', 'african', 'Black-African American',
    ],
    'hispanic or latino': [
        'Hispanic', 'hispanic', 'Latino', 'latino', 'Mestizo', 'Yanomami',
    ],
    'asian': [
        'Asian', 'asian', 'japanese',
    ],
    'american indian or alaska native': [
        'Amerindian',
    ],
    'native hawaiian or other pacific islander': [
        'pacific.islander', 'pacific islander', 'Pacific', 'hawaiian',
    ],
    'other': [
        'other', 'Other',
    ],
    'multi': [
        'multi', 'Multi', 'more', 'More',
    ],
}

In [14]:
#function to normalize ethnicity
def normalize_ethnicity(df, ethnicity, keyword_map=None):
    '''
    normalizes a particular ethnicity based on a keyword dictionary

    For every row the first column holding a string among 'race',
    'raceethnicity', 'ethnicity', 'ethnic_group' and 'ethnicgroup' is used.
    The row is True when that string contains any keyword listed for the
    requested ethnicity, and False otherwise (including when no column holds
    a string).

    param:
    - a DataFrame containing all ethnicity/race data
    - a string ethnicity to normalize ('ethnicity' must be a key in the
      keyword dictionary)
    - keyword_map: optional dict mapping ethnicity -> list of substrings;
      defaults to the module-level ethnicity_dict

    return: pd.Series (indexed like df) containing all normalized, boolean
    values for the given ethnicity
    '''
    if keyword_map is None:
        keyword_map = ethnicity_dict
    keywords = keyword_map[ethnicity]

    # priority order of the raw columns
    source_columns = ['race', 'raceethnicity', 'ethnicity',
                      'ethnic_group', 'ethnicgroup']

    ethnicity_list = []
    for reported in zip(*(df[name].tolist() for name in source_columns)):
        curr = next((value for value in reported if isinstance(value, str)),
                    None)
        if curr is None:
            ethnicity_list.append(False)
            continue

        # 'caucasian' contains the substring 'asian', so rewrite every
        # occurrence (not only exact matches) before keyword matching;
        # otherwise compound values such as 'Hispanic-Caucasian' would be
        # flagged as asian
        curr = curr.replace('Caucasian', 'white').replace('caucasian', 'white')

        ethnicity_list.append(any(keyword in curr for keyword in keywords))

    return pd.Series(ethnicity_list, index=df.index)

In [15]:
#qiita_host_ethnicity columns 
# One boolean column per ethnicity group: (column suffix, ethnicity_dict key)
for column_suffix, group in [
    ('white', 'white'),
    ('black_or_african_american', 'black or african american'),
    ('hispanic_or_latino', 'hispanic or latino'),
    ('asian', 'asian'),
    ('american_indian_or_alaska_native', 'american indian or alaska native'),
    ('native_hawaiian_or_other_pacific_islander', 'native hawaiian or other pacific islander'),
    ('other', 'other'),
    ('multiracial', 'multi'),
]:
    df['qiita_host_ethnicity_' + column_suffix] = normalize_ethnicity(df, group)

In [16]:
#function to determine multiracial 
def multiracial(df):
    '''
    looks at the previously created qiita_host_ethnicity_[blank] columns to
    determine whether the host has reported multiple ethnicities/races

    A row is True when at least two of the listed columns equal True. The
    'qiita_host_ethnicity_other' column is not part of the count.

    param: DataFrame containing the qiita_host_ethnicity_* columns
    return: pd.Series (indexed like df) containing boolean values of whether
    the host has multiple races/ethnicities reported
    '''
    ethnicity_columns = [
        'qiita_host_ethnicity_white',
        'qiita_host_ethnicity_black_or_african_american',
        'qiita_host_ethnicity_hispanic_or_latino',
        'qiita_host_ethnicity_asian',
        'qiita_host_ethnicity_american_indian_or_alaska_native',
        'qiita_host_ethnicity_native_hawaiian_or_other_pacific_islander',
        'qiita_host_ethnicity_multiracial',
    ]
    # count the True flags per row in one pass; NaN and False compare unequal
    # to True and so contribute nothing
    true_counts = (df[ethnicity_columns] == True).sum(axis=1)
    return true_counts >= 2

In [17]:
# qiita_host_multiracial column: store the Series returned by multiracial on df
df['qiita_host_multiracial'] = multiracial(df)

In [18]:
#function to clean ethnicity imputs
def clean_ethnicity_inputs(entry):
    '''
    cleans the strings in ethnicity inputs

    param: a string entry in an ethnicity column
    returns: a normalized, lower-case string
    '''
    # split run-together words (e.g. 'AfricanAmerican') while the capital
    # letter is still visible, then lower-case everything
    entry = entry.replace('nA', 'n A')
    entry = entry.lower()

    # applied in this order: 'half.' must go before '.' becomes a space
    replacements = (
        ('half.', ''),
        ('.', ' '),
        ('black-', ''),
        ('caucasian', 'white'),
        ('mestizohispano', 'mestizo'),
        ('amerindian', 'american indian'),
    )
    for old, new in replacements:
        entry = entry.replace(old, new)
    return entry


#functions to normalize ethnicity
def combined_race(df):
    '''
    creates a column that has the reported host ethnicity/race, cleaned but
    not normalized at all
    eg.) black, japanese, mestizo

    For every row the first column holding a string among 'race',
    'raceethnicity', 'ethnicity', 'ethnic_group' and 'ethnicgroup' is cleaned
    with clean_ethnicity_inputs; rows without any string give NaN.

    param: dataframe
    returns: a pd.Series (indexed like df) with the merged, but not curated,
    ethnicity/race string for each host
    '''
    source_columns = ['race', 'raceethnicity', 'ethnicity',
                      'ethnic_group', 'ethnicgroup']
    ethnicity_list = []
    # iterate the columns directly; the row count comes from df itself
    for reported in zip(*(df[name].tolist() for name in source_columns)):
        first_text = next(
            (value for value in reported if isinstance(value, str)), None)
        if first_text is None:
            ethnicity_list.append(float('Nan'))
        else:
            ethnicity_list.append(clean_ethnicity_inputs(first_text))
    return pd.Series(ethnicity_list, index=df.index)

In [19]:
# qiita_host_ethnicity_combined column: store the Series returned by combined_race on df
df['qiita_host_ethnicity_combined'] = combined_race(df)

# Normalize Weight

In [20]:
#function to correct weight in kg
# Everyone should fall between 0.5kg and 200kg, adults should be at least 20kg
def correct_weight(row, weight, age=None):
    '''
    cleans & normalizes weight inputs with specified bounds to kilograms

    Weights outside 0.5-200 kg are rejected, and hosts aged 18 or more must
    weigh at least 20 kg.

    param:
    - row: positional row of the host, used only to look the age up when
      'age' is not supplied
    - weight: the weight input
    - age: optional host age in years; when omitted it is read from the
      module-level df's 'qiita_host_age' column (original behaviour)
    return: the weight rounded to 3 decimals, or nan if the value is out of
    bounds
    '''
    if age is None:
        age = df.iloc[row]['qiita_host_age']
    if weight < 0.5 or weight > 200:
        return float('Nan')
    # a NaN age compares False here, so hosts of unknown age only get the
    # general bounds above
    if age >= 18 and weight < 20:
        return float('Nan')
    return round(float(weight), 3)

#function to normalize weight 0.5kg to 200 kg
def normalize_weight(df):
    '''
    imputes cleaned weight values into a pd.Series

    The first non-missing value among 'weight', 'weight_kg' and 'tot_mass'
    is passed through correct_weight together with the row's
    'qiita_host_age'; a rejected value is not replaced by a later column.

    param: DataFrame
    return: pd.Series (indexed like df) of merged and cleaned values
    '''
    weight_columns = ['weight', 'weight_kg', 'tot_mass']
    weight_list = []
    rows = zip(df['qiita_host_age'].tolist(),
               *(df[name].tolist() for name in weight_columns))
    for row, (age, *candidates) in enumerate(rows):
        for value in candidates:
            value = float(value)
            if not math.isnan(value):
                # age comes from the frame being processed, not a global
                weight_list.append(correct_weight(row, value, age))
                break
        else:
            weight_list.append(float('Nan'))
    return pd.Series(weight_list, index=df.index)

In [21]:
# qiita_host_weight column: store the Series returned by normalize_weight on df
df['qiita_host_weight'] = normalize_weight(df)
#df['qiita_host_weight'].isnull.value_counts()

In [22]:
#qiita_host_weight_units column
# Unit label for qiita_host_weight, repeated num_row times
weight_units = ['kg']
weight_units_list = weight_units * num_row

df['qiita_host_weight_units'] = pd.Series(weight_units_list)

# Normalize Height

In [23]:
#function to correct height in cm
# Everyone should fall between 48cm and 210cm, adults should be at least 105cm
def correct_height(row, height, age=None):
    '''
    cleans & normalizes height inputs with specified bounds to centimeters

    Heights outside 48-210 cm are rejected, and hosts aged 18 or more must be
    at least 105 cm tall.

    param:
    - row: positional row of the host, used only to look the age up when
      'age' is not supplied
    - height: the height input in cm
    - age: optional host age in years; when omitted it is read from the
      module-level df's 'qiita_host_age' column (original behaviour)
    return: the height rounded to 3 decimals, or nan if the value is out of
    bounds
    '''
    if age is None:
        age = df.iloc[row]['qiita_host_age']
    if height < 48 or height > 210:
        return float('Nan')
    # adult minimum is 105 cm; the previous threshold of 20 was copied from
    # the weight check and could never trigger after the 48 cm bound above
    if age >= 18 and height < 105:
        return float('Nan')
    return round(float(height), 3)

#function to normalize height
def normalize_height(df):
    '''
    imputes cleaned height values into a pd.Series

    The first non-missing value among 'height' (cm), 'height_cm' (cm) and
    'height_m' (meters, multiplied by 100) is passed through correct_height
    together with the row's 'qiita_host_age'; a rejected value is not
    replaced by a later column.

    param: DataFrame
    return: pd.Series (indexed like df) of merged and cleaned values
    '''
    # (column, factor converting the column's values to cm)
    height_sources = [
        ('height', 1),       #all from study 11052
        ('height_cm', 1),    #all from AGP 10317
        ('height_m', 100),   #all from 2010
    ]
    #elif not math.isnan(float(df.iloc[row]['height_or_length'])): #all from 2024 & 959
        #height = correct_height(row, float(df.iloc[row]['height_or_length']) * 100)
        #height_list.append(height)

    height_list = []
    rows = zip(df['qiita_host_age'].tolist(),
               *(df[name].tolist() for name, _ in height_sources))
    for row, (age, *candidates) in enumerate(rows):
        for value, (_, to_cm) in zip(candidates, height_sources):
            value = float(value)
            if not math.isnan(value):
                # age comes from the frame being processed, not a global
                height_list.append(correct_height(row, value * to_cm, age))
                break
        else:
            height_list.append(float('Nan'))
    return pd.Series(height_list, index=df.index)

In [24]:
# qiita_host_height column: store the Series returned by normalize_height on df
df['qiita_host_height'] = normalize_height(df)
#df['qiita_host_height'].value_counts()

In [25]:
#qiita_host_height_units column
# Unit label for qiita_host_height, repeated num_row times
height_units = ['cm']
height_units_list = height_units * num_row

df['qiita_host_height_units'] = pd.Series(height_units_list)

# Normalize BMI

In [26]:
#function to normalize and calculate bmi
#organizes bmi values and calculates it for samples with age > 18 w/o bmi
#some samples only have bmi categories
#corrects by making sure bmi is only between 12 & 42

def correct_bmi(bmi):
    '''
    keeps values within specified BMI bounds

    param: a bmi value
    return: the bmi rounded to 3 decimals when it lies in [12, 42], or nan if
    the value is not within bounds
    '''
    if 12 <= bmi <= 42:
        return round(bmi, 3)
    return float('Nan')

def normalize_bmi(df):
    '''
    creates a column of merged and cleaned BMI values

    The first non-missing value among 'body_mass_index', 'bmi_corrected' and
    'bmi' is used. When none is reported, the BMI is computed as
    kg / m**2 from 'qiita_host_weight' (kg) and 'qiita_host_height' (cm), but
    only for rows that have a raw 'age' and a 'qiita_host_age' of 18 or more.
    Reported and computed values both go through correct_bmi.

    param: dataframe
    returns: a series (indexed like df) with merged and cleaned BMI values
    '''
    reported_columns = ['body_mass_index', 'bmi_corrected', 'bmi']
    derived_columns = ['age', 'qiita_host_age',
                       'qiita_host_height', 'qiita_host_weight']

    bmi_list = []
    rows = zip(*(df[name].tolist()
                 for name in derived_columns + reported_columns))
    for raw_age, host_age, height_cm, weight_kg, *reported in rows:
        for value in reported:
            value = float(value)
            if not math.isnan(value):
                bmi_list.append(correct_bmi(value))
                break
        else:
            computable = (
                not math.isnan(float(raw_age))
                and host_age >= 18
                and not math.isnan(float(height_cm))
                and not math.isnan(float(weight_kg))
            )
            if computable:
                # height is stored in cm; BMI needs meters
                height_m = float(height_cm) / 100
                bmi = float(weight_kg) / (height_m ** 2)
                # computed values get the same bounds as reported ones
                bmi_list.append(correct_bmi(bmi))
            else:
                bmi_list.append(float('Nan'))
    return pd.Series(bmi_list, index=df.index)

In [27]:
# qiita_host_bmi column: store the Series returned by normalize_bmi on df
df['qiita_host_bmi'] = normalize_bmi(df)

# Normalize Healthy Weight

In [28]:
#function to determine if weight is healthy
#add to only check for bmi if age is >=18
def healthy_weight(df):
    '''
    creates boolean values for hosts of healthy weight

    param: dataframe with a 'qiita_host_bmi' column
    returns: a Series (indexed like df); True if the BMI lies in
    [18.5, 24.9], False if it lies outside, NaN when the BMI is missing
    '''
    healthy_weight_list = []
    # iterate the column directly; the row count comes from df itself
    for bmi in df['qiita_host_bmi'].tolist():
        bmi = float(bmi)
        if math.isnan(bmi):
            healthy_weight_list.append(float('Nan'))
        else:
            healthy_weight_list.append(18.5 <= bmi <= 24.9)
    return pd.Series(healthy_weight_list, index=df.index)

In [29]:
# qiita_host_healthy_weight column: store the Series returned by healthy_weight on df
df['qiita_host_healthy_weight'] = healthy_weight(df) 

# Normalize Allergy

In [30]:
# Allergy columns whose cells are compared against True/False
bool_allergy_list = [
    'peanutallergy',
    'shellfishallergy',
    'treenutallergy',
    'allergic_contact_dermatitis',
    'allergic_rhinitis_',
    'allergic_to_other',
    'allergic_to_peanuts',
    'allergic_to_shellfish',
    'allergic_to_tree_nuts',
    'allergic_to_unspecified',
    'beestingallergies',
    'drugallergies',
    'non_food_allergies_beestings',
    'non_food_allergies_drug_eg_penicillin',
    'non_food_allergies_pet_dander',
    'non_food_allergies_poison_ivyoak',
    'non_food_allergies_sun',
    'non_food_allergies_unspecified',
    'poisonivyoakallergies',
    'seasonal_allergies',
    'seasonalallergies',
    'sunallergies',
]
# Allergy columns holding free text
str_allergy_list = [
    'allergy',
    'otherallergies',
]

In [31]:
#function for normalize allergies
def normalize_allergy(df, bool_columns=None, str_columns=None):
    '''
    creates a column normalizing allergies

    A row is True when any of the following holds:
    - one of the boolean allergy columns equals True
    - 'allergic_to_i_have_no_food_allergies_that_i_know_of' equals False
    - one of the free-text allergy columns holds something other than 'no'
      or a missing value

    param:
    - dataframe
    - bool_columns: optional list of boolean allergy columns; defaults to the
      module-level bool_allergy_list
    - str_columns: optional list of free-text allergy columns; defaults to
      the module-level str_allergy_list
    returns: Series (indexed like df) with boolean values, True if any
    allergies, false otherwise
    '''
    if bool_columns is None:
        bool_columns = bool_allergy_list
    if str_columns is None:
        str_columns = str_allergy_list

    no_food_allergy_column = 'allergic_to_i_have_no_food_allergies_that_i_know_of'

    flagged = (df[list(bool_columns)] == True).any(axis=1).tolist()
    denies_being_allergy_free = (df[no_food_allergy_column] == False).tolist()
    # fill the text columns once, instead of once per row
    text_answers = [df[name].fillna('not provided').tolist()
                    for name in str_columns]

    allergy_list = []
    for position, has_allergy in enumerate(flagged):
        has_allergy = has_allergy or denies_being_allergy_free[position]
        if not has_allergy:
            has_allergy = any(
                answers[position] not in ('no', 'not provided')
                for answers in text_answers)
        allergy_list.append(bool(has_allergy))
    return pd.Series(allergy_list, index=df.index)

In [32]:
# qiita_host_allergy column: store the Series returned by normalize_allergy on df
df['qiita_host_allergy'] = normalize_allergy(df)

# Normalize Cancer

In [33]:
# Columns that each record one kind of cancer
cancer_list = [
    'bladder_cancer',
    'brain_cancer',
    'breast_cancer',
    'cancer',
    'cervical_cancer',
    'colon_cancer',
    'melanoma',
    'non_hodgkin_lymphoma',
    'endometrial_cancer',
    'kidney_cancer',
    'lung_cancer',
    'leukemia',
    'non_melanoma_skin_cancer',
    'ovarian_cancer',
    'pancreatic_cancer',
    'prostate_cancer',
    'rectal_cancer',
    'stomach_cancer',
    'thyroid_cancer',
]

In [34]:
#function to normalize cancer
def normalize_cancer(df, cancer_columns=None):
    '''
    creates a column normalizing cancer

    param:
    - dataframe
    - cancer_columns: optional list of cancer columns to inspect; defaults to
      the module-level cancer_list
    returns: Series (indexed like df) with boolean values, True if any of the
    cancer columns equals True, false otherwise
    '''
    if cancer_columns is None:
        cancer_columns = cancer_list
    # NaN and False compare unequal to True, so only explicit True counts
    return (df[list(cancer_columns)] == True).any(axis=1)

In [35]:
# qiita_host_cancer column: store the Series returned by normalize_cancer on df
df['qiita_host_cancer'] = normalize_cancer(df)

# Normalize IBD

In [36]:
#do not include 'subset_ibd'
# For each IBD-related column, the cell values that count as a positive answer
ibd_dict = {
    'ibd': [
        'Colitis',
        'Yes.IBS',
        'Crohns',
        'Diagnosed by a medical professional (doctor, physician assistant)',
    ],
    'ibd_diagnosis': [
        'Ulcerative colitis',
        "Crohn's disease",
    ],
    'ibd_diagnosis_refined': [
        'Ulcerative colitis',
        "Colonic Crohn's Disease",
        'Microcolitis',
    ],
    'ulcerative_colitis': [True],
    'crohns_disease': [True],
}

In [37]:
#function to normalize ibd
def normalize_ibd(df, positive_values=None):
    '''
    creates a column normalizing inflammatory bowel disease (ibd)

    A row is True when, for at least one column of the mapping, the cell
    holds one of the values listed for that column. Missing cells never
    match.

    param:
    - dataframe
    - positive_values: optional dict mapping column name -> list of cell
      values that indicate ibd; defaults to the module-level ibd_dict
    returns: Series (indexed like df) with boolean values, True if ibd, false
    otherwise
    '''
    if positive_values is None:
        positive_values = ibd_dict

    # fill each column once, instead of once per row and column
    answers = {column: df[column].fillna('not provided').tolist()
               for column in positive_values}

    ibd_list = [
        any(answers[column][position] in accepted
            for column, accepted in positive_values.items())
        for position in range(len(df))
    ]
    return pd.Series(ibd_list, index=df.index)

In [38]:
# qiita_host_ibd column: store the Series returned by normalize_ibd on df,
# then show how many rows fall under each value
df['qiita_host_ibd'] = normalize_ibd(df)
df['qiita_host_ibd'].value_counts()

False    22579
True       193
Name: qiita_host_ibd, dtype: int64

In [39]:
# Within study 10317 only: how many rows are flagged as ibd, and how many are not
examined = df.iloc[:num_row]
in_study_10317 = examined['qiita_study_id'] == 10317
flagged_ibd = examined['qiita_host_ibd'] == True
ibd_true_count = int((in_study_10317 & flagged_ibd).sum())
ibd_false_count = int((in_study_10317 & ~flagged_ibd).sum())
print(ibd_true_count)
print(ibd_false_count)

10
1151


In [40]:
#qiita_host_ibd_type
def ibd_subtype(df):
    '''
    creates a column specifying ibd subtypes

    The rules below are tried in order and the first match wins; a row that
    matches none of them but has qiita_host_ibd == True is 'not specified',
    and every remaining row is 'not applicable'.

    param: dataframe
    return: a Series (indexed like df) specifying the type of ibd
    ('cd', 'uc', 'not specified', or 'not applicable')
    '''
    # (column, value that must be found there, resulting label)
    rules = [
        ('crohns_disease', True, 'cd'),
        ('ulcerative_colitis', True, 'uc'),
        ('ibd', 'Colitis', 'uc'),
        ('ibd', 'Crohns', 'cd'),
        ('ibd_diagnosis', 'Ulcerative colitis', 'uc'),
        ('ibd_diagnosis', "Crohn's disease", 'cd'),
        ('ibd_diagnosis_refined', 'Ulcerative colitis', 'uc'),
        ('ibd_diagnosis_refined', "Colonic Crohn's Disease", 'cd'),
        ('qiita_host_ibd', True, 'not specified'),
    ]
    # pull each needed column out once; the row count comes from df itself
    cells = {column: df[column].tolist() for column, _, _ in rules}

    subtypes = []
    for position in range(len(df)):
        label = 'not applicable'
        for column, expected, subtype in rules:
            if cells[column][position] == expected:
                label = subtype
                break
        subtypes.append(label)
    return pd.Series(subtypes, index=df.index)

In [41]:
# qiita_host_ibd_type column: store the Series returned by ibd_subtype on df,
# then show how many rows fall under each label
df['qiita_host_ibd_type'] = ibd_subtype(df)
df['qiita_host_ibd_type'].value_counts()

not applicable    22579
uc                   97
not specified        58
cd                   38
Name: qiita_host_ibd_type, dtype: int64

In [42]:
# Tally each ibd subtype label among the rows belonging to study 10317
examined = df.iloc[:num_row]
study_labels = examined.loc[examined['qiita_study_id'] == 10317, 'qiita_host_ibd_type']
na_count = int((study_labels == 'not applicable').sum())
ns_count = int((study_labels == 'not specified').sum())
uc_count = int((study_labels == 'uc').sum())
cd_count = int((study_labels == 'cd').sum())
print('not applicable: ', na_count)
print('not specified: ', ns_count)
print('uc: ', uc_count)
print('cd: ', cd_count)

not applicable:  1151
not specified:  10
uc:  0
cd:  0


# Normalize Diabetes

In [43]:
#do not include 'subset_diabetes'
# For each diabetes-related column, the cell values that count as a positive answer
diabetes_dict = {
    'diabetes': [
        'yes.type.I',
        'Diagnosed by a medical professional (doctor, physician assistant)',
        'true',
    ],
    'diabetes_mellitustype_1': [True],
    'diabetes_mellitustype_2': [True],
    'diabetes_type': ['Type II diabetes'],
}

In [44]:
#function to normalize diabetes
def normalize_diabetes(df, positive_values=None):
    '''
    creates a column with boolean values for diabetes

    A row is True when, for at least one column of the mapping, the cell
    holds one of the values listed for that column. Missing cells never
    match.

    param:
    - dataframe
    - positive_values: optional dict mapping column name -> list of cell
      values that indicate diabetes; defaults to the module-level
      diabetes_dict
    return: Series (indexed like df) with boolean values, true if host has
    diabetes, false otherwise
    '''
    if positive_values is None:
        positive_values = diabetes_dict

    # fill each column once, instead of once per row and column
    answers = {column: df[column].fillna('not provided').tolist()
               for column in positive_values}

    diabetes_list = [
        any(answers[column][position] in accepted
            for column, accepted in positive_values.items())
        for position in range(len(df))
    ]
    return pd.Series(diabetes_list, index=df.index)

In [45]:
# qiita_host_diabetes column: store the Series returned by normalize_diabetes on df,
# then show how many rows fall under each value
df['qiita_host_diabetes'] = normalize_diabetes(df)
df['qiita_host_diabetes'].value_counts()

False    22675
True        97
Name: qiita_host_diabetes, dtype: int64

In [46]:
# Within study 10317 only: how many rows are flagged True / False for diabetes
examined = df.iloc[:num_row]
study_flags = examined.loc[examined['qiita_study_id'] == 10317, 'qiita_host_diabetes']
true_count = int((study_flags == True).sum())
false_count = int((study_flags == False).sum())
print('true:', true_count)
print('false: ', false_count)

true: 19
false:  1142


In [47]:
#qiita_host_diabetes_subtype
def diabetes_subtype(df):
    '''
    creates a column specifying types of diabetes

    The rules below are tried in order and the first match wins; a row that
    matches none of them but has qiita_host_diabetes == True is 'no type',
    and every remaining row is 'not applicable'.

    param: dataframe
    returns: a Series (indexed like df) with specific diabetes types
    ('type1', 'type2', 'no type', 'not applicable')
    '''
    # (column, value that must be found there, resulting label)
    rules = [
        ('diabetes_mellitustype_1', True, 'type1'),
        ('diabetes_mellitustype_2', True, 'type2'),
        ('diabetes', 'yes.type.I', 'type1'),
        ('diabetes_type', 'Type II diabetes', 'type2'),
        ('qiita_host_diabetes', True, 'no type'),
    ]
    # pull each needed column out once; the row count comes from df itself
    cells = {column: df[column].tolist() for column, _, _ in rules}

    subtypes = []
    for position in range(len(df)):
        label = 'not applicable'
        for column, expected, subtype in rules:
            if cells[column][position] == expected:
                label = subtype
                break
        subtypes.append(label)
    return pd.Series(subtypes, index=df.index)

In [48]:
# qiita_host_diabetes_subtype column: store the labels returned by
# diabetes_subtype(df), then tally how many rows carry each label.
df['qiita_host_diabetes_subtype'] = diabetes_subtype(df)
df['qiita_host_diabetes_subtype'].value_counts()

not applicable    22675
type1                72
type2                21
no type               4
Name: qiita_host_diabetes_subtype, dtype: int64

In [49]:
# Tally the diabetes subtype labels for rows belonging to Qiita study 10317.
# One filtered column plus vectorised comparisons replaces the per-row
# df.iloc[row][...] lookups, each of which rebuilt a whole row Series.
study_10317_subtypes = df.loc[df['qiita_study_id'] == 10317,
                              'qiita_host_diabetes_subtype']
# int(...) keeps the counters plain Python ints, as they were before.
na_count = int((study_10317_subtypes == 'not applicable').sum())
ns_count = int((study_10317_subtypes == 'no type').sum())
count1 = int((study_10317_subtypes == 'type1').sum())
count2 = int((study_10317_subtypes == 'type2').sum())


In [50]:
# Report the study-10317 subtype tallies, one "label count" line per subtype.
for label, tally in (
    ('not applicable: ', na_count),
    ('not specified: ', ns_count),
    ('type1: ', count1),
    ('type2: ', count2),
):
    print(label, tally)

not applicable:  1142
not specified:  4
type1:  0
type2:  15


# Normalize Miscellaneous Diseases

In [51]:
# Cell values read as an affirmative answer; used as the accepted-values list
# for many columns in disease_dict and med_dict.
true_values = [
    True,
    'true', 'yes', 'y', 'Yes', 'Y', 'YES',
    'Diagnosed by a medical professional (doctor, physician assistant)',
]
# Cell values read as a negative answer.
false_values = [
    False,
    'no', 'none', 'NO', 'n', 'N', 'NONE', 'No',
    'false', 'False', 'f', 'F',
]

In [52]:
# Maps a metadata column name to the list of cell values that count as
# "host has this disease" for that column (consumed by normalize_disease).
#
# Every key is written exactly once: a repeated key in a dict literal silently
# keeps only its last value. 'chronicliverdisease_cirrhosis',
# 'chronicobstructivepulm_disease' and 'polycystic_kidney_disease' therefore
# carry true_values, which already contains True.
disease_dict = {
    'canker_sores_': true_values,
    'cardiovascular_disease': [True],
    'celiac_disease': [True],
    'chronicliverdisease_cirrhosis': true_values,
    'chronicobstructivepulm_disease': true_values,
    # free-text column: only these exact labels count as a disease
    'disease': ['sinusproblems_candida', 'asthma_allergictopepper', 'sinusproblems', 'asthma', 'asthma_dermatitis',
                'thyroidhealthproblems', 'allergictopenicillin_fibromyalgia'],
    'fibrocystic_breast_disease': [True],
    'footeczema': true_values,
    'gastroesophagealreflux_disease': [True],
    'graves_disease': [True],
    'hirschsprungs_disease': [True],
    'huntingtons_disease': [True],
    'kawasaki_disease': [True],
    'kidney_disease': [True],
    'liver_disease': [True],
    'lung_disease': [True],
    'menieres_disease': [True],
    'nonalcoholicfattyliver_disease': [True],
    'osgood_schlatter_disease': [True],
    'parkinsons_disease': [True],
    'peyronies_disease': [True],
    'polycystic_kidney_disease': true_values,
    'von_willebrand_disease': [True],
    'acne': true_values,
    'acute_kidney_failure': true_values,
    'acute_liver_failure': true_values,
    'age_related_cataract': true_values,
    'age_related_hearing_loss': true_values,
    'age_related_macular_degen': true_values,
    'alopecia_areata': true_values,
    'alzheimers': true_values,
    'amyotrophic_lateral_sclerosis_': true_values,
    'angina': true_values,
    'aortic_aneurysm': true_values,
    'appendicitis': true_values,
    'asd': true_values,
    'asthma': true_values,
    'atherosclerosis': true_values,
    'atrial_fibrillation': true_values,
    'autoimmune_hemolytic_anemia': true_values,
    'barretts_esophagus': true_values,
    'bartholins_cyst': true_values,
    'bells_palsy': true_values,
    'cdiff': true_values,
    'cdiff_positive_yn': true_values,
    'chickenpox': true_values,
    'chronic_bronchitis': true_values,
    'chronic_kidney_failure': true_values,
    'chronic_recurrent_tonsillitis': true_values,
    'chronic_sinusitis': true_values,
    'colon_polyps': true_values,
    'congestive_heart_failure': true_values,
    'cystic_fibrosis': true_values,
    'dandruff': true_values,
    'deep_vein_thrombosis': true_values,
    'depression_bipolar_schizophrenia': true_values,
    'dermatographia': true_values,
    'dilated_cardiomyopathy': true_values,
    'diverticulosis': true_values,
    'eczema': true_values,
    'emphysema': true_values,
    'endometriosis': true_values,
    'epilepsy': true_values,
    'epilepsy_or_seizure_disorder': true_values,
    'fibromyalgia': true_values,
    'fungal_overgrowth': true_values,
    'gallstones': true_values,
    'gbs': true_values,
    'gilbert_syndrome': true_values,
    'hemochromatosis': true_values,
    'hiv': true_values,
    'irritable_bowel_syndrome_': true_values,
    'mental_illness': true_values,
    'mental_illness_type_anorexia_nervosa': true_values,
    'mental_illness_type_bipolar_disorder': true_values,
    'mental_illness_type_bulimia_nervosa': true_values,
    'mental_illness_type_depression': true_values,
    'mental_illness_type_ptsd_posttraumatic_stress_disorder': true_values,
    'mental_illness_type_schizophrenia': true_values,
    'mental_illness_type_substance_abuse': true_values,
    'mental_illness_type_unspecified': true_values,
    'long_qt_syndrome': true_values,
    'malaria': true_values,
    'ovarian_cysts': true_values,
    'patientdied': true_values,
    'polycystic_ovary_syndrome_': true_values,
    'sickle_cell_anemia': true_values,
    'sjogrens_syndrome_': true_values,
    'urinary_tract_infection_': true_values,
    'wolff_parkinson_white_syndrome': true_values,
}

In [53]:
#function to normalize disease
def normalize_disease(df):
    '''
    creates a boolean column for general diseases

    A row is True when, for at least one column listed in disease_dict, the
    row's value is one of the values disease_dict accepts for that column.

    param: dataframe containing every column named in disease_dict
    return: a boolean pd.Series with one entry per row of df, on a default
            0..n-1 index; True if any of the disease_dict columns matches,
            False otherwise
    raises: KeyError if a column named in disease_dict is missing from df
    '''
    # Sized from df itself rather than from a notebook-level row counter.
    disease_list = [False] * len(df)
    for column, positive_values in disease_dict.items():
        # Fill missing cells once per column. Calling fillna inside a
        # row-by-row loop re-copied the whole column for every single row,
        # which made the work quadratic in the number of rows.
        filled = df[column].fillna('not provided')
        for position, value in enumerate(filled):
            if value in positive_values:
                disease_list[position] = True
    return pd.Series(disease_list)

In [54]:
# qiita_host_disease column: the boolean flags returned by normalize_disease(df)
df['qiita_host_disease'] = normalize_disease(df)

# Normalize Medication Use

In [55]:
# Maps a metadata column name to the list of cell values that count as
# "host used medication" for that column (consumed by normalize_medication).
#
# Every key is written exactly once: a repeated key in a dict literal silently
# keeps only its last value, so 'medication_drug' combines the generic
# affirmative answers with the specific drug names in a single entry.
med_dict = {'antibiotic_disturbance': true_values,
       'antibiotic_history': true_values, 'antibiotics': true_values, 'antibiotics_after_birth': true_values,
       'antibiotics_at_birth': true_values, 'idantibioticdisturbance': true_values,
       'sampleantibioticdisturbance': true_values,
       # NOTE(review): the only column matched against false_values - presumably
       # a negative answer here means the host does have antibiotic history; confirm.
       'subset_antibiotic_history': false_values,
       'acne_medication': true_values,
       'acne_medication_otc': true_values,
       'overthecounterfacialacne': true_values,
       'prescriptionforfacialacne': true_values,
       'other_medications': true_values,
       'othermedication': ['yes.claritin.buproprion.citalopram','yes.ibuprofen.maxalt','yes.ibuprofen.hydrocodone',
                           'yes.amitriptyline','yes.allergies','yes.hypercare.for.hyperhycrosis.excessive.sweating',
                           'yes.lorazepan.Ativan.PRN','yes.zolmitriptan','yes.muscle.relaxants.Aleve',
                           'yes.advil.tylenol.and.claritin.as.needed',
                           'yes.sertraline.HCL.Zoloft.balsalazide.disodium.alprazolam.infliximab.ergocalciferol',
                           'yes.take.zyrted.for.seasonal.allergies','yes.chronic.pain.control.Lyrica.and.Ultram.allegra',
                           'yes.codeine.cough','yes.androgel.gabapentin.metrotropolol.calcium.blocker',
                           'yes.acyclovir.usually.every.few.months.anti.viral','yes.celexa.sonata','yes.NSAIDs',
                           'yes.zyrtec','yes.aderoll.ADD.medication.','yes.metformin.for.polycystic.ovarian.syndrom',
                           'yes.fluoxetine.adderall.crestor','yes.insulin.novolog','yes.prozac.clonazepam.lamotrigine',
                           'yes.ambien','yes.bupropion.hydrochloride','yes.paxil.humatrope','yes.levothyroxin.fluoxetine',
                           'yes.baclaphin.naproxin.citalopram','yes.ferrous.sulfate.iron.pills.melatonin','yes.ibuprofen',
                           'yes.cold','yes.vyvanase.dexadrine','yes.celexa'],
       'drug_usage': ['Ibuprofen','TRUE','birth control','Prevacid','Claritin','Tums, Advil','Prilosec','Ortho Tri-cyclen Lo',
                      'antacids','allergy medication','Advil, Lyrica','thyroid hormone','OrthoNovum','Lamictal, lithium carb',
                      'Prilosec, Simuastatin, Diazide, Atenolol, Hydroxyzine, Claritan','Zantec, Tums, Zertec',
                      'baby aspirin, Omerprazole, Isosorbride, Lisinopril, Lovastatin, Plavix, Altenolol, Klosnopin, Neurontin, Requip,Finasteride, Acyclovir, Xanax, Nitroglycercin',
                      'Ambien','DayQuil','baby aspirin, Zantac','Klonopin, Prozac, Immipramine, Warfarin, Cozaar, Tenex, Calan','Cephalexin',
                      'baby aspirin, beta-blockers, statins, ace inhibitors','Nuvaring','Fluocinonide cream (eczema)','colace','albuterol inhaler',
                      'Lisinopril, Simvastatin, Hydrochilorol','Zicam, OrthoTriCyclinLo','Evista, Fosamax, Prilosec','Actonel','Aleve, Claritin',
                      'Nasonex','Levothyroxine','Septra, Zofran, chemotherapy drugs','Equate, Azor, Hydrocholothiazide, aspirin, Nexium',
                      'Advil, Benadryl','Gaviscon' ,'Benadryl','Tylenol, Oragel','baby aspirin, Lipitor, Diovan HCT','Advil','Propecia','Omeprazole'],
       # true_values + [...] builds a new list, so true_values itself is untouched.
       'medication_drug': true_values + [
                           'ophthalmicerythromycin','Prenatals' ,'Prenatals_Iron' ,'Toradol','Percoset','Percocet' ,'Motrin' ,
                           'Zantac_Intestinex_Vitamins','VitaminA_aloe','Tramadol' ,'Ancef' ,'VitaminC_Prenatals' ,
                           'Tylenolwithcodeine_Prenatals','Demerol_Percoset' ,'Percoset_Prenatals','Percoset_Hematron_Pepsid',
                           'Ampicillin_Pitocin_Demerol','Cataflam' ,'Pitocin','Cytotec_morphine','Iron' ,
                           'Ancef_Bicitra_Synthroid','Cytotec','Ampicillin_Pitocin' ,'Motrin_Prenatals','Prenatalvitamins',
                           'Ancef_Pitocin','Synthroid' ,'Keflex','Acetaminophen' ,'Sulphur_Heel','Clindamycin_Tamiflu',
                           'Panadol_Prenatals','Mefoxin_Pitocin_Claritin','Zantac' ,'Flexeril','Pedialyte_Desitin',
                           'VitaminA_DandC_calaminelotion','Keflex_Ancef_Prenatals','NA_Prenatals','VitaminAandD',
                           'VitaminA_DandC']}

In [56]:
#function to normalize medication use
def normalize_medication(df):
    '''
    creates a boolean column for medication use

    A row is True when, for at least one column listed in med_dict, the row's
    value is one of the values med_dict accepts for that column.

    param: a dataframe containing every column named in med_dict
    return: a boolean pd.Series with one entry per row of df, on a default
            0..n-1 index; True if host used medication, False otherwise
    raises: KeyError if a column named in med_dict is missing from df
    '''
    # Sized from df itself rather than from a notebook-level row counter.
    med_list = [False] * len(df)
    for column, positive_values in med_dict.items():
        # Fill missing cells once per column. Calling fillna inside a
        # row-by-row loop re-copied the whole column for every single row,
        # which made the work quadratic in the number of rows.
        filled = df[column].fillna('not provided')
        for position, value in enumerate(filled):
            if value in positive_values:
                med_list[position] = True
    return pd.Series(med_list)

In [None]:
# qiita_host_medication column: the boolean flags returned by normalize_medication(df)
df['qiita_host_medication'] = normalize_medication(df)

# Normalize Host Health

In [None]:
# Derived boolean columns that normalize_healthy inspects for each host.
# NOTE(review): 'qiita_host_healthy_weight' is True for hosts in the healthy
# weight category, whereas True in the other columns marks a condition or
# medication use - confirm normalize_healthy accounts for that difference.
disease_columns = [
    'qiita_host_disease',
    'qiita_host_diabetes',
    'qiita_host_ibd',
    'qiita_host_cancer',
    'qiita_host_allergy',
    'qiita_host_healthy_weight',
    'qiita_host_medication',
]

In [None]:
#function to normalize healthy
def normalize_healthy(df):
    '''
    creates a boolean column for a general "healthy" host

    A host starts out healthy and is marked unhealthy as soon as any column in
    disease_columns reports a problem. For every column a problem is the value
    True, except 'qiita_host_healthy_weight', where True is the healthy state
    and the problem value is False. Missing values never count against a host.

    NOTE(review): this assumes False in 'qiita_host_healthy_weight' means a
    known weight outside the healthy range and that unknown weight is NaN -
    confirm against the code that builds that column.

    param: a dataframe containing every column named in disease_columns
    return: a boolean pd.Series with one entry per row of df, on a default
            0..n-1 index; True if host is healthy and False otherwise
    raises: KeyError if a column named in disease_columns is missing from df
    '''
    healthy_list = [True] * len(df)
    for column in disease_columns:
        # Treating True in the healthy-weight column as a disease would mark
        # every healthy-weight host as unhealthy, so its problem value is False.
        problem_value = column != 'qiita_host_healthy_weight'
        # Walk the column directly; df.iloc[row][column] rebuilt a full row
        # Series for every single cell.
        for position, flag in enumerate(df[column]):
            # `==` rather than truthiness so NaN matches neither True nor False
            if flag == problem_value:
                healthy_list[position] = False
    return pd.Series(healthy_list)

In [None]:
#qiita_host_healthy
df['qiita_host_healthy'] = normalize_healthy(df)

# Static columns

In [None]:
# qiita_host_sample_type column: the constant 'skin' for every row
sample_type = ['skin']
# list repetition yields num_row copies of the single label
sample_type_list = sample_type * num_row

df['qiita_host_sample_type'] = pd.Series(sample_type_list)

In [None]:
# qiita_empo_1 column: the constant 'host_associated' for every row
empo_1 = ['host_associated']
# list repetition yields num_row copies of the single label
empo_1_list = empo_1 * num_row

df['qiita_empo_1'] = pd.Series(empo_1_list)

In [None]:
# qiita_empo_2 column: the constant 'animal' for every row
empo_2 = ['animal']
# list repetition yields num_row copies of the single label
empo_2_list = empo_2 * num_row

df['qiita_empo_2'] = pd.Series(empo_2_list)

In [None]:
# qiita_empo_3 column: the constant 'animal surface' for every row
empo_3 = ['animal surface']
# list repetition yields num_row copies of the single label
empo_3_list = empo_3 * num_row

df['qiita_empo_3'] = pd.Series(empo_3_list)

In [None]:
# qiita_host_scientific_name column: the constant 'Homo sapiens' for every row
scientific_name = ['Homo sapiens']
# list repetition yields num_row copies of the single label
scientific_name_list = scientific_name * num_row

df['qiita_host_scientific_name'] = pd.Series(scientific_name_list)

In [None]:
# qiita_host_taxid column: the constant 9606 for every row
taxid = [9606]
# list repetition yields num_row copies of the single id
taxid_list = taxid * num_row

df['qiita_host_taxid'] = pd.Series(taxid_list)

In [None]:
# qiita_host_common_name column: the constant 'human' for every row
common_name = ['human']
# list repetition yields num_row copies of the single label
common_name_list = common_name * num_row

df['qiita_host_common_name'] = pd.Series(common_name_list)

In [None]:
# qiita_env_feature column: the constant 'human-associated habitat' for every row
env_feature = ['human-associated habitat']
# list repetition yields num_row copies of the single label
env_feature_list = env_feature * num_row

df['qiita_env_feature'] = pd.Series(env_feature_list)

# Write to CSV

In [None]:
# Displays the table re-indexed by '#SampleID'. set_index returns a new frame
# and the result is not assigned, so df itself keeps its current index.
df.set_index('#SampleID')

In [None]:
# Write the normalized table twice: tab-separated, then comma-separated.
# Missing cells are written as 'not provided' and the row index is left out.
for out_path, delimiter in (('skin_02_07_19.tsv', '\t'), ('skin_02_07_19.csv', ',')):
    df.to_csv(out_path, sep=delimiter, na_rep='not provided', index=False)