In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Phase 1 : Data Preparation

Reading the dataset and setting SUBJECT_ID as the index column

In [2]:
df = pd.read_csv('food-allergy-analysis-Zenodo.csv', index_col='SUBJECT_ID')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333200 entries, 1 to 333200
Data columns (total 49 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   BIRTH_YEAR               333200 non-null  int64  
 1   GENDER_FACTOR            333200 non-null  object 
 2   RACE_FACTOR              333200 non-null  object 
 3   ETHNICITY_FACTOR         333200 non-null  object 
 4   PAYER_FACTOR             333200 non-null  object 
 5   ATOPIC_MARCH_COHORT      333200 non-null  bool   
 6   AGE_START_YEARS          333200 non-null  float64
 7   AGE_END_YEARS            333200 non-null  float64
 8   SHELLFISH_ALG_START      5246 non-null    float64
 9   SHELLFISH_ALG_END        1051 non-null    float64
 10  FISH_ALG_START           1796 non-null    float64
 11  FISH_ALG_END             527 non-null     float64
 12  MILK_ALG_START           7289 non-null    float64
 13  MILK_ALG_END             4580 non-null    float64
 14  SOY_

In [3]:
df.head()

Unnamed: 0_level_0,BIRTH_YEAR,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,AGE_START_YEARS,AGE_END_YEARS,SHELLFISH_ALG_START,SHELLFISH_ALG_END,...,CASHEW_ALG_END,ATOPIC_DERM_START,ATOPIC_DERM_END,ALLERGIC_RHINITIS_START,ALLERGIC_RHINITIS_END,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX,NUM_ASTHMARX
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2006,S1 - Female,R1 - Black,E0 - Non-Hispanic,P1 - Medicaid,False,0.093087,3.164956,,,...,,,,,,,,,,
2,1994,S1 - Female,R0 - White,E0 - Non-Hispanic,P0 - Non-Medicaid,False,12.232717,18.880219,,,...,,,,,,,,12.262834,18.880219,2.0
3,2006,S0 - Male,R0 - White,E1 - Hispanic,P0 - Non-Medicaid,True,0.010951,6.726899,,,...,,4.884326,,3.917864,6.157426,5.127995,,1.404517,6.157426,4.0
4,2004,S0 - Male,R4 - Unknown,E1 - Hispanic,P0 - Non-Medicaid,False,2.398357,9.111567,,,...,,,,,,,,,,
5,2006,S1 - Female,R1 - Black,E0 - Non-Hispanic,P0 - Non-Medicaid,False,0.013689,6.193018,,,...,,,,,,,,,,


By looking at the dataset's column names, it is clear that any food allergy is called "foodname_ALG_START". In order to select the columns used as food allergy, I'll use regular expressions:

In [4]:
#Selecting food allergy columns
import re

alg_col = []

for col in df.columns:
    if re.search("_ALG_START", col):
        alg_col.append(col)
df[alg_col].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333200 entries, 1 to 333200
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   SHELLFISH_ALG_START  5246 non-null   float64
 1   FISH_ALG_START       1796 non-null   float64
 2   MILK_ALG_START       7289 non-null   float64
 3   SOY_ALG_START        2419 non-null   float64
 4   EGG_ALG_START        6065 non-null   float64
 5   WHEAT_ALG_START      1146 non-null   float64
 6   PEANUT_ALG_START     8653 non-null   float64
 7   SESAME_ALG_START     766 non-null    float64
 8   TREENUT_ALG_START    1 non-null      float64
 9   WALNUT_ALG_START     704 non-null    float64
 10  PECAN_ALG_START      285 non-null    float64
 11  PISTACH_ALG_START    369 non-null    float64
 12  ALMOND_ALG_START     386 non-null    float64
 13  BRAZIL_ALG_START     68 non-null     float64
 14  HAZELNUT_ALG_START   253 non-null    float64
 15  CASHEW_ALG_START     561 non-null 

Now, let's have a look at those columns to see their values

In [5]:
df[alg_col].head()

Unnamed: 0_level_0,SHELLFISH_ALG_START,FISH_ALG_START,MILK_ALG_START,SOY_ALG_START,EGG_ALG_START,WHEAT_ALG_START,PEANUT_ALG_START,SESAME_ALG_START,TREENUT_ALG_START,WALNUT_ALG_START,PECAN_ALG_START,PISTACH_ALG_START,ALMOND_ALG_START,BRAZIL_ALG_START,HAZELNUT_ALG_START,CASHEW_ALG_START
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,
3,,,1.002053,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,


It's clear that NAN means no allergy, whether the presence of an allergy can be detected by finding an actual value stricktly superior than 0. Let's create a column called food allergy, containing 1 where there is an allergy, and 0 where there isn't.

In [6]:
#Finding NAN values and replacing them with 0
df[alg_col] = df[alg_col].fillna(0)
df[alg_col].head()

Unnamed: 0_level_0,SHELLFISH_ALG_START,FISH_ALG_START,MILK_ALG_START,SOY_ALG_START,EGG_ALG_START,WHEAT_ALG_START,PEANUT_ALG_START,SESAME_ALG_START,TREENUT_ALG_START,WALNUT_ALG_START,PECAN_ALG_START,PISTACH_ALG_START,ALMOND_ALG_START,BRAZIL_ALG_START,HAZELNUT_ALG_START,CASHEW_ALG_START
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.002053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now let's sum all food allergy columns into one that we call ALG, which indicates the presence or absence of an allergy.

In [7]:
#Summing all food allergy columns into one, called ALG for food allergy
df['ALG'] = df[alg_col].sum(axis=1)
df[['ALG']].head()

Unnamed: 0_level_0,ALG
SUBJECT_ID,Unnamed: 1_level_1
1,0.0
2,0.0
3,1.002053
4,0.0
5,0.0


In [8]:
#Replacing values that are superior to 0 by 1 (meaning allergy exists)
df[['ALG']].loc[df['ALG'] > 0.0] = 1.0
df['ALG'].head()

SUBJECT_ID
1    0.000000
2    0.000000
3    1.002053
4    0.000000
5    0.000000
Name: ALG, dtype: float64

Now let's convert categorical variables to numeric

In [9]:
cat_cols = ['GENDER_FACTOR', 'RACE_FACTOR', 'ETHNICITY_FACTOR', 'PAYER_FACTOR', 'ATOPIC_MARCH_COHORT']

for col in cat_cols:
    vals = list(df[col].unique())
    df[col].replace(vals, list(range(len(vals))), inplace=True)

Keeping only significant columns ...

In [10]:
df = df[['BIRTH_YEAR', 'GENDER_FACTOR', 'RACE_FACTOR', 'ETHNICITY_FACTOR', 'PAYER_FACTOR', 'ATOPIC_MARCH_COHORT', 'AGE_START_YEARS', 'ALG']]
df

Unnamed: 0_level_0,BIRTH_YEAR,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,AGE_START_YEARS,ALG
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2006,0,0,0,0,0,0.093087,0.000000
2,1994,0,1,0,1,0,12.232717,0.000000
3,2006,1,1,1,1,1,0.010951,1.002053
4,2004,1,2,1,1,0,2.398357,0.000000
5,2006,0,0,0,1,0,0.013689,0.000000
...,...,...,...,...,...,...,...,...
333196,2006,1,1,0,1,0,0.736482,0.000000
333197,2006,0,0,0,0,1,0.019165,0.000000
333198,2006,1,1,0,1,0,0.443532,0.000000
333199,2006,1,4,0,1,1,0.013689,0.000000


# Phase 2 : Logistic Regression

Preparing the dataset

In [11]:
#Selecting X and Y
X = df.iloc[:, 1:7]
Y = df.iloc[:, -1].astype(int)

#Splitting the dataset into train/test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [12]:
X.head()

Unnamed: 0_level_0,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,AGE_START_YEARS
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,0,0,0.093087
2,0,1,0,1,0,12.232717
3,1,1,1,1,1,0.010951
4,1,2,1,1,0,2.398357
5,0,0,0,1,0,0.013689


In [13]:
Y.head()

SUBJECT_ID
1    0
2    0
3    1
4    0
5    0
Name: ALG, dtype: int32

Creating and training the model

In [14]:
#Creating our model
model = LogisticRegression()

#Training our model
model.fit(X_train, Y_train)

LogisticRegression()

In [15]:
score = model.score(X_test, Y_test)
print('Model accuracy = '+str(round(score * 10000)/100)+' %')

Model accuracy = 93.96 %
