In [265]:
import numpy as np
import pandas as pd
import os

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
from functools import reduce

from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

In [236]:
panic = pd.read_csv("../data_processed/panic.csv")
panic.columns=panic.columns.str.lower()
panic = panic.drop_duplicates(subset=["id", "date"], keep='first')
panic['date'] = pd.to_datetime(panic['date'], format = '%Y-%m-%d')

In [237]:
folder_path = "../data_processed/daily_lifelog"
df={}
files = []
for i in os.listdir(folder_path):
    file=str(i).rstrip(".csv")
    print(file)
    file=pd.read_csv(folder_path+"/"+i)
    file = file.drop_duplicates(subset=["id", "date"], keep='first')
    file['date'] = file['date'].astype(str).str[:10]
    file['date'] = pd.to_datetime(file['date'], format = '%Y-%m-%d')
    print(file.shape)
    file.columns = file.columns.str.lower().str.strip()
    files.append(file)
files.append(panic)

caffeine
(2153, 3)
diary
(364, 3)
drinking
(486, 3)
emotion_diary
(6256, 8)
illuminance
(7721, 3)
sleep
(5887, 11)
smoking_eating_mensuration
(1343, 10)
workout
(1232, 3)


In [238]:
df = reduce(lambda left, right: pd.merge(left, right, on=['id',"date"],how="left"), files)
df.shape

(3007, 31)

In [239]:
print(df["mood"].unique())
df["mood"]=np.where((df["mood"].isna()),"Normal",df["mood"])
print(df["mood"].unique())

[nan 'Irritated' 'Grief' 'Fine' 'Depressed' 'Excitement' 'Anger' 'Joy'
 'Happiness' '후회']
['Normal' 'Irritated' 'Grief' 'Fine' 'Depressed' 'Excitement' 'Anger'
 'Joy' 'Happiness' '후회']


In [218]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3007 entries, 0 to 3006
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          3007 non-null   object        
 1   date                        3007 non-null   datetime64[ns]
 2   total_caffeine              3007 non-null   float64       
 3   mood                        3007 non-null   object        
 4   total_alcohol_consumption   247 non-null    float64       
 5   positive mood               2204 non-null   float64       
 6   negative mood               2204 non-null   float64       
 7   positive energy             2204 non-null   float64       
 8   negative energy             2204 non-null   float64       
 9   anxiety                     2204 non-null   float64       
 10  irritability                2204 non-null   float64       
 11  measures                    2191 non-null   float64     

In [242]:
df[df.select_dtypes(include="float64").columns] = df.select_dtypes(include="float64").fillna(0)

In [204]:
df["total_alcohol_consumption"]=df["total_alcohol_consumption"].fillna(0)
df["total_caffeine"]=df["total_caffeine"].fillna(0)
df["amount smoked"]=df["amount smoked"].fillna(0)
df["workout duration (minutes)"]=df["workout duration (minutes)"].fillna(0)
df[["positive mood","negative mood","positive energy","negative energy","anxiety","irritability"]]=df[["positive mood","negative mood","positive energy","negative energy","anxiety","irritability","measures","unknown sleep","wake"]].fillna(0)
df.iloc[:,14:-1]=df.iloc[:,14:-1].fillna("N")
df["panic"]=df["panic"].fillna(0)

1           N
2           N
3           N
4           N
        ...  
3002    277.0
3003      0.0
3004    132.0
3005      0.0
3006    132.0
Name: light sleep_1, Length: 3007, dtype: object' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:,14:-1]=df.iloc[:,14:-1].fillna("N")
1           N
2           N
3           N
4           N
        ...  
3002    554.0
3003      0.0
3004    317.0
3005      0.0
3006    317.0
Name: light sleep_2, Length: 3007, dtype: object' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:,14:-1]=df.iloc[:,14:-1].fillna("N")
1           N
2           N
3           N
4           N
        ...  
3002    347.0
3003      0.0
3004    176.0
3005      0.0
3006    176.0
Name: deep sleep, Length: 3007, dtype: object' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:,14:-1]=df.iloc[:,14:-1].fillna("N")
1              N
2    

In [244]:
df.set_index(["id","date"],inplace=True)

In [228]:
encoder = LabelEncoder()
df["mood"]=encoder.fit_transform(df["mood"])

In [246]:
scaler = MinMaxScaler()
df[["total_alcohol_consumption","total_caffeine","workout duration (minutes)"]]=scaler.fit_transform(df[["total_alcohol_consumption","total_caffeine","workout duration (minutes)"]])

In [247]:
df.isna().sum()

total_caffeine                   0
mood                             0
total_alcohol_consumption        0
positive mood                    0
negative mood                    0
positive energy                  0
negative energy                  0
anxiety                          0
irritability                     0
measures                         0
unknown sleep                    0
wake                             0
light sleep_1                    0
light sleep_2                    0
deep sleep                       0
sleep_duration                   0
sleep_efficiency                 0
sleep_in                         0
sleep_out                        0
amount smoked                    0
breakfast                     1860
lunch                         1860
dinner                        1860
morning snack                 1860
afternoon snack               1860
midnight snacks               1860
menstruation                  1860
workout duration (minutes)       0
panic               

In [248]:
df.select_dtypes("object").columns

Index(['mood', 'breakfast', 'lunch', 'dinner', 'morning snack',
       'afternoon snack', 'midnight snacks', 'menstruation'],
      dtype='object')

In [249]:
df=pd.get_dummies(df,columns=df.select_dtypes("object").columns.tolist())

In [250]:
df.isna().sum()

total_caffeine                0
total_alcohol_consumption     0
positive mood                 0
negative mood                 0
positive energy               0
negative energy               0
anxiety                       0
irritability                  0
measures                      0
unknown sleep                 0
wake                          0
light sleep_1                 0
light sleep_2                 0
deep sleep                    0
sleep_duration                0
sleep_efficiency              0
sleep_in                      0
sleep_out                     0
amount smoked                 0
workout duration (minutes)    0
panic                         0
mood_Anger                    0
mood_Depressed                0
mood_Excitement               0
mood_Fine                     0
mood_Grief                    0
mood_Happiness                0
mood_Irritated                0
mood_Joy                      0
mood_Normal                   0
mood_후회                       0
breakfas

In [256]:
X=df.drop(columns="panic")
y=df["panic"]

smote = SMOTE(sampling_strategy='auto', random_state=42)
X, y = smote.fit_resample(X, y)
def fit(model):
    score = cross_val_score(model,X,y,scoring="roc_auc_ovr",cv=5)
    return score.mean()

In [257]:
fit(LogisticRegression())

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

np.float64(0.7135969356577477)

In [262]:
fit(GaussianNB())

np.float64(0.7930771860399607)

In [275]:
fit(XGBClassifier(booster="dart",max_depth=3,learning_rate=0.1,min_child_weight=0.4))

np.float64(0.9477996794530759)