In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv("dataset/data_total_final_ml.csv")

In [5]:
data.sample(5)

Unnamed: 0,user_id,stress_total,education,age,relationship,pre_meditation_frequency,gender,race,t0_completed_date,awareness_t0_m,...,wellbeing_t2_m,days_used,total_elapsed_time,total_session_completed,prac_learn_ratio,cv_daily_elapsed_time,cv_interval,lcs_score_sum,4_week_after,8_week_after
106014,us-east-1:9994dd79-a9b3-4a4b-a2a6-50d86f4e0ada,18,,,,,,,12/01/2021,3.75,...,,,,,,,,,no,no
48972,us-east-1:4721f561-9ff7-4038-a0bc-f93f7a235ee6,16,,,,,,,05/09/2022,3.5,...,,,,,,,,,no,no
102502,us-east-1:9490d92f-831a-4030-8cca-39e48a28895d,29,,,,,,,09/16/2022,3.25,...,,,,,,,,,no,no
49227,us-east-1:477da7c8-af8b-49c5-9d79-29f4e879067b,23,,,,,,,08/18/2023,3.25,...,,,,,,,,,no,no
120639,us-east-1:ae6c9ffc-86f2-4f7a-852a-2f705e8a92ed,21,advanced_degree,25-34,single_never_married,didnt_meditate,male,mid_east_north_afr,01/24/2022,3.5,...,,,,,,,,,yes,no


## Prediction Model (Classification)

### Objective : predict whether people keep use app after 4 weeks of use (App retention)
### IV : stress_total, education, age, relationship, pre_meditation_frequency, gender, race, awareness_t0_m, insight_t0_m, purpose_t0_m, connection_t0_m, days_used, total_elapsed_time, total_session_completed, prac_learn_ratio, cv_daily_elapsed_time, cv_interval, lcs_score_sum
### DV : 4_week_after, 8_week_after
### Models : KNN, Random Forest, ANN, Logistic Regression

## Workflow of Project

![](Workflow.jpg)

## A. Data Cleaning

In [91]:
data_class1 = data[['stress_total', 'education', 'age', 'relationship', 'pre_meditation_frequency', 'gender', 'race', 
                    'wellbeing_t0_m', 'wellbeing_t1_m',
                    'days_used', 'total_elapsed_time', 'total_session_completed', 'prac_learn_ratio', 'cv_daily_elapsed_time',
                    'cv_interval', 'lcs_score_sum', '4_week_after', '8_week_after']]

In [93]:
data_class1.shape

(176875, 18)

In [95]:
data_class1.dtypes

stress_total                  int64
education                    object
age                          object
relationship                 object
pre_meditation_frequency     object
gender                       object
race                         object
wellbeing_t0_m              float64
wellbeing_t1_m              float64
days_used                   float64
total_elapsed_time          float64
total_session_completed     float64
prac_learn_ratio            float64
cv_daily_elapsed_time       float64
cv_interval                 float64
lcs_score_sum               float64
4_week_after                 object
8_week_after                 object
dtype: object

In [97]:
data_class1['4_week_after'].value_counts(normalize = True)

4_week_after
no     0.81293
yes    0.18707
Name: proportion, dtype: float64

In [99]:
data_class1['8_week_after'].value_counts(normalize = True)

8_week_after
no     0.904079
yes    0.095921
Name: proportion, dtype: float64

In [101]:
data_class1['4_week_after'] = data_class1['4_week_after'].replace({"yes":1,
                                                                  "no":0})
data_class1['8_week_after'] = data_class1['8_week_after'].replace({"yes":1,
                                                                  "no":0})

  data_class1['4_week_after'] = data_class1['4_week_after'].replace({"yes":1,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_class1['4_week_after'] = data_class1['4_week_after'].replace({"yes":1,
  data_class1['8_week_after'] = data_class1['8_week_after'].replace({"yes":1,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_class1['8_week_after'] = data_class1['8_week_after'].replace({"yes":1,


In [27]:
{col: data_class1[col].unique() for col in data_class1.select_dtypes(include='object').columns}

{'education': array([nan, 'advanced_degree', 'high_school', 'bachelors_degree',
        'some_college_or_associates', 'prefer_not_to_say',
        'some_high_school', 'middle_school', 'none', 'primary_school'],
       dtype=object),
 'age': array([nan, '35-44', '25-34', '55-64', '45-54', 'prefer_not_to_say',
        '19-24', '65-74', '75_or_older', '18-24'], dtype=object),
 'relationship': array([nan, 'married_domestic_part', 'separated', 'committed',
        'single_never_married', 'divorced', 'prefer_not_to_say', 'widowed'],
       dtype=object),
 'pre_meditation_frequency': array([nan, 'didnt_meditate', '1-2', 'less_than_once', '3-4',
        'prefer_not_to_say', '5-6', '7_or_more'], dtype=object),
 'gender': array([nan, 'male', 'female', 'prefer_not_to_say', 'fluid_nonbinary',
        'other'], dtype=object),
 'race': array([nan, 'white', 'other', 'hisp_latin_span', 'asian',
        'prefer_not_to_say', 'mid_east_north_afr', 'nat_hi_pac_isl',
        'am_indian_ak_native', 'black_a

### 1. Dimensionality reduction of categorical variables

In [103]:
data_class1['education'] = data_class1['education'].replace({"advanced_degree":"college_or_higher",
                                                             "bachelors_degree":"college_or_higher",
                                                            "some_college_or_associates":"college_or_higher",
                                                            "high_school":"lower_than_college",
                                                            "some_high_school":"lower_than_college",
                                                            "middle_school":"lower_than_college",
                                                            "primary_school":"lower_than_college",
                                                            "none":"lower_than_college"})

data_class1['age'] = data_class1['age'].replace({"75_or_older":"55-",
                                                 "65-74":"55-",
                                                 "35-44":"35-54",
                                                 "25-34":"18-34",
                                                 "55-64":"55-",
                                                 "45-54":"35-54",
                                                 "19-24":"18-34",
                                                 "55-64":"55-",
                                                 "18-24":"18-34"
                                                })

data_class1['relationship'] = data_class1['relationship'].replace({"married_domestic_part":"having_partner",
                                                                  "single_never_married":"single",
                                                                  "committed":"having_partner",
                                                                  "divorced":"single",
                                                                  "separated":"single",
                                                                  "widowed":"single"})

data_class1['pre_meditation_frequency'] = data_class1['pre_meditation_frequency'].replace({"didnt_meditate":"no_experience",
                                                                                          "less_than_once":"no_experience",
                                                                                          "1-2":"experienced",
                                                                                          "3-4":"experienced",
                                                                                          "5-6":"experienced",
                                                                                          "7_or_more":"experienced"})
data_class1['gender'] = data_class1['gender'].replace({"female":"female",
                                                      "male":"male",
                                                      "fluid_nonbinary":"other"})

data_class1['race'] = data_class1['race'].replace({"asian":"asian",
                                                  "white":"white",
                                                  "hisp_latin_span":"hisp",
                                                  "mid_east_north_afr":"black",
                                                  "black_afr_amer":"black",
                                                  "am_indian_ak_native":"other",
                                                  "nat_hi_pac_isl":"other"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_class1['education'] = data_class1['education'].replace({"advanced_degree":"college_or_higher",
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_class1['age'] = data_class1['age'].replace({"75_or_older":"55-",
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_class1['relationship'] = data_

In [183]:
{col: data_class1[col].unique() for col in data_class1.select_dtypes(include='object').columns}

{'education': array([nan, 'college_or_higher', 'lower_than_college',
        'prefer_not_to_say'], dtype=object),
 'age': array([nan, '35-54', '18-34', '55-', 'prefer_not_to_say'], dtype=object),
 'relationship': array([nan, 'having_partner', 'single', 'prefer_not_to_say'], dtype=object),
 'pre_meditation_frequency': array([nan, 'no_experience', 'experienced', 'prefer_not_to_say'],
       dtype=object),
 'gender': array([nan, 'male', 'female', 'prefer_not_to_say', 'other'], dtype=object),
 'race': array([nan, 'white', 'other', 'hisp', 'asian', 'prefer_not_to_say',
        'black'], dtype=object)}

In [203]:
data_class1.sample(5)

Unnamed: 0,stress_total,education,age,relationship,pre_meditation_frequency,gender,race,wellbeing_t0_m,wellbeing_t1_m,days_used,total_elapsed_time,total_session_completed,prac_learn_ratio,cv_daily_elapsed_time,cv_interval,lcs_score_sum,4_week_after,8_week_after
121945,22,lower_than_college,18-34,single,no_experience,male,hisp,13.333333,,,,,,,,,0,0
29116,27,,,,,,,12.166667,,,,,,,,,0,0
27650,19,college_or_higher,35-54,having_partner,experienced,female,asian,14.75,,,,,,,,,0,1
33288,22,college_or_higher,55-,having_partner,no_experience,female,white,13.083333,,,,,,,,,0,0
68301,28,,,,,,,12.416667,,,,,,,,,0,0


In [209]:
data_class1.to_csv("dataset/modeling_data.csv", index=False)

In [105]:
data_class1['4_week_after'].value_counts(normalize = True)

4_week_after
0    0.81293
1    0.18707
Name: proportion, dtype: float64

## Cleaning Missing Data

In [107]:
missing_ratio = data_class1.isnull().mean() * 100
print(missing_ratio.sort_values(ascending=False))

cv_interval                 90.807067
cv_daily_elapsed_time       90.368339
prac_learn_ratio            90.053993
lcs_score_sum               88.617385
days_used                   88.611731
total_elapsed_time          88.611731
total_session_completed     88.611731
wellbeing_t1_m              81.293004
race                        63.419929
gender                      63.287067
pre_meditation_frequency    63.201131
education                   63.194912
age                         63.134417
relationship                63.132155
wellbeing_t0_m               3.687915
4_week_after                 0.000000
stress_total                 0.000000
8_week_after                 0.000000
dtype: float64


In [205]:
## Omit rows whose missing rate exceeds 30% of all columns
row_na_ratio = data_class1.isna().mean(axis=1)
data_cleaned = data_class1[row_na_ratio <= 0.4]

In [207]:
missing_ratio = data_cleaned.isnull().mean() * 100
print(missing_ratio.sort_values(ascending=False))

cv_interval                 45.855673
cv_daily_elapsed_time       43.418029
prac_learn_ratio            41.922808
lcs_score_sum               34.819674
days_used                   34.796364
total_elapsed_time          34.796364
total_session_completed     34.796364
race                        12.028373
gender                      11.908488
pre_meditation_frequency    11.735323
age                         11.731992
relationship                11.728662
education                   11.728662
wellbeing_t1_m               0.999034
wellbeing_t0_m               0.702654
4_week_after                 0.000000
stress_total                 0.000000
8_week_after                 0.000000
dtype: float64


In [133]:
data_cleaned['4_week_after'].value_counts(normalize=True)

4_week_after
1    0.99001
0    0.00999
Name: proportion, dtype: float64

In [135]:
data_cleaned['8_week_after'].value_counts(normalize=True)

8_week_after
0    0.662493
1    0.337507
Name: proportion, dtype: float64

In [137]:
data_cleaned.shape

(30029, 18)

In [125]:
data_cleaned.corr(numeric_only=True)

Unnamed: 0,stress_total,wellbeing_t0_m,wellbeing_t1_m,days_used,total_elapsed_time,total_session_completed,prac_learn_ratio,cv_daily_elapsed_time,cv_interval,lcs_score_sum,4_week_after,8_week_after
stress_total,1.0,-0.361544,-0.330775,-0.058204,-0.029999,-0.029076,0.018452,0.045414,0.056455,-0.027981,-0.007081,0.002036
wellbeing_t0_m,-0.361544,1.0,0.728169,-0.006246,-0.006847,0.00183,-0.008717,0.002033,0.023769,0.004933,0.012945,0.007645
wellbeing_t1_m,-0.330775,0.728169,1.0,0.13263,0.113842,0.124833,0.009035,0.001601,0.009013,0.133045,,0.048357
days_used,-0.058204,-0.006246,0.13263,1.0,0.793075,0.83135,0.132051,-0.01361,-0.2041,0.845421,-0.019674,0.240455
total_elapsed_time,-0.029999,-0.006847,0.113842,0.793075,1.0,0.90755,0.131931,0.094919,0.117388,0.862361,-0.018234,0.188444
total_session_completed,-0.029076,0.00183,0.124833,0.83135,0.90755,1.0,0.0945,0.116092,0.17671,0.955224,-0.021631,0.191894
prac_learn_ratio,0.018452,-0.008717,0.009035,0.132051,0.131931,0.0945,1.0,0.011927,-0.00926,0.087995,0.001239,0.028106
cv_daily_elapsed_time,0.045414,0.002033,0.001601,-0.01361,0.094919,0.116092,0.011927,1.0,0.275524,0.118812,-0.020252,-0.010202
cv_interval,0.056455,0.023769,0.009013,-0.2041,0.117388,0.17671,-0.00926,0.275524,1.0,0.145761,0.000218,-0.05679
lcs_score_sum,-0.027981,0.004933,0.133045,0.845421,0.862361,0.955224,0.087995,0.118812,0.145761,1.0,-0.023666,0.214232


## Train_test Split

In [139]:
# split datsets into helidin/heldout set
from sklearn.model_selection import train_test_split
X = data_cleaned.drop(['8_week_after', '4_week_after'], axis='columns')
y = data_cleaned['8_week_after']
X_heldin, X_heldout, y_heldin, y_heldout = train_test_split(X,y,test_size=0.2, random_state = 112)

In [141]:
# split heldin datsets into train, valid sets
X_train, X_valid, y_train, y_valid = train_test_split(X_heldin,y_heldin, test_size=0.2, random_state = 113)

In [143]:
print(X_train.shape)
print(X_valid.shape)
print(X_heldout.shape)
print(y_train.shape)
print(y_valid.shape)
print(y_heldout.shape)

(19218, 16)
(4805, 16)
(6006, 16)
(19218,)
(4805,)
(6006,)


## Multiple Imputation(MICE)

In [145]:
import miceforest as mf

In [147]:
for col in X_train:
    if X_train[col].dtypes == 'object':
        X_train[col] = X_train[col].astype("category")

for col in X_valid:
    if X_valid[col].dtypes == 'object':
        X_valid[col] = X_valid[col].astype("category")

In [149]:
X_train_fix = X_train.reset_index(drop=True).copy()
y_train_fix = y_train.reset_index(drop=True).copy()

In [151]:
kernel = mf.ImputationKernel(
    data = X_train_fix,
    num_datasets=10,
    save_all_iterations_data=True,
    random_state = 1,
    mean_match_candidates=0)

In [153]:
# index 일치
np.mean(X_train_fix.index.to_numpy() == y_train_fix.index.to_numpy())

1.0

In [155]:
kernel.mice(iterations=5, verbose = True)

Initialized logger with name MICE Iterations 1 - 5 and 4 levels
1 Dataset 0
 | cv_interval | cv_daily_elapsed_time | prac_learn_ratio | lcs_score_sum | days_used | total_elapsed_time | total_session_completed | race | gender | pre_meditation_frequency | age | education | relationship | wellbeing_t1_m | wellbeing_t0_m
Dataset 1
 | cv_interval | cv_daily_elapsed_time | prac_learn_ratio | lcs_score_sum | days_used | total_elapsed_time | total_session_completed | race | gender | pre_meditation_frequency | age | education | relationship | wellbeing_t1_m | wellbeing_t0_m
Dataset 2
 | cv_interval | cv_daily_elapsed_time | prac_learn_ratio | lcs_score_sum | days_used | total_elapsed_time | total_session_completed | race | gender | pre_meditation_frequency | age | education | relationship | wellbeing_t1_m | wellbeing_t0_m
Dataset 3
 | cv_interval | cv_daily_elapsed_time | prac_learn_ratio | lcs_score_sum | days_used | total_elapsed_time | total_session_completed | race | gender | pre_meditation

In [157]:
X_train_imp = [kernel.complete_data(dataset=i) for i in range(10)]

In [223]:
for i in range(10):
    print((X_train_imp[i].index == y_train_fix.index).all())

True
True
True
True
True
True
True
True
True
True


In [161]:
# Make 1 Imputed validation set
X_valid_fix=X_valid.reset_index(drop=True).copy()
y_valid_fix=y_valid.reset_index(drop=True).copy()

X_valid_imp = kernel.impute_new_data(X_valid_fix).complete_data(0)
X_valid_imp = pd.DataFrame(X_valid_imp, columns=X_valid_fix.columns, index=X_valid_fix.index)

In [163]:
print((X_valid_imp.index == y_valid_fix.index).all())

True


## One hot encoding

In [165]:
cat_columns = ['education', 'age', 'relationship', 'pre_meditation_frequency', 'gender', 'race']

# train set
for i in range(10):
    # one hot encoding for categorical variables
    X_train_imp[i] = pd.get_dummies(data=X_train_imp[i], columns = cat_columns, drop_first = True)
    
    for col in X_train_imp[i].columns:
        if X_train_imp[i][col].dtypes == 'bool':
            X_train_imp[i][col] = X_train_imp[i][col].astype(int)

# validation set
X_valid_imp = pd.get_dummies(data=X_valid_imp, columns = cat_columns, drop_first = True)
for col in X_valid_imp:
    if X_valid_imp[col].dtypes == 'bool':
        X_valid_imp[col] = X_valid_imp[col].astype(int)

## Standard Scaling

In [167]:
from sklearn.preprocessing import StandardScaler
scaling_column = ['wellbeing_t1_m', 'wellbeing_t0_m', 
                 'days_used', 'total_elapsed_time', 'total_session_completed', 'prac_learn_ratio', 'cv_daily_elapsed_time',
                 'cv_interval', 'lcs_score_sum']
X_train_imp_scaled = []
scalers = []

# train set
for i in range(10):
    scaler = StandardScaler()
    Xt = X_train_imp[i].copy()
    Xt[scaling_column] = scaler.fit_transform(Xt[scaling_column])
    X_train_imp_scaled.append(Xt)
    scalers.append(scaler)

# validation set
X_valid_imp_scaled = X_valid_imp.copy()
X_valid_imp_scaled[scaling_column] = scaler.transform(X_valid_imp_scaled[scaling_column])

## 4. Fit the Model (KNN)

In [169]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from pandas.api.types import is_bool_dtype

In [171]:
knn = KNeighborsClassifier()
pipe = Pipeline([('knn', knn)])
k_values = list(range(1,20))
param_grid = {'knn__n_neighbors':k_values}
full_cv_classifier = GridSearchCV(pipe, param_grid, cv=5, scoring = 'accuracy')

In [193]:
knn_matrix = []

for i in range(10):

    # Full Cross Validation Grid Search for finding best K-value
    full_cv_classifier.fit(X_train_imp_scaled[i], y_train_fix)
    best_k_value = full_cv_classifier.best_params_['knn__n_neighbors']

    # Create KNN model and fit the model
    knn_model = KNeighborsClassifier(n_neighbors = best_k_value)
    knn_model.fit(X_train_imp_scaled[i], y_train_fix)
    knn_y_preds = knn_model.predict(X_valid_imp_scaled)

    # Performance metrics (validation set)
    knn_metric = classification_report(y_valid_fix, knn_y_preds, output_dict = True)
    accuracy = knn_metric['accuracy']
    knn_matrix.append({'imputation':i, 'k_values':best_k_value, 'accuracy':accuracy})

In [195]:
knn_matrix

[{'imputation': 0, 'k_values': 18, 'accuracy': 0.6457856399583767},
 {'imputation': 1, 'k_values': 18, 'accuracy': 0.6472424557752341},
 {'imputation': 2, 'k_values': 18, 'accuracy': 0.6449531737773153},
 {'imputation': 3, 'k_values': 18, 'accuracy': 0.6505723204994797},
 {'imputation': 4, 'k_values': 18, 'accuracy': 0.6464099895941727},
 {'imputation': 5, 'k_values': 18, 'accuracy': 0.6462018730489074},
 {'imputation': 6, 'k_values': 18, 'accuracy': 0.6509885535900104},
 {'imputation': 7, 'k_values': 18, 'accuracy': 0.6493236212278877},
 {'imputation': 8, 'k_values': 16, 'accuracy': 0.6505723204994797},
 {'imputation': 9, 'k_values': 18, 'accuracy': 0.6449531737773153}]

## 5. Fit the Model (Random Forest)

In [90]:
from sklearn.ensemble import RandomForestClassifier

In [94]:
rf_matrix = []

n_estimators = [128, 256, 512]
max_features = ["sqrt", "log2", 2]
param_grid_rf = {'n_estimators': n_estimators, 'max_features': max_features}
                
rfc = RandomForestClassifier(random_state = 110, n_jobs = -1, bootstrap = True)
grid = GridSearchCV(rfc, param_grid_rf)

for i in range(10):
    grid.fit(X_train_imp[i], y_train_fix)
    best_est = grid.best_params_['n_estimators']
    best_feature = grid.best_params_['max_features']

    # Create RF model and fit the model
    rf_model = RandomForestClassifier(n_estimators = best_est, max_features = best_feature, 
                                      n_jobs = -1, bootstrap = True)
    rf_model.fit(X_train_imp[i], y_train_fix)
    rf_y_preds = rf_model.predict(X_valid_imp)

    # Performance metrics (validation set)
    rf_metric = classification_report(y_valid_fix, rf_y_preds, output_dict = True)
    accuracy = rf_metric['accuracy']
    rf_matrix.append({'imputation':i, 'n_estimators':best_est, 'max_features': best_feature, 'accuracy':accuracy})

In [96]:
rf_matrix

[{'imputation': 0,
  'n_estimators': 256,
  'max_features': 'sqrt',
  'accuracy': 0.5571236559139785},
 {'imputation': 1,
  'n_estimators': 512,
  'max_features': 'sqrt',
  'accuracy': 0.558083717357911},
 {'imputation': 2,
  'n_estimators': 512,
  'max_features': 'sqrt',
  'accuracy': 0.5535714285714286},
 {'imputation': 3,
  'n_estimators': 512,
  'max_features': 'sqrt',
  'accuracy': 0.557315668202765},
 {'imputation': 4,
  'n_estimators': 512,
  'max_features': 'sqrt',
  'accuracy': 0.5577956989247311},
 {'imputation': 5,
  'n_estimators': 512,
  'max_features': 'sqrt',
  'accuracy': 0.5551075268817204},
 {'imputation': 6,
  'n_estimators': 512,
  'max_features': 'sqrt',
  'accuracy': 0.5564516129032258},
 {'imputation': 7,
  'n_estimators': 512,
  'max_features': 'sqrt',
  'accuracy': 0.5600038402457758},
 {'imputation': 8,
  'n_estimators': 256,
  'max_features': 'sqrt',
  'accuracy': 0.5524193548387096},
 {'imputation': 9,
  'n_estimators': 512,
  'max_features': 'sqrt',
  'accu

## 6. Logistic Regression

In [177]:
from sklearn.linear_model import LogisticRegressionCV

In [179]:
log_matrix = []

for i in range(10):
    log_model = LogisticRegressionCV()
    log_model.fit(X_train_imp_scaled[i], y_train_fix)
    preds = log_model.predict(X_valid_imp_scaled)
    log_metric = classification_report(y_valid_fix, preds, output_dict = True)
    accuracy = log_metric['accuracy']
    log_matrix.append({'imputation':i, 'accuracy':accuracy})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [181]:
log_matrix

[{'imputation': 0, 'accuracy': 0.6609781477627471},
 {'imputation': 1, 'accuracy': 0.6588969823100936},
 {'imputation': 2, 'accuracy': 0.6618106139438086},
 {'imputation': 3, 'accuracy': 0.6586888657648283},
 {'imputation': 4, 'accuracy': 0.6582726326742976},
 {'imputation': 5, 'accuracy': 0.659105098855359},
 {'imputation': 6, 'accuracy': 0.6626430801248699},
 {'imputation': 7, 'accuracy': 0.659105098855359},
 {'imputation': 8, 'accuracy': 0.660353798126951},
 {'imputation': 9, 'accuracy': 0.6593132154006244}]

## Campare three model

In [104]:
y_valid.value_counts(normalize=True)

4_week_after
0    0.59255
1    0.40745
Name: proportion, dtype: float64

In [106]:
y_train.value_counts(normalize=True)

4_week_after
0    0.593586
1    0.406414
Name: proportion, dtype: float64

## 8. Feature Analysis (RF)

In [546]:
X_train_tensor[0].shape[0]

113200