In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../../../datasets/diabetes.csv')

In [3]:
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [4]:
from sklearn.model_selection import train_test_split

In [6]:
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

In [7]:
from statsmodels.api import Logit

In [17]:
model = Logit(endog=df_train.loc[:, ['Outcome']], exog=df_train.loc[:, ['BloodPressure', 'Glucose', 'BMI', 'Insulin']]).fit()

Optimization terminated successfully.
         Current function value: 0.626579
         Iterations 5


In [19]:
pred = model.predict(df_test.loc[:, ['BloodPressure', 'Glucose', 'BMI', 'Insulin']])
pred

236    0.462956
395    0.507051
36     0.359735
210    0.314389
483    0.219876
         ...   
650    0.398274
579    0.561881
119    0.281386
593    0.386267
310    0.280089
Length: 154, dtype: float64

In [22]:
pred_class = (pred > 0.5) + 0
pred_class

236    0
395    1
36     0
210    0
483    0
      ..
650    0
579    1
119    0
593    0
310    0
Length: 154, dtype: int32

In [20]:
from sklearn.metrics import accuracy_score

In [29]:
accuracy_score(y_true=df_test.loc[:, 'Outcome'], y_pred=pred_class).round(2)

0.7

In [32]:
model = Logit(endog=df.loc[:, ['Outcome']], exog=df.loc[:, ['Glucose', 'BMI', 'Age']]).fit()
model.params

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


Glucose    0.009368
BMI       -0.035639
Age       -0.012898
dtype: float64

In [33]:
import numpy as np

In [35]:
np.exp(model.params).round(2)

Glucose    1.01
BMI        0.96
Age        0.99
dtype: float64

In [37]:
from sklearn.metrics import roc_auc_score

In [39]:
model = Logit(endog=df.loc[:, ['Outcome']], exog=df.loc[:, ['Glucose', 'BMI', 'Age']]).fit()
pred = model.predict(df.loc[:, ['Glucose', 'BMI', 'Age']])
pred

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


0      0.387961
1      0.365506
2      0.615678
3      0.392087
4      0.336654
         ...   
763    0.261357
764    0.373590
765    0.453351
766    0.377879
767    0.375465
Length: 768, dtype: float64

In [42]:
roc_auc_score(y_true=df.loc[:, ['Outcome']], y_score=pred).round(2)

0.54

In [47]:
from sklearn.linear_model import LogisticRegression

In [49]:
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

In [52]:
model = LogisticRegression().fit(X=df_train.loc[:, ['BloodPressure', 'Glucose', 'BMI', 'Insulin']], y=df_train.loc[:, ['Outcome']])

  y = column_or_1d(y, warn=True)


In [53]:
pred = model.predict(df_test.loc[:, ['BloodPressure', 'Glucose', 'BMI', 'Insulin']])
pred

array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
      dtype=int64)

In [58]:
accuracy_score(y_true=df_test.loc[:, 'Outcome'], y_pred=pred)

0.7792207792207793

In [60]:
model.coef_

array([[-0.00530793,  0.03730708,  0.09567454, -0.00171317]])

In [61]:
df_sub = df.loc[df['BMI'] > 0, :]
df_sub.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [62]:
print(df.size, df_sub.size)

6912 6813


In [63]:
from sklearn.naive_bayes import GaussianNB

In [70]:
model = GaussianNB().fit(X=df_sub.iloc[:, :-1], y=df_sub.iloc[:, -1])

In [75]:
model.predict(X=df_sub.iloc[:, :-1])

array([1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,

In [78]:
df_sub['Outcome'].value_counts(normalize=True).round(2)

0    0.65
1    0.35
Name: Outcome, dtype: float64

In [81]:
model = GaussianNB().fit(X=df.iloc[:, :-1], y=df.iloc[:, -1])

In [83]:
pred = model.predict(X=df.iloc[:, :-1])
pred

array([1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,

In [85]:
accuracy_score(y_true=df.iloc[:, -1], y_pred=pred).round(2)

0.76

In [104]:
df_sub = df.loc[df['BMI'] > 0, :]
df_sub.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [105]:
df_sub['agg_group'] = (df_sub['Age'] // 10) * 10
df_sub['is_preg'] = (df_sub['Pregnancies'] > 0) + 0
df_sub.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['agg_group'] = (df_sub['Age'] // 10) * 10
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['is_preg'] = (df_sub['Pregnancies'] > 0) + 0


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,agg_group,is_preg
0,6,148,72,35,0,33.6,0.627,50,1,50,1
1,1,85,66,29,0,26.6,0.351,31,0,30,1


In [106]:
df_sub_train, df_sub_test = train_test_split(df_sub, train_size=0.8, random_state=123)

In [107]:
gaussian_model = GaussianNB().fit(
    X=df_sub_train.loc[:, ['is_preg', 'agg_group', 'BMI', 'Glucose']],
    y=df_sub_train.loc[:, ['Outcome']]
)
logistic_model = LogisticRegression().fit(
    X=df_sub_train.loc[:, ['is_preg', 'agg_group', 'BMI', 'Glucose']],
    y=df_sub_train.loc[:, ['Outcome']]
)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [110]:
gaussian_pred = gaussian_model.predict(X=df_sub_test.loc[:, ['is_preg', 'agg_group', 'BMI', 'Glucose']])
logistic_pred = logistic_model.predict(X=df_sub_test.loc[:, ['is_preg', 'agg_group', 'BMI', 'Glucose']])

In [112]:
accuracy_score(y_true=df_sub_test.loc[:, ['Outcome']], y_pred=gaussian_pred).round(2)

0.8

In [113]:
accuracy_score(y_true=df_sub_test.loc[:, ['Outcome']], y_pred=logistic_pred).round(2)

0.83

In [114]:
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [115]:
df_train, df_test = train_test_split(df, train_size=0.7, random_state=123)

In [116]:
from sklearn.neighbors import KNeighborsClassifier

In [117]:
model = KNeighborsClassifier().fit(X=df_train.loc[:, ['Pregnancies', 'Glucose', 'BloodPressure']], y=df_train.loc[:, ['Outcome']])

  return self._fit(X, y)


In [119]:
pred = model.predict(df_test.loc[:, ['Pregnancies', 'Glucose', 'BloodPressure']])

In [121]:
accuracy_score(y_true=df_test.loc[:, ['Outcome']], y_pred=pred).round(2)

0.73

In [124]:
df_sub = df
df_sub['is_preg'] = (df_sub['Pregnancies'] > 0) + 0
df_sub.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,is_pred,is_preg
0,6,148,72,35,0,33.6,0.627,50,1,1,1
1,1,85,66,29,0,26.6,0.351,31,0,1,1


In [132]:
df_sub_train, df_sub_test = train_test_split(df_sub, train_size=0.8, random_state=123)

In [135]:
def knn(k):
    model = KNeighborsClassifier(n_neighbors=k).fit(
        X=df_sub_train.loc[:, ['is_preg', 'Glucose', 'BloodPressure', 'Insulin', 'BMI']],
        y=df_sub_train.loc[:, ['Outcome']]
    )
    pred = model.predict(df_sub_test.loc[:, ['is_preg', 'Glucose', 'BloodPressure', 'Insulin', 'BMI']])
    acc = accuracy_score(y_true=df_sub_test.loc[:, ['Outcome']], y_pred=pred).round(2)
    print(f'k = {k}, Acc. = {acc}')

In [136]:
k_list = [3, 5, 10, 20]

for k in k_list:
    knn(k)

k = 3, Acc. = 0.71
k = 5, Acc. = 0.73
k = 10, Acc. = 0.78
k = 20, Acc. = 0.76


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


In [147]:
df = pd.read_csv('../../../datasets/diabetes.csv')

In [148]:
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [149]:
df['is_preg'] = (df['Pregnancies'] > 0) + 0
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,is_preg
0,6,148,72,35,0,33.6,0.627,50,1,1
1,1,85,66,29,0,26.6,0.351,31,0,1


In [150]:
from sklearn.metrics import mean_squared_error

In [151]:
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

In [152]:
X_cols = ['is_preg', 'Glucose', 'BloodPressure', 'Insulin']

neighbors = [3, 5, 10, 20]
rmses = []

for n_n in neighbors:
    model = KNeighborsRegressor(n_neighbors=n_n)
    model.fit(X=df_train.loc[:, X_cols], y=df_train['BMI'])
    pred = model.predict(df_test.loc[:, X_cols])
    rmse_sub = mean_squared_error(y_pred=pred, y_true=df_test['BMI']) ** 0.5
    rmses = rmses + [rmse_sub]

df_score = pd.DataFrame({'neighbors': neighbors, 'rmses': rmses})
df_score['rmses'] = df_score['rmses'].round(3)
df_score

Unnamed: 0,neighbors,rmses
0,3,8.508
1,5,8.706
2,10,8.517
3,20,8.514


In [153]:
df = pd.read_csv('../../../datasets/diabetes.csv')
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [154]:
from sklearn.tree import DecisionTreeClassifier

In [155]:
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

In [159]:
model = DecisionTreeClassifier(random_state=123).fit(
    X=df_train.loc[:, ['Glucose', 'BloodPressure', 'Pregnancies']],
    y=df_train['Outcome']
)

In [160]:
pred = model.predict(X=df_test.loc[:, ['Glucose', 'BloodPressure', 'Pregnancies']])

In [163]:
accuracy_score(y_true=df_test['Outcome'], y_pred=pred).round(2)

0.63

In [164]:
df = pd.read_csv('../../../datasets/diabetes.csv')
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [165]:
from sklearn.tree import DecisionTreeRegressor

In [166]:
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

In [167]:
model = DecisionTreeRegressor(random_state=123).fit(
    X=df_train.loc[:, ['Glucose', 'BloodPressure', 'SkinThickness']],
    y=df_train['BMI']
)

In [168]:
pred = model.predict(X=df_test.loc[:, ['Glucose', 'BloodPressure', 'SkinThickness']])

In [171]:
(mean_squared_error(y_true=df_test['BMI'], y_pred=pred) ** 0.5).round(1)

9.9

In [172]:
df = pd.read_csv('../../../datasets/diabetes.csv')
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [173]:
df_train, df_test = train_test_split(df, train_size=0.7, random_state=345)

In [177]:
X_cols = ['Glucose', 'BloodPressure', 'Pregnancies', 'BMI', 'Age']

for i in range(3, 7):
    model = DecisionTreeClassifier(max_depth=i, random_state=345).fit(
        X=df_train.loc[:, X_cols],
        y=df_train['Outcome']
    )
    pred = model.predict(X=df_test.loc[:, X_cols])
    acc = accuracy_score(y_pred=pred, y_true=df_test['Outcome']).round(2)
    print(f'Depth {i}, 정확도: {acc}')

Depth 3, 정확도: 0.77
Depth 4, 정확도: 0.76
Depth 5, 정확도: 0.76
Depth 6, 정확도: 0.77
