# 인터넷 신규 가입 여부 예측 실습 예제
- 종속 변수 : `Newbie`
- 독립 변수 제거 : `who`, `Country`, `Years on Internet`
- 결측 값 제거
  - `Age` : 평균
  - `Household Income`, `Sexual Preference`, `Marital Status` : 최빈 값으로 채우기

In [89]:
import pandas as pd
import numpy as np

In [90]:
internet_df = pd.read_csv('./exsamples/newbie.csv')
# 타겟 데이터 : Newbie

In [91]:
internet_df.drop(['who', 'Country', 'Years on Internet'], axis=1, inplace=True)

In [92]:
internet_df

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single
...,...,...,...,...,...,...,...,...
19578,0,22.0,Male,Over $100,Heterosexual,Some College,Education,Single
19579,0,19.0,Male,,Heterosexual,Some College,Education,Single
19580,0,49.0,Female,$50-74,Heterosexual,Doctoral,Education,Married
19581,1,42.0,Female,$50-74,Heterosexual,Some College,Other,Married


In [94]:
internet_df['Age'].fillna(internet_df['Age'].mean(), inplace=True)

internet_df

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single
...,...,...,...,...,...,...,...,...
19578,0,22.0,Male,Over $100,Heterosexual,Some College,Education,Single
19579,0,19.0,Male,,Heterosexual,Some College,Education,Single
19580,0,49.0,Female,$50-74,Heterosexual,Doctoral,Education,Married
19581,1,42.0,Female,$50-74,Heterosexual,Some College,Other,Married


In [96]:
internet_df['Household Income'].fillna(internet_df['Household Income'].value_counts().idxmax(), inplace=True)

internet_df['Sexual Preference'].fillna(internet_df['Sexual Preference'].value_counts().idxmax(), inplace=True)

internet_df['Marital Status'].fillna(internet_df['Marital Status'].value_counts().idxmax(), inplace=True)

internet_df

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single
...,...,...,...,...,...,...,...,...
19578,0,22.0,Male,Over $100,Heterosexual,Some College,Education,Single
19579,0,19.0,Male,$50-74,Heterosexual,Some College,Education,Single
19580,0,49.0,Female,$50-74,Heterosexual,Doctoral,Education,Married
19581,1,42.0,Female,$50-74,Heterosexual,Some College,Other,Married


In [80]:
import numpy as np
import pandas as pd
from datetime import datetime

In [24]:
def sigmoid(z):
  return 1 / (1 + np.exp(-z))

In [25]:
from pyrsistent import b


def loss_func(x, t):
  delta = 1e-7

  z = np.dot(x, W) + b
  y = sigmoid(z)

  return -np.sum(t * np.log(y + delta) + (1 - t) * np.log((1 - y) + delta))

In [26]:
def predict(test_data):
  z = np.dot(test_data, W) + b
  y = sigmoid(z)

  if y >= 0.5:
    result = 1
  else:
    result = 0

  return y, result

In [27]:
# 수치미분 함수(
def numerical_derivative(f, x): 
  delta_x = 1e-4 # 0.0001
  grad = np.zeros_like(x) #수치미분된 값 저장할 ndarray로 x와 같은 크기의 0으로 된 배열 생성

  it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])

  while not it.finished:
    idx = it.multi_index
    tmp_val = x[idx]
    x[idx] = float(tmp_val) + delta_x
    fx1 = f(x) # f(x+delta_x)

    x[idx] = float(tmp_val) - delta_x
    fx2 = f(x) # f(x-delta_x)
    grad[idx] = (fx1 - fx2) / (2*delta_x)

    x[idx] = tmp_val
    it.iternext() # 다음 인덱스로 이동

  return grad

In [28]:
x_data = np.array([2, 4, 6, 8, 10, 12, 14, 16, 18, 20]).reshape(10, 1)
t_data = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1]).reshape(10, 1)

W = np.random.rand(1, 1)
b = np.random.rand(1)

print("x_data.shape = ", x_data.shape, ", t_data.shape = ", t_data.shape)
print("W = ", W, ", W.shape = ", W.shape, ", b = ", b, ", b.shape = ", b.shape)

x_data.shape =  (10, 1) , t_data.shape =  (10, 1)
W =  [[0.3100285]] , W.shape =  (1, 1) , b =  [0.45276264] , b.shape =  (1,)


In [29]:
# 수치미분을 이용한 로지스틱 회귀(Logistic Regression) 시스템 학습과정
learning_rate = 1e-2

f = lambda x : loss_func(x_data, t_data)

print("Initial loss value = ", loss_func(x_data, t_data) )

start_time = datetime.now()

for step in range(70001):
 W -= learning_rate * numerical_derivative(f, W)
 b -= learning_rate * numerical_derivative(f, b)
 
 if (step % 5000 == 0):
  print("step = ", step, "loss value = ", loss_func(x_data, t_data))

Initial loss value =  16.406988284245053
step =  0 loss value =  8.962338979910884
step =  5000 loss value =  0.8452948481325607
step =  10000 loss value =  0.6210144344686422
step =  15000 loss value =  0.5095105264779427
step =  20000 loss value =  0.43779567333397185
step =  25000 loss value =  0.3861591687044705
step =  30000 loss value =  0.3465356180863853
step =  35000 loss value =  0.31485850281139605
step =  40000 loss value =  0.28879695356416946
step =  45000 loss value =  0.26689346609070325
step =  50000 loss value =  0.24817755196742455
step =  55000 loss value =  0.23197164648597593
step =  60000 loss value =  0.2177848932346432
step =  65000 loss value =  0.20525102313320379
step =  70000 loss value =  0.1940900396579655


In [30]:
test_data = np.array([3.0]) # 3 시간에 대한 미래 값 예측
(real_val_1, logical_val_1) = predict(test_data)
print("시그모이드 결과 =>", real_val_1, ", 로지스틱회귀 결과 =>", logical_val_1)

test_data = np.array([17.0]) # 17 시간에 대한 미래 값 예측
(real_val_2, logical_val_2) = predict(test_data)
print("시그모이드 결과 =>", real_val_2, ", 로지스틱회귀 결과 =>", logical_val_2)

시그모이드 결과 => [1.13896465e-10] , 로지스틱회귀 결과 => 0
시그모이드 결과 => [0.99990663] , 로지스틱회귀 결과 => 1


In [88]:
internet_df

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single
...,...,...,...,...,...,...,...,...
19578,0,22.0,Male,Over $100,Heterosexual,Some College,Education,Single
19579,0,19.0,Male,$50-74,Heterosexual,Some College,Education,Single
19580,0,49.0,Female,$50-74,Heterosexual,Doctoral,Education,Married
19581,1,42.0,Female,$50-74,Heterosexual,Some College,Other,Married


In [69]:
internet_df['Gender'] = internet_df['Gender'].astype('category')
internet_df['Household Income'] = internet_df['Household Income'].astype('category')
internet_df['Sexual Preference'] = internet_df['Sexual Preference'].astype('category')
internet_df['Education Attainment'] = internet_df['Education Attainment'].astype('category')
internet_df['Major Occupation'] = internet_df['Major Occupation'].astype('category')
internet_df['Marital Status'] = internet_df['Marital Status'].astype('category')

In [97]:
internet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Newbie                19583 non-null  int64  
 1   Age                   19583 non-null  float64
 2   Gender                19583 non-null  object 
 3   Household Income      19583 non-null  object 
 4   Sexual Preference     19583 non-null  object 
 5   Education Attainment  19583 non-null  object 
 6   Major Occupation      19583 non-null  object 
 7   Marital Status        19583 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.2+ MB


In [73]:
data_dummies = pd.get_dummies(internet_df)

data_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19583 entries, 0 to 19582
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Newbie                             19583 non-null  int64  
 1   Age                                19583 non-null  float64
 2   Gender_Female                      19583 non-null  uint8  
 3   Gender_Male                        19583 non-null  uint8  
 4   Household Income_$10-19            19583 non-null  uint8  
 5   Household Income_$20-29            19583 non-null  uint8  
 6   Household Income_$30-39            19583 non-null  uint8  
 7   Household Income_$40-49            19583 non-null  uint8  
 8   Household Income_$50-74            19583 non-null  uint8  
 9   Household Income_$75-99            19583 non-null  uint8  
 10  Household Income_Over $100         19583 non-null  uint8  
 11  Household Income_Under $10         19583 non-null  uin

In [50]:
dummies_train = data_dummies.iloc[:, 1:]

dummies_target = data_dummies.iloc[:, 0:1]

In [51]:
dummies_train

Unnamed: 0,Age,Gender_Female,Gender_Male,Household Income_$10-19,Household Income_$20-29,Household Income_$30-39,Household Income_$40-49,Household Income_$50-74,Household Income_$75-99,Household Income_Over $100,...,Major Occupation_Education,Major Occupation_Management,Major Occupation_Other,Major Occupation_Professional,Marital Status_Divorced,Marital Status_Married,Marital Status_Other,Marital Status_Separated,Marital Status_Single,Marital Status_Widowed
0,54.0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,39.0,1,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
2,49.0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,22.0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,20.0,0,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19578,22.0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
19579,19.0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
19580,49.0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
19581,42.0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0


In [52]:
dummies_target

Unnamed: 0,Newbie
0,0
1,0
2,1
3,1
4,0
...,...
19578,0
19579,0
19580,0
19581,1


In [98]:
# 모델 생성 및 학습
from sklearn.linear_model import LogisticRegression

# 학습용 데이터/검증용 데이터 생성
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(dummies_train, dummies_target, random_state=42)

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(solver='liblinear')
model.fit(X_train_scaled, y_train)

print(X_train_scaled)
print('--------------------------')

print(X_test_scaled)
print('--------------------------')

model = LogisticRegression(solver='liblinear')
model.fit(X_train_scaled, y_train)

[[0.17333333 0.         1.         ... 0.         1.         0.        ]
 [0.52       1.         0.         ... 0.         0.         0.        ]
 [0.37333333 1.         0.         ... 0.         0.         0.        ]
 ...
 [0.25333333 0.         1.         ... 0.         0.         0.        ]
 [0.49333333 0.         1.         ... 0.         0.         0.        ]
 [0.4        0.         1.         ... 0.         0.         0.        ]]
--------------------------
[[0.26666667 1.         0.         ... 0.         1.         0.        ]
 [0.40296359 0.         1.         ... 0.         0.         0.        ]
 [0.30666667 0.         1.         ... 0.         1.         0.        ]
 ...
 [0.49333333 1.         0.         ... 1.         0.         0.        ]
 [0.53333333 0.         1.         ... 0.         0.         0.        ]
 [0.26666667 1.         0.         ... 0.         0.         0.        ]]
--------------------------


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LogisticRegression(solver='liblinear')

In [102]:
score_train_minmax = model.score(X_train_scaled, y_train)
score_train_minmax

0.761217403145639

In [101]:
score_test_minmax = model.score(X_test_scaled, y_test)
score_test_minmax

0.7546977124183006