In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Import the required modules
!pip install autograd
from autograd import grad
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Logistic (sigmoid) function
def logistic(x):
    """Return the element-wise logistic sigmoid ``1 / (1 + exp(-x))``.

    The value is evaluated as ``exp(-logaddexp(0, -x))``.  This is the same
    function mathematically, but it never calls ``exp`` on a large positive
    number, so very negative inputs return 0.0 instead of overflowing to
    ``inf`` inside the denominator.

    Parameters
    ----------
    x : scalar or numpy array
        Input value(s).

    Returns
    -------
    Same shape as ``x``; every value lies in ``[0, 1]``.
    """
    # logaddexp(0, -x) == log(1 + exp(-x)), computed without overflow.
    out = np.exp(-np.logaddexp(0., -x))

    return out


# Logistic regression model
def logistic_model(x, params):
    """Evaluate the logistic-regression model on ``x``.

    ``params[0]`` holds the weights (combined with ``x`` through ``np.dot``)
    and ``params[1]`` the bias that is added to that product.  The result of
    ``logistic`` applied to this linear term is returned.
    """
    weights = params[0]
    bias = params[1]
    linear_term = np.dot(x, weights) + bias
    return logistic(linear_term)


# Classifier
def classify(x, params):
    """Convert model probabilities into hard labels.

    A sample is labelled 1.0 when ``logistic_model`` returns a probability of
    at least 0.5, and 0.0 otherwise.  Labels are returned as floats.
    """
    is_positive = logistic_model(x, params) >= 0.5
    return is_positive.astype(float)


# Cost function
def model_loss(x, true_labels, params, _lambda=1.0):
    """Summed binary cross-entropy plus an L2 penalty on the weights.

    ``x`` and ``params`` are forwarded to ``logistic_model``.  ``true_labels``
    is combined with the log-probabilities through ``np.dot``.  ``_lambda``
    scales the sum of squared weights (``params[0]``); the bias
    (``params[1]``) is not penalized.
    """
    eps = 1e-15  # keeps log() away from exactly zero
    pred = logistic_model(x, params)

    log_lik_pos = np.dot(true_labels, np.log(pred + eps))
    log_lik_neg = np.dot(1. - true_labels, np.log(1. - pred + eps))
    penalty = _lambda * np.sum(params[0] ** 2)

    return -(log_lik_pos + log_lik_neg) + penalty


# Function returning the gradient of model_loss with respect to its argument
# at position 2 (argnum=2, i.e. `params`), built with autograd's `grad`.
gradients = grad(model_loss, argnum=2)

In [4]:
# Evaluation metrics
def performance(predictions, answers, beta=1.0):
    """Compute precision, recall, F-beta score and accuracy for 0/1 labels.

    Parameters
    ----------
    predictions : numpy array
        Predicted labels; entries equal to 1 count as positive and entries
        equal to 0 as negative.
    answers : numpy array
        Ground-truth labels, compared against 1 and 0 to build boolean masks
        that index ``predictions``.
    beta : float, optional
        Weight of recall relative to precision in the F-score (default 1.0).

    Returns
    -------
    tuple
        ``(precision, recall, score, accuracy)``.  A metric whose denominator
        is zero (e.g. precision when nothing was predicted positive) is
        reported as 0.0 instead of raising ``ZeroDivisionError``.
    """

    def _ratio(numerator, denominator):
        # An undefined x/0 metric is reported as 0.0.
        return numerator / denominator if denominator else 0.0

    true_idx = (answers == 1)  # the location where the answers are 1
    false_idx = (answers == 0)  # the location where the answers are 0

    # true positive: answers are 1 and predictions are also 1
    n_tp = np.count_nonzero(predictions[true_idx] == 1)

    # false positive: answers are 0 but predictions are 1
    n_fp = np.count_nonzero(predictions[false_idx] == 1)

    # true negative: answers are 0 and predictions are also 0
    n_tn = np.count_nonzero(predictions[false_idx] == 0)

    # false negative: answers are 1 but predictions are 0
    n_fn = np.count_nonzero(predictions[true_idx] == 0)

    # precision, recall, and f-score
    precision = _ratio(n_tp, n_tp + n_fp)
    recall = _ratio(n_tp, n_tp + n_fn)
    score = _ratio(
        (1.0 + beta**2) * precision * recall,
        beta**2 * precision + recall,
    )

    accuracy = _ratio(n_tp + n_tn, n_tp + n_fn + n_fp + n_tn)

    return precision, recall, score, accuracy

In [5]:
# Load the training and test CSV files
df_wine, df_wine_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/goorm3competition/winequality-red.csv",
        "../input/goorm3competition/winequality-red_test.csv",
    )
)
# Peek at the first rows of the training data
df_wine.head()

In [6]:
# Show summary information about the training dataset
df_wine.info()

In [7]:
# Null check: count the missing values in each column
df_wine.isnull().sum()

In [8]:
# Explore the numeric features through their summary statistics
df_wine.describe()

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise correlation matrix of the training data, drawn as an annotated heatmap.
corr = df_wine.corr()
plt.figure(figsize=(11, 9))
sns.heatmap(corr, annot=True, cmap='Blues')

In [10]:
# Absolute correlation of every column with the target "quality"
corr_feat = corr["quality"].abs()

# Keep the entries whose absolute correlation exceeds 0.10
# ("quality" itself is among them, since it correlates perfectly with itself)
rel_feat = corr_feat[corr_feat.gt(0.10)]
rel_feat

In [11]:
# Names of the selected columns (the index of rel_feat)
rel_feat.index

In [12]:
# Remove the weakly correlated features from both the training and the test frame.
# "quality" is the target and is never dropped.
low_corr_cols = [
    col for col in df_wine.columns
    if col != "quality" and col not in rel_feat.index
]
df_wine = df_wine.drop(low_corr_cols, axis=1)
df_wine_test = df_wine_test.drop(low_corr_cols, axis=1)

In [13]:
# Binarize the target: quality in (0, 5] becomes label 0, quality in (5, 10] becomes label 1.
quality_bins = (0, 5, 10)
quality_labels = [0, 1]
df_wine['quality'] = pd.cut(df_wine['quality'], bins=quality_bins, labels=quality_labels)

In [14]:
# Target y is the "quality" column; the feature matrix X is every other column.
y = df_wine["quality"]
X = df_wine.drop(columns="quality")

In [15]:
# Hold out 10% of the rows as a validation set (shuffled, fixed seed).
# No scaling is applied to X before this split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=1,
)

In [16]:
# Display the training features
X_train

In [17]:
# Display the validation features
X_val

In [18]:
# Column order of the training features; the test frame is aligned to it below.
feature_cols = X_train.columns

# Per-feature mean and standard deviation, computed from the training split only.
mu = np.mean(X_train, axis=0)
sigma = np.std(X_train, axis=0)

# A constant feature has sigma == 0 and would turn into NaN/inf when divided;
# scale such a feature by 1 so it simply becomes all zeros after centering.
sigma = sigma.mask(sigma == 0, 1.0)

# Standardize the training, validation, and test datasets with the training statistics.
X_train = (X_train - mu) / sigma
X_val = (X_val - mu) / sigma
# Select the training columns explicitly so the test matrix has the same
# columns in the same order as the weights learned on X_train
# (a missing column raises KeyError instead of silently producing NaNs).
X_test = (df_wine_test[feature_cols] - mu) / sigma

# Convert everything to plain numpy arrays for the model functions.
X_train = X_train.to_numpy()
X_val = X_val.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()

In [19]:
# Start from all-zero weights (one per feature column) and a zero bias.
n_features = X_train.shape[1]
w = np.zeros((n_features,), dtype=float)
b = 0.0

In [20]:
# Modeling
# Rebind `np` to autograd's NumPy wrapper. The model functions look up the
# global `np` when they are called, so presumably this is what lets
# `gradients` differentiate through them -- keep this line before the loop.
from autograd import numpy as np

# learning rate
lr = 1e-5

# a variable for the change in validation loss
change = np.inf

# a counter for optimization iterations
i = 0

# a variable to store the validation loss from the previous iteration
# (a tiny non-zero start value so the first relative change is huge, not 0/0)
old_val_loss = 1e-15

# keep running if:
#   1. we still see significant changes in validation loss
#   2. iteration counter < 10000
while change >= 1e-5 and i < 10000:

    # calculate gradients and take one gradient-descent step
    # (w is updated in place)
    grads = gradients(X_train, y_train, (w, b))
    w -= (grads[0] * lr)
    b -= (grads[1] * lr)

    # validation loss
    val_loss = model_loss(X_val, y_val, (w, b))

    # calculate the relative change in validation loss
    change = np.abs((val_loss-old_val_loss)/old_val_loss)

    # update the counter and old_val_loss
    i += 1
    old_val_loss = val_loss

    # print the progress every 10 steps
    if i % 10 == 0:
        print("{}...".format(i), end="")

# Validation metrics are only reported after training, so they are computed
# once here from the final (w, b) rather than on every iteration.
pred_labels_val = classify(X_val, (w, b))
score = performance(pred_labels_val, y_val)

print("")
print("")
print("Upon optimization stopped:")
print("    Iterations:", i)
print("    Validation loss:", val_loss)
print("    Validation precision:", score[0])
print("    Validation recall:", score[1])
print("    Validation F-score:", score[2])
print("    Validation Accuracy:", score[3])
print("    Change in validation loss:", change)

In [21]:
# NOTE: despite the "test"/"Final" naming, these metrics are computed on the
# validation split (X_val, y_val).
pred_labels_test = classify(X_val, (w, b))
perf = performance(pred_labels_test, y_val)

# perf is (precision, recall, F-score, accuracy); report each as a percentage.
print("\n".join(
    "Final {}: {:.1f}%".format(metric_name, metric_value * 100)
    for metric_name, metric_value in zip(
        ("precision", "recall", "F-score", "Accuracy"), perf
    )
))

In [22]:
# Apply the trained parameters (w, b) to the standardized test features
final = classify(X_test, (w, b))

In [23]:
import numpy as np
# Convert the float 0./1. labels returned by classify into integers, then display them.
final = final.astype(int)
final

In [24]:
# Submission: one "quality" column, indexed by a 1-based "ID".
submission = pd.DataFrame(
    {"quality": final},
    index=pd.RangeIndex(start=1, stop=len(final) + 1, name="ID"),
)

submission.to_csv("submission.csv")

In [25]:
# Display the submission frame that was written to submission.csv
submission