### Import thư viện

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Xây dựng mô hình

In [2]:
class LogisticRegressionFromScratch:
    def __init__(self, learning_rate = 0.01, num_iterations = 1500):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.w = None
        self.b = None
        self.costs = []
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    def fit(self, X, y):
        """
        X: m samples, n features (m, n)
        y: m target values (m, 1)
        """
        m, n = X.shape
        # Initialize parameters
        self.w = np.zeros((n, 1))
        self.b = 0

        # Gradient descent loop
        for i in range(self.num_iterations):
            # Compute function f_wb, cost
            f_wb = self.sigmoid(np.dot(X, self.w) + self.b)
            cost = 1 / m * np.sum(-y * np.log(f_wb) - (1 - y) * np.log(1 - f_wb))
            
            # Compute gradient
            dz = f_wb - y
            dw = 1 / m * np.dot(X.T, dz)
            db = 1 / m * np.sum(dz)

            self.w -= self.learning_rate * dw
            self.b -= self.learning_rate * db
    
            if i % 100 == 0:
                self.costs.append(cost)
                print(f"Cost at iteration {i}: {cost}")
    def predict(self, X):
        f_wb = self.sigmoid(np.dot(X, self.w) + self.b)
        predictions = (f_wb >= 0.5).astype(int)
        return predictions

### Đọc dữ liệu và tiền xử lí

In [3]:
df = pd.read_csv('Data/SAHeart.csv')

In [4]:
df.head()

Unnamed: 0,row.names,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,1,160,12.0,5.73,23.11,Present,49,25.3,97.2,52,1
1,2,144,0.01,4.41,28.61,Absent,55,28.87,2.06,63,1
2,3,118,0.08,3.48,32.28,Present,52,29.14,3.81,46,0
3,4,170,7.5,6.41,38.03,Present,51,31.99,24.26,58,1
4,5,134,13.6,3.5,27.78,Present,60,25.99,57.34,49,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   row.names  462 non-null    int64  
 1   sbp        462 non-null    int64  
 2   tobacco    462 non-null    float64
 3   ldl        462 non-null    float64
 4   adiposity  462 non-null    float64
 5   famhist    462 non-null    object 
 6   typea      462 non-null    int64  
 7   obesity    462 non-null    float64
 8   alcohol    462 non-null    float64
 9   age        462 non-null    int64  
 10  chd        462 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 39.8+ KB


In [6]:
df['famhist'] = df['famhist'].apply(lambda x: 1 if x == 'Present' else 0)

### Chia tập huấn luyện

In [7]:
X = df.drop('chd', axis=1).values
y = df['chd'].values.reshape(-1, 1)

In [8]:
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
split_idx = int(0.8 * len(indices))

train_idx, test_idx = indices[:split_idx], indices[split_idx:]
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [9]:
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

X_train = (X_train - mean) / std
X_test = (X_test - mean) / std


X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])


### Thực nghiệm

In [10]:
model = LogisticRegressionFromScratch(learning_rate = 0.01, num_iterations = 1500)

In [11]:
model.fit(X_train, y_train)

Cost at iteration 0: 0.6931471805599453
Cost at iteration 100: 0.599005457185238
Cost at iteration 200: 0.5638132163302929
Cost at iteration 300: 0.5469158332077263
Cost at iteration 400: 0.5374179088388812
Cost at iteration 500: 0.531538919933215
Cost at iteration 600: 0.5276645291846304
Cost at iteration 700: 0.5249975938790927
Cost at iteration 800: 0.5231021374821273
Cost at iteration 900: 0.5217214198748183
Cost at iteration 1000: 0.5206956802440316
Cost at iteration 1100: 0.5199212075704416
Cost at iteration 1200: 0.5193283953158289
Cost at iteration 1300: 0.5188692549482729
Cost at iteration 1400: 0.518509962071242


In [12]:
y_pred = model.predict(X_test)

### Đánh giá mô hình

In [13]:
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score,precision_score,recall_score
print('accuracy score : ',accuracy_score(y_pred,y_test))
print('precision_score : ',precision_score(y_pred,y_test))
print('recall score : ',recall_score(y_pred,y_test))
print('f1_score : ',f1_score(y_pred,y_test))
print('roc_auc_score : ',roc_auc_score(y_pred,y_test))

accuracy score :  0.7419354838709677
precision_score :  0.5714285714285714
recall score :  0.4444444444444444
f1_score :  0.5
roc_auc_score :  0.654040404040404
