Создать нейронную сеть с нуля, т.е. не используя готовые библиотеки. Пример работы на любом табличном датасете. 
Сделать класс, в котором реализована возможность задать количество нейронов в скрытом слое и провести обучение.

In [406]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Загружаем датасет с данными о заболеваниях сердца, делим на обучающую и тестовую выборку, в качестве целевой переменной беру Disease (0 - нет патологии, 1 - есть). Так же выполнила one-hot encoding и нормализацию

In [407]:
def load_dataset():
    df = pd.read_csv('heart_disease.csv')
    df_encoded = pd.get_dummies(df, drop_first=True)

    X = df_encoded.drop('Disease', axis=1).values
    y = df_encoded['Disease'].values.reshape(-1, 1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    print(f"\nРазмер обучающей выборки: {X_train.shape}")
    print(f"Размер тестовой выборки: {X_test.shape}")
    
    return X_train, X_test, y_train, y_test, df.columns.tolist()

In [408]:
# y = X·W + b
def linear_regression(X: np.ndarray, weights: np.ndarray, bias: float) -> np.ndarray:
    return np.dot(X, weights) + bias

# f(x) = 1 / (1 + exp(-x))
def activation_function(x: np.ndarray) -> np.ndarray:
    return 1 / (1 + np.exp(-x))

# производная
def activation_derivative(x: np.ndarray) -> np.ndarray:
    fx = activation_function(x)
    return fx * (1 - fx)


Определение нейросети

In [409]:
def softmax(x, axis=-1):
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

In [410]:
class SimpleNeuron:
    def __init__(self, input_size: int, hidden_neurons: int = 0):
        self.input_size = input_size
        self.hidden_neurons = hidden_neurons
        scale = np.sqrt(2.0 / input_size)
        
        if hidden_neurons > 0:
            self.W1 = np.random.randn(input_size, hidden_neurons) * scale
            self.b1 = np.zeros((1, hidden_neurons))
            self.W2 = np.random.randn(hidden_neurons, 1) * np.sqrt(2.0 / hidden_neurons)
            self.b2 = 0.0
        else:
            self.weights = np.random.randn(input_size) * scale
            self.bias = 0.0
        
    def forward(self, X: np.ndarray) -> np.ndarray:
        self.X = X
        
        if self.hidden_neurons > 0:
            self.z1 = np.dot(X, self.W1) + self.b1
            self.a1 = activation_function(self.z1)
            
            self.z2 = np.dot(self.a1, self.W2) + self.b2
            self.output = activation_function(self.z2)
        else:
            self.linear_output = np.dot(X, self.weights) + self.bias
            self.output = activation_function(self.linear_output)
        
        if len(self.output.shape) == 1:
            self.output = self.output.reshape(-1, 1)
            
        return self.output
    
    def backward(self, y_true: np.ndarray, learning_rate: float = 0.01) -> float:
        if len(y_true.shape) == 1:
            y_true = y_true.reshape(-1, 1)

        m = self.X.shape[0]

        error = y_true - self.output
        
        if self.hidden_neurons > 0:
            delta2 = error * activation_derivative(self.z2)
            dW2 = np.dot(self.a1.T, delta2) / m
            db2 = np.mean(delta2, axis=0)

            delta1 = np.dot(delta2, self.W2.T) * activation_derivative(self.z1)
            dW1 = np.dot(self.X.T, delta1) / m
            db1 = np.mean(delta1, axis=0)

            self.W2 += learning_rate * dW2
            self.b2 += learning_rate * db2
            self.W1 += learning_rate * dW1
            self.b1 += learning_rate * db1.reshape(1, -1)
        else:

            delta = error * activation_derivative(self.linear_output)

            dW = np.zeros_like(self.weights)
            for i in range(m):
                dW += self.X[i] * delta[i, 0]
            dW /= m
            
            db = np.mean(delta)

            self.weights += learning_rate * dW
            self.bias += learning_rate * db

        return np.mean(error ** 2)
    
    def train(self, X: np.ndarray, y: np.ndarray, epochs: int = 1000, learning_rate: float = 0.01) -> list:
        loss_history = []
        
        for epoch in range(epochs):
            self.forward(X)
            loss = self.backward(y, learning_rate)
            loss_history.append(loss)
            
            if epoch % 100 == 0:
                print(f"Epoch {epoch}, loss: {loss:.6f}")
                
        return loss_history
    
    def predict(self, X: np.ndarray) -> np.ndarray:
        return self.forward(X)

In [411]:
def test_simple_neuron(X_train, X_test, y_train, y_test, hidden_neurons=0):

    print(f"{hidden_neurons} нейронов в скрытом слое\n")

    input_size = X_train.shape[1]
    neuron = SimpleNeuron(input_size=input_size, hidden_neurons=hidden_neurons)

    loss_history = neuron.train(X_train, y_train, epochs=1000, learning_rate=0.01)

    predictions = neuron.predict(X_test)
    predicted_classes = (predictions > 0.5).astype(int)

    accuracy = np.mean(predicted_classes == y_test)

    tp = np.sum((predicted_classes == 1) & (y_test == 1))
    tn = np.sum((predicted_classes == 0) & (y_test == 0))
    fp = np.sum((predicted_classes == 1) & (y_test == 0))
    fn = np.sum((predicted_classes == 0) & (y_test == 1))
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}\n")
    
    for i in range(10):
        print(f"Real {y_test[i][0]}")
        print(f"Predictions {predictions[i][0]:.4f}")
        print(f"Predicted {predicted_classes[i][0]}\n")
    
    return accuracy, precision, recall, f1

In [412]:
X_train, X_test, y_train, y_test, column_names = load_dataset()
simple_neuron_metrics = test_simple_neuron(X_train, X_test, y_train, y_test, 50)


Размер обучающей выборки: (216, 13)
Размер тестовой выборки: (54, 13)
50 нейронов в скрытом слое

Epoch 0, loss: 0.343753
Epoch 100, loss: 0.300684
Epoch 200, loss: 0.274297
Epoch 300, loss: 0.257428
Epoch 400, loss: 0.244494
Epoch 500, loss: 0.233454
Epoch 600, loss: 0.223699
Epoch 700, loss: 0.215016
Epoch 800, loss: 0.207282
Epoch 900, loss: 0.200392

Accuracy: 0.8148
Precision: 0.7895
Recall: 0.7143
F1-score: 0.7500

Real 1
Predictions 0.5818
Predicted 1

Real 1
Predictions 0.4927
Predicted 0

Real 0
Predictions 0.2818
Predicted 0

Real 0
Predictions 0.4209
Predicted 0

Real 0
Predictions 0.5085
Predicted 1

Real 1
Predictions 0.4156
Predicted 0

Real 0
Predictions 0.2646
Predicted 0

Real 0
Predictions 0.3174
Predicted 0

Real 0
Predictions 0.5583
Predicted 1

Real 0
Predictions 0.4070
Predicted 0



Реализовать GPT как в п.2 

In [413]:
embedding_dim = 128
attention_dim = 32
seq_len = 64
drop_rate = 0.1
learning_rate = 0.01
epochs = 20

In [414]:
def tokenize_text(text):
    return text.split()

def create_input_target_sequences(sequences):
    input_sequences, target_sequences = [], []
    for sentence in sequences:
        mid = len(sentence) // 2
        input_sequences.append(np.array(sentence[:mid]))
        target_sequences.append(np.array(sentence[mid:]))
    return input_sequences, target_sequences

In [415]:
text = """Он благополучно избегнул встречи с своею хозяйкой на лестнице. Каморка его приходилась под самою кровлей высокого пятиэтажного дома и походила более на шкаф, чем на квартиру. Квартирная же хозяйка его, у которой он нанимал эту каморку с обедом и прислугой, помещалась одною лестницей ниже, в отдельной квартире, и каждый раз, при выходе на улицу, ему непременно надо было проходить мимо хозяйкиной кухни, почти всегда настежь отворенной на лестницу. И каждый раз молодой человек, проходя мимо, чувствовал какое-то болезненное и трусливое ощущение, которого стыдился и от которого морщился. Он был должен кругом хозяйке и боялся с нею встретиться.
"""

sentences = text.strip().replace("\n", " ").split(". ")
words = list(set(" ".join(sentences).split())) 
words.sort()

word_to_index = {word: idx for idx, word in enumerate(words)}
index_to_word = {idx: word for word, idx in word_to_index.items()}
vocab_size = len(word_to_index)

sequences = [[word_to_index[word] for word in sentence.split()] for sentence in sentences]
input_sequences, target_sequences = create_input_target_sequences(sequences)

input_seqs_np = [seq.reshape(1, len(seq), 1) for seq in input_sequences]
target_seqs_np = [seq.reshape(1, len(seq), 1) for seq in target_sequences]

In [416]:
class GPTHead:
    def __init__(self, embedding_dim, attention_dim, seq_len, drop_rate, vocab_size):
        self.embedding_dim = embedding_dim
        self.attention_dim = attention_dim
        self.seq_len = seq_len
        self.drop_rate = drop_rate
        self.vocab_size = vocab_size
        
        init_scale = 0.02 
        self.key_matrix = np.random.randn(embedding_dim, attention_dim) * init_scale
        self.query_matrix = np.random.randn(embedding_dim, attention_dim) * init_scale
        self.value_matrix = np.random.randn(embedding_dim, attention_dim) * init_scale
        self.output_projection = np.random.randn(attention_dim, embedding_dim) * init_scale
        
        self.mask_matrix = np.tril(np.ones((seq_len, seq_len)))
        self.embeddings = np.random.randn(vocab_size, embedding_dim) * init_scale

        self.loss_history = []

    def attention_forward(self, input_tensor, training=True):
        B, T, C = input_tensor.shape
        T_actual = min(T, self.seq_len)

        key_proj = np.dot(input_tensor, self.key_matrix)
        query_proj = np.dot(input_tensor, self.query_matrix)
        value_proj = np.dot(input_tensor, self.value_matrix)
        
        scores = np.matmul(query_proj, key_proj.transpose(0, 2, 1)) * (self.attention_dim ** -0.5)
        scores = np.where(self.mask_matrix[:T_actual, :T_actual] == 0, float('-inf'), scores)

        scores_max = np.max(scores, axis=-1, keepdims=True)
        exp_scores = np.exp(scores - scores_max)
        attention_weights = exp_scores / (np.sum(exp_scores, axis=-1, keepdims=True) + 1e-10)

        if training:
            dropout_mask = (np.random.rand(*attention_weights.shape) > self.drop_rate) / (1 - self.drop_rate)

        context = np.matmul(attention_weights, value_proj)
        output_tensor = np.dot(context, self.output_projection)

        self.cache = {
            'key_proj': key_proj,
            'query_proj': query_proj,
            'value_proj': value_proj,
            'attn_weights': attention_weights,
            'context': context
        }
        
        return output_tensor

    def forward(self, input_tensor, training=True):
        return self.attention_forward(input_tensor, training)

    def calculate_loss(self, output_tensor, target_indices):
        B, T, _ = output_tensor.shape
        target_indices = target_indices.squeeze(axis=-1)
        target_embeddings = np.zeros((B, T, self.embedding_dim))
        
        for b in range(B):
            for t in range(T):
                if t < len(target_indices[b]):
                    target_embeddings[b, t] = self.embeddings[target_indices[b, t]]

        loss = np.mean(np.sum((output_tensor - target_embeddings) ** 2, axis=-1))
        return loss, target_embeddings

    def train_step(self, input_tensor, target_tensor, learning_rate):
        output_tensor = self.forward(input_tensor, training=True)

        loss, target_embeddings = self.calculate_loss(output_tensor, target_tensor)
        self.loss_history.append(loss)

        grad_output = (output_tensor - target_embeddings) / (target_embeddings.size + 1e-10)
        context = self.cache['context']

        grad_output_projection = np.dot(context.reshape(-1, self.attention_dim).T,
                                        grad_output.reshape(-1, self.embedding_dim))
        
        mean_grad = np.mean(grad_output, axis=0)
        mean_input = np.mean(input_tensor, axis=0)
        
        grad_key = np.dot(mean_input.T, mean_grad)
        grad_query = np.dot(mean_input.T, mean_grad)
        grad_value = np.dot(mean_input.T, mean_grad)
        
        self.output_projection -= learning_rate * grad_output_projection
        self.key_matrix -= learning_rate * grad_key[:, :self.attention_dim]
        self.query_matrix -= learning_rate * grad_query[:, :self.attention_dim]
        self.value_matrix -= learning_rate * grad_value[:, :self.attention_dim]
        
        return loss

    def predict_next_words(self, input_tensor, num_words=3):
        predictions = []
        current_input = input_tensor.copy()
        
        for _ in range(num_words):
            output = self.forward(current_input, training=False)
            last_token_embedding = output[0, -1]

            embeddings_norm = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
            last_token_norm = np.linalg.norm(last_token_embedding)

            if last_token_norm == 0 or np.any(embeddings_norm == 0):
                similarities = np.zeros(self.vocab_size)
            else:
                cos_similarities = np.dot(self.embeddings, last_token_embedding) / (embeddings_norm * last_token_norm)
                similarities = cos_similarities.flatten()

            predicted_idx = np.argmax(similarities)
            
            if predicted_idx >= len(index_to_word) or predicted_idx < 0:
                predicted_idx = 0
                
            predicted_word = index_to_word[predicted_idx]
            predictions.append(predicted_word)
                
            return predictions
    
    def show_prediction_examples(self, batch_idx):
        input_seq = input_seqs_np[batch_idx]
        target_seq = target_seqs_np[batch_idx]

        input_embeddings = np.zeros((1, input_seq.shape[1], self.embedding_dim))
        for t in range(input_seq.shape[1]):
            input_embeddings[0, t] = self.embeddings[input_seq[0, t, 0]]
        
        predictions = self.predict_next_words(input_embeddings, 3)
        target_words = [index_to_word[target_seq[0, i, 0]] for i in range(min(3, target_seq.shape[1]))]
        input_words = [index_to_word[input_seq[0, i, 0]] for i in range(min(5, input_seq.shape[1]))]
        
        print("\nПример:")
        print(f"Контекст: {' '.join(input_words)}...")
        print(f"Предсказанное слово: {' '.join(predictions)}")
        print(f"Реальные слова: {' '.join(target_words)}")
        print("-" * 50)

In [417]:
input_embeddings = []
for seq in input_seqs_np:
    embedding_seq = np.zeros((1, seq.shape[1], embedding_dim))
    for i in range(seq.shape[1]):
        embedding_seq[0, i] = np.random.randn(embedding_dim) * 0.1 
    input_embeddings.append(embedding_seq)

gpt_head = GPTHead(embedding_dim, attention_dim, seq_len, drop_rate, vocab_size)

for epoch in range(epochs):
    total_loss = 0
    
    for i in range(len(input_embeddings)):
        loss = gpt_head.train_step(input_embeddings[i], target_seqs_np[i], learning_rate)
        total_loss += loss

for i in range(min(3, len(input_embeddings))):
    gpt_head.show_prediction_examples(i)

print("\nLoss:")
for i, loss in enumerate(gpt_head.loss_history[::len(input_embeddings)]):
    print(f"Epoch {i+1}: {loss:.8f}")


Пример:
Контекст: Он благополучно избегнул встречи...
Предсказанное слово: И
Реальные слова: с своею хозяйкой
--------------------------------------------------

Пример:
Контекст: Каморка его приходилась под самою...
Предсказанное слово: И
Реальные слова: дома и походила
--------------------------------------------------

Пример:
Контекст: Квартирная же хозяйка его, у...
Предсказанное слово: И
Реальные слова: и каждый раз,
--------------------------------------------------

Loss:
Epoch 1: 0.05202415
Epoch 2: 0.05202413
Epoch 3: 0.05202410
Epoch 4: 0.05202408
Epoch 5: 0.05202406
Epoch 6: 0.05202404
Epoch 7: 0.05202401
Epoch 8: 0.05202399
Epoch 9: 0.05202397
Epoch 10: 0.05202395
Epoch 11: 0.05202392
Epoch 12: 0.05202390
Epoch 13: 0.05202388
Epoch 14: 0.05202386
Epoch 15: 0.05202383
Epoch 16: 0.05202381
Epoch 17: 0.05202379
Epoch 18: 0.05202376
Epoch 19: 0.05202374
Epoch 20: 0.05202372
