<a href="https://colab.research.google.com/github/MrsIgnis/MOCI/blob/main/MOCI_task_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**I. Создать нейронную сеть с нуля, т. е. не используя готовые библиотеки. Пример работы на любом табличном датасете**

In [325]:
import numpy as np
import pandas as pd

In [None]:
def sigmoid(x: np.ndarray) -> np.ndarray:
    return 1 / (1 + np.exp(-x))

In [None]:
def forward_pass(
    X: np.ndarray,
    W1: np.ndarray, b1: np.ndarray,
    W2: np.ndarray, b2: np.ndarray,
    W3: np.ndarray, b3: np.ndarray
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:

    Z1 = np.dot(X, W1) + b1
    A1 = sigmoid(Z1)

    Z2 = np.dot(A1, W2) + b2
    A2 = sigmoid(Z2)

    Z3 = np.dot(A2, W3) + b3
    A3 = sigmoid(Z3)

    return A1, A2, A3

In [None]:
def initialize_weights(
    input_neurons: int, hidden_neurons_1: int,
    hidden_neurons_2: int, output_neurons: int
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:

    np.random.seed(42)
    W1 = np.random.randn(input_neurons, hidden_neurons_1) * 0.1
    b1 = np.zeros((1, hidden_neurons_1))

    W2 = np.random.randn(hidden_neurons_1, hidden_neurons_2) * 0.1
    b2 = np.zeros((1, hidden_neurons_2))

    W3 = np.random.randn(hidden_neurons_2, output_neurons) * 0.1
    b3 = np.zeros((1, output_neurons))

    return W1, b1, W2, b2, W3, b3

In [None]:
def predict(
    X: np.ndarray,
    W1: np.ndarray, b1: np.ndarray,
    W2: np.ndarray, b2: np.ndarray,
    W3: np.ndarray, b3: np.ndarray
) -> np.ndarray:

    _, _, A3 = forward_pass(X, W1, b1, W2, b2, W3, b3)
    return A3

In [None]:
dataset = np.array([
    [22, 2, 1, 1, 38000],
    [27, 3, 2, 1, 42000],
    [34, 5, 4, 2, 47000],
    [40, 8, 5, 3, 53000],
    [45, 10, 6, 4, 60000],
    [50, 12, 6, 5, 62000],
    [26, 3, 3, 2, 44000],
    [36, 7, 4, 3, 49000],
    [42, 9, 5, 4, 55000],
    [48, 11, 6, 4, 58000]
])

In [None]:
X = dataset[:, :-1]
y = dataset[:, -1].reshape(-1, 1) / 100000

In [None]:
input_neurons = X.shape[1]
hidden_neurons_1 = 10
hidden_neurons_2 = 6
output_neurons = 1

In [None]:
W1, b1, W2, b2, W3, b3 = initialize_weights(input_neurons, hidden_neurons_1, hidden_neurons_2, output_neurons)

In [None]:
predictions = predict(X, W1, b1, W2, b2, W3, b3)
predictions = predictions * 100000

In [None]:
print(f"{'Возраст':<10} {'Опыт':<10} {'Образование':<15} {'Стаж':<10} {'Доход в датасете':<20} {'Предсказанный доход':<20}")
for i in range(len(X)):
    print(f"{X[i][0]:<10} {X[i][1]:<10} {X[i][2]:<15} {X[i][3]:<10} {y[i][0]*100000:<20} {predictions[i][0]:<20.2f}")

Возраст    Опыт       Образование     Стаж       Доход в датасете     Предсказанный доход 
22         2          1               1          38000.0              46491.85            
27         3          2               1          42000.0              46498.57            
34         5          4               2          47000.0              46513.98            
40         8          5               3          53000.0              46526.87            
45         10         6               4          60000.0              46533.71            
50         12         6               5          62000.0              46539.25            
26         3          3               2          44000.0              46504.06            
36         7          4               3          49000.0              46525.35            
42         9          5               4          55000.00000000001    46533.64            
48         11         6               4          57999.99999999999    46534.30            

**II. Сделать класс, в котором реализована возможность задать количество нейронов какого-то из слоёв, и провести обучение**

In [None]:
df = pd.read_csv('/content/pokemon_dataset.csv')

In [None]:
df

Unnamed: 0,Pokemon,Level,Attack,Defense,Speed,Health
0,Bulbasaur,5,49,49,45,318
1,Ivysaur,16,62,63,60,405
2,Venusaur,32,82,83,80,525
3,Charmander,5,52,43,65,309
4,Charmeleon,16,64,58,80,405
5,Charizard,36,84,78,100,534
6,Squirtle,5,48,65,43,314
7,Wartortle,16,63,80,58,405
8,Blastoise,36,83,100,78,530
9,Pikachu,5,55,40,90,320


In [None]:
X = df.drop(['Pokemon', 'Health'], axis=1).values
y = df['Health'].values.reshape(-1, 1) / 1000
pokemon_names = df['Pokemon'].values

In [None]:
class NeuralNetwork:
    def __init__(self, input_size: int, hidden1_size: int, hidden2_size: int, output_size: int, learning_rate: float = 0.1) -> None:
        np.random.seed(42)
        self.lr = learning_rate

        self.W1 = np.random.randn(input_size, hidden1_size) * 0.1
        self.b1 = np.zeros((1, hidden1_size))

        self.W2 = np.random.randn(hidden1_size, hidden2_size) * 0.1
        self.b2 = np.zeros((1, hidden2_size))

        self.W3 = np.random.randn(hidden2_size, output_size) * 0.1
        self.b3 = np.zeros((1, output_size))

In [None]:
def sigmoid_derivative(x: np.ndarray) -> np.ndarray:
    return x * (1 - x)

In [72]:
def forward(nn: NeuralNetwork, X: np.ndarray) -> np.ndarray:
    nn.z1 = np.dot(X, nn.W1) + nn.b1
    nn.a1 = sigmoid(nn.z1)

    nn.z2 = np.dot(nn.a1, nn.W2) + nn.b2
    nn.a2 = sigmoid(nn.z2)

    nn.z3 = np.dot(nn.a2, nn.W3) + nn.b3
    nn.a3 = sigmoid(nn.z3)

    return nn.a3

In [None]:
def update_weights(nn: NeuralNetwork, X: np.ndarray, d_output: np.ndarray, d_z2: np.ndarray, d_z1: np.ndarray) -> None:
    nn.W3 += nn.a2.T.dot(d_output) * nn.lr
    nn.b3 += np.sum(d_output, axis=0, keepdims=True) * nn.lr

    nn.W2 += nn.a1.T.dot(d_z2) * nn.lr
    nn.b2 += np.sum(d_z2, axis=0, keepdims=True) * nn.lr

    nn.W1 += X.T.dot(d_z1) * nn.lr
    nn.b1 += np.sum(d_z1, axis=0, keepdims=True) * nn.lr

In [None]:
def backward(nn: NeuralNetwork, X: np.ndarray, y: np.ndarray, output: np.ndarray) -> None:
    error = y - output
    d_output = error * sigmoid_derivative(output)
    d_z2 = d_output.dot(nn.W3.T) * sigmoid_derivative(nn.a2)
    d_z1 = d_z2.dot(nn.W2.T) * sigmoid_derivative(nn.a1)

    update_weights(nn, X, d_output, d_z2, d_z1)

In [None]:
def mean_squared_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    return np.mean((y_true - y_pred) ** 2)

In [None]:
def train(nn: NeuralNetwork, X: np.ndarray, y: np.ndarray, epochs: int = 1000) -> None:
    for epoch in range(epochs):
        output = forward(nn, X)
        backward(nn, X, y, output)

        loss = mean_squared_error(y, output)

        if epoch % 100 == 0:
            print(f"Epoch {epoch}/{epochs}, Loss: {loss:.4f}")

In [None]:
def predict(nn: NeuralNetwork, X: np.ndarray) -> np.ndarray:
    return forward(nn, X)

In [89]:
nn = NeuralNetwork(
    input_size=4,  # количество признаков (Level, Attack, Defense, Speed)
    hidden1_size=10,
    hidden2_size=6,
    output_size=1,  # Прогнозируем только одно значение (Health)
    learning_rate=0.5
)

In [None]:
train(nn, X, y, epochs=1000)
predictions = predict(nn, X)

Epoch 0/1000, Loss: 0.0105
Epoch 100/1000, Loss: 0.0078
Epoch 200/1000, Loss: 0.0073
Epoch 300/1000, Loss: 0.0073
Epoch 400/1000, Loss: 0.0078
Epoch 500/1000, Loss: 0.0075
Epoch 600/1000, Loss: 0.0079
Epoch 700/1000, Loss: 0.0034
Epoch 800/1000, Loss: 0.0079
Epoch 900/1000, Loss: 0.0118


In [None]:
print(f"{'Pokemon':<15} {'Level':<10} {'Attack':<10} {'Defense':<15} {'Speed':<10} {'Health in dataset':<20} {'Predicted Health':<20}")
for i in range(len(X)):
    print(f"{pokemon_names[i]:<15} {X[i][0]:<10} {X[i][1]:<10} {X[i][2]:<15} {X[i][3]:<10} {y[i][0]*1000:<20.0f} {predictions[i][0]*1000:<20.2f}")

Pokemon         Level      Attack     Defense         Speed      Health in dataset    Predicted Health    
Bulbasaur       5          49         49              45         318                  415.00              
Ivysaur         16         62         63              60         405                  415.00              
Venusaur        32         82         83              80         525                  415.00              
Charmander      5          52         43              65         309                  415.00              
Charmeleon      16         64         58              80         405                  415.00              
Charizard       36         84         78              100        534                  415.00              
Squirtle        5          48         65              43         314                  415.00              
Wartortle       16         63         80              58         405                  415.00              
Blastoise       36         83        

**III. Реализовать GPT как в пункте 2**

In [398]:
with open('/content/test_1.txt', 'r', encoding='utf-8') as file:
    text = file.read()
print(text)

Когда скромняга бард отдыхал от дел
С Геральтом из Ривии он песню эту пел.

Сразился Белый Волк с велеречивым чертом,
Эльфов покромсал несчетные когорты.

Сзади подползли, хоть это стыд и срам,
Сломали мне лютню, дали по зубам.

Целился тот черт мне рогом прямо в глаз,
И тут Ведьмак крикнул: "Вот твой смертный час!"

Ведьмаку заплатите чеканной монетой,
Ведьмаку заплатите – зачтется все это вам!

Он хоть на край земли отправиться готов,
Сразить всех чудовищ, убить всех врагов,

Он эльфов всех прогнал за дальний перевал,
Высокие горы на вечный привал.

Он бьет не в бровь, а в глаз, был ранен много раз,
Он людям товарищ, всегда он за нас,

К чему эта вражда, никак я не пойму,
Он нас защищает – так налейте ж ему!

Ведьмаку заплатите чеканной монетой,
Ведьмаку заплатите – зачтется все это вам!


In [399]:
tokens = text.lower().replace('\n', ' ').replace('.', '').split()
vocab = sorted(set(tokens))

In [400]:
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

In [401]:
vocab_size = len(vocab)
embedding_dim = 10

In [402]:
np.random.seed(42)
embedding_matrix = np.random.randn(vocab_size, embedding_dim) * 0.1

In [403]:
data = [word_to_idx[word] for word in tokens]

In [405]:
X_indices, y_indices = [], []
context_size = 3
for i in range(len(data) - context_size):
    X_indices.append(data[i:i + context_size])
    y_indices.append(data[i + context_size])

In [406]:
X_indices = np.array(X_indices)
y_indices = np.array(y_indices)

In [407]:
print("Пример X[0]:", X_indices[0], [idx_to_word[i] for i in X_indices[0]])
print("Целевое слово:", y_indices[0], idx_to_word[y_indices[0]])

Пример X[0]: [38 74  2] ['когда', 'скромняга', 'бард']
Целевое слово: 56 отдыхал


In [408]:
def get_embedding_batch(batch_indices: np.ndarray, embedding_matrix: np.ndarray) -> np.ndarray:
    return embedding_matrix[batch_indices].reshape(batch_indices.shape[0], -1)

In [409]:
X_embed = get_embedding_batch(X_indices, embedding_matrix)

In [410]:
class MiniGPT:
    def __init__(self, vocab_size: int, embed_dim: int, context_size: int, learning_rate: float = 0.01):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.context_size = context_size
        self.lr = learning_rate
        np.random.seed(42)

        self.token_embeddings = np.random.randn(vocab_size, embed_dim) * 0.01
        self.position_embeddings = np.random.randn(context_size, embed_dim) * 0.01

        self.W_q = np.random.randn(embed_dim, embed_dim) * 0.01
        self.W_k = np.random.randn(embed_dim, embed_dim) * 0.01
        self.W_v = np.random.randn(embed_dim, embed_dim) * 0.01

        self.W_out = np.random.randn(embed_dim, vocab_size) * 0.01

In [411]:
def softmax(x):
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

In [412]:
def compute_attention(x_emb, W_q, W_k, W_v, context_size, embed_dim):
    Q = x_emb @ W_q
    K = x_emb @ W_k
    V = x_emb @ W_v

    scores = Q @ K.transpose(0, 2, 1) / np.sqrt(embed_dim)
    mask = np.tril(np.ones((context_size, context_size)))
    scores = np.where(mask == 0, -1e9, scores)

    attn_weights = softmax(scores)
    context = attn_weights @ V
    context_mean = np.mean(context, axis=1)

    return context_mean, attn_weights, Q, K, V, mask

In [413]:
def compute_logits_and_loss(context_mean, W_out, target_idx):
    logits = context_mean @ W_out
    probs = softmax(logits)

    batch_size = target_idx.shape[0]
    log_probs = -np.log(probs[range(batch_size), target_idx] + 1e-9)
    loss = np.mean(log_probs)

    return logits, probs, loss

In [414]:
def compute_loss_gradient(probs: np.ndarray, target_idx: np.ndarray) -> np.ndarray:
    batch_size = probs.shape[0]
    dlogits = probs.copy()
    dlogits[np.arange(batch_size), target_idx] -= 1
    dlogits /= batch_size

    return dlogits

In [415]:
def grad_output_weights_and_context(dlogits: np.ndarray, W_out: np.ndarray, context_mean: np.ndarray, context_size: int):
    dW_out = context_mean.T @ dlogits
    dcontext_mean = dlogits @ W_out.T
    dcontext = np.repeat(dcontext_mean[:, None, :], context_size, axis=1) / context_size

    return dW_out, dcontext

In [416]:
def grad_attention_and_values(dcontext: np.ndarray, attn_weights: np.ndarray, V: np.ndarray, mask: np.ndarray, Q: np.ndarray, K: np.ndarray, embed_dim: int):
    dattn_weights = dcontext @ V.transpose(0, 2, 1)
    dV = attn_weights.transpose(0, 2, 1) @ dcontext

    dscores = dattn_weights * attn_weights * (1 - attn_weights)
    dscores = np.where(mask == 0, 0, dscores)

    dQ = dscores @ K / np.sqrt(embed_dim)
    dK = dscores.transpose(0, 2, 1) @ Q / np.sqrt(embed_dim)

    return dQ, dK, dV

In [417]:
def grad_embeddings_and_weights(x_emb: np.ndarray, dQ, dK, dV, W_q, W_k, W_v):
    dx_emb_q = dQ @ W_q.T
    dW_q = x_emb.transpose(0, 2, 1) @ dQ

    dx_emb_k = dK @ W_k.T
    dW_k = x_emb.transpose(0, 2, 1) @ dK

    dx_emb_v = dV @ W_v.T
    dW_v = x_emb.transpose(0, 2, 1) @ dV

    dx_emb = dx_emb_q + dx_emb_k + dx_emb_v

    return dx_emb, dW_q, dW_k, dW_v

In [418]:
def backward_pass(x_emb, Q, K, V, attn_weights, probs, target_idx, W_out, W_q, W_k, W_v,
                  context_size, embed_dim, mask):

    context_mean = np.mean(V, axis=1)
    dlogits = compute_loss_gradient(probs, target_idx)
    dW_out, dcontext = grad_output_weights_and_context(dlogits, W_out, context_mean, context_size)

    dQ, dK, dV = grad_attention_and_values(dcontext, attn_weights, V, mask, Q, K, embed_dim)

    dx_emb, dW_q, dW_k, dW_v = grad_embeddings_and_weights(x_emb, dQ, dK, dV, W_q, W_k, W_v)

    return dx_emb, dW_out, dW_q, dW_k, dW_v

In [419]:
def update_embeddings(x_idx, dx_emb, token_embeddings, position_embeddings, context_size, lr):
    dtoken_embeddings = np.zeros_like(token_embeddings)
    dposition_embeddings = np.zeros_like(position_embeddings)

    batch_size = x_idx.shape[0]
    for i in range(batch_size):
        for j in range(context_size):
            dtoken_embeddings[x_idx[i, j]] += dx_emb[i, j]
            dposition_embeddings[j] += dx_emb[i, j]

    token_embeddings -= lr * dtoken_embeddings
    position_embeddings -= lr * dposition_embeddings

    return token_embeddings, position_embeddings

In [420]:
def train_step(mg: MiniGPT, x_idx: np.ndarray, target_idx: np.ndarray) -> float:
    x_emb = mg.token_embeddings[x_idx] + mg.position_embeddings[np.arange(mg.context_size)]

    context_mean, attn_weights, Q, K, V, mask = compute_attention(
        x_emb, mg.W_q, mg.W_k, mg.W_v, mg.context_size, mg.embed_dim
    )

    logits, probs, loss = compute_logits_and_loss(context_mean, mg.W_out, target_idx)

    predicted_word = idx_to_word[np.argmax(probs[0])]
    actual_word = idx_to_word[target_idx[0]]
    print(f"Предсказанное слово: {predicted_word}, Реальное слово: {actual_word}")

    dx_emb, dW_out, dW_q, dW_k, dW_v = backward_pass(
        x_emb, Q, K, V, attn_weights, probs, target_idx,
        mg.W_out, mg.W_q, mg.W_k, mg.W_v,
        mg.context_size, mg.embed_dim, mask
    )

    mg.token_embeddings, mg.position_embeddings = update_embeddings(
        x_idx, dx_emb, mg.token_embeddings, mg.position_embeddings, mg.context_size, mg.lr
    )

    mg.W_out -= mg.lr * dW_out
    mg.W_q -= mg.lr * np.mean(dW_q, axis=0)
    mg.W_k -= mg.lr * np.mean(dW_k, axis=0)
    mg.W_v -= mg.lr * np.mean(dW_v, axis=0)

    return loss

In [421]:
def train_gpt_model(mg: MiniGPT, X, y, epochs=100, batch_size=32):
    for epoch in range(epochs):
        total_loss = 0
        indices = np.random.permutation(len(X))
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        print(f"\nEpoch {epoch+1} - Token Embeddings (before): {mg.token_embeddings[0][:5]}")  # первые 5 элементов
        print(f"Epoch {epoch+1} - Position Embeddings (before): {mg.position_embeddings[0][:5]}\n")

        for i in range(0, len(X_shuffled), batch_size):
            batch_X = X_shuffled[i:i+batch_size]
            batch_y = y_shuffled[i:i+batch_size]

            loss = train_step(mg, batch_X, batch_y)
            total_loss += loss

        print(f"\nEpoch {epoch+1} - Token Embeddings (after): {mg.token_embeddings[0][:5]}")  # первые 5 элементов
        print(f"Epoch {epoch+1} - Position Embeddings (after): {mg.position_embeddings[0][:5]}\n")

        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {total_loss / len(X)}\n\n", end='---'*30+'\n')


In [422]:
mg = MiniGPT(
    vocab_size,
    embed_dim=128,
    context_size=3
)

In [423]:
train_gpt_model(mg, X_gpt, y_gpt, epochs=10)


Epoch 1 - Token Embeddings (before): [ 0.00496714 -0.00138264  0.00647689  0.0152303  -0.00234153]
Epoch 1 - Position Embeddings (before): [-0.01478674 -0.02229677 -0.00719472 -0.00625107  0.00943159]

Предсказанное слово: всех, Реальное слово: раз,
Предсказанное слово: зачтется, Реальное слово: это
Предсказанное слово: волк, Реальное слово: это
Предсказанное слово: "вот, Реальное слово: защищает
Предсказанное слово: зачтется, Реальное слово: волк

Epoch 1 - Token Embeddings (after): [ 0.00496717 -0.00138252  0.00647669  0.01523071 -0.00234196]
Epoch 1 - Position Embeddings (after): [-0.01478566 -0.02229849 -0.00719055 -0.00625254  0.00943354]

Epoch 1/10, Average Loss: 0.17481506774120867

------------------------------------------------------------------------------------------

Epoch 2 - Token Embeddings (before): [ 0.00496717 -0.00138252  0.00647669  0.01523071 -0.00234196]
Epoch 2 - Position Embeddings (before): [-0.01478566 -0.02229849 -0.00719055 -0.00625254  0.00943354]

Предс