<a href="https://colab.research.google.com/github/Kvazzzzar/MPSI/blob/main/MPSI_3.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK resources used below: the English stop-word list
# (stopwords.words('english')) and the WordNet data behind WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

# 1. Data collection and preparation
class TextDataProcessor:
    """Load, clean, vectorize, cluster and split a labelled text dataset.

    The methods communicate through instance attributes and are meant to be
    called in pipeline order:

    * ``df`` -- working ``DataFrame`` (set by ``load_data`` or assigned by the
      caller). ``analyze_data`` and ``split_data`` read its ``category``
      column, ``vectorize_text`` reads its ``processed_text`` column.
    * ``X`` -- TF-IDF matrix produced by ``vectorize_text``.
    * ``clusters`` -- cluster index per row produced by ``cluster_texts``.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        # A set gives O(1) membership tests while filtering tokens.
        self.stop_words = set(stopwords.words('english'))

    def load_data(self, filepath):
        """Read a CSV dataset into ``self.df`` and return it.

        Args:
            filepath: Anything ``pandas.read_csv`` accepts (path, URL, buffer).
        """
        self.df = pd.read_csv(filepath)
        print(f"Loaded dataset with {len(self.df)} samples")
        return self.df

    def preprocess_text(self, text):
        """Normalize one document into a space-separated string of lemmas.

        Steps: drop every character that is neither an ASCII letter nor
        whitespace, lower-case, split on whitespace, remove stop words and
        lemmatize the remaining tokens.

        Args:
            text: Raw document. Non-string values (e.g. ``NaN`` from a missing
                CSV cell) are treated as an empty document.

        Returns:
            The cleaned text; ``''`` when nothing survives the filtering.
        """
        if not isinstance(text, str):
            return ''
        # NOTE: the 4th positional parameter of re.sub is ``count``, not
        # ``flags``. Passing flags there silently capped the number of
        # replacements, so it is omitted; the character class already lists
        # both letter cases and needs no flag.
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = text.lower()
        tokens = text.split()
        tokens = [
            self.lemmatizer.lemmatize(word)
            for word in tokens
            if word not in self.stop_words
        ]
        return ' '.join(tokens)

    def analyze_data(self):
        """Print the first rows of ``self.df`` and the ``category`` counts."""
        print("\nData Analysis:")
        print(self.df.head())
        print("\nCategory distribution:")
        print(self.df['category'].value_counts())

    def vectorize_text(self):
        """Fit a TF-IDF vectorizer on ``df['processed_text']``.

        Stores the fitted vectorizer in ``self.vectorizer`` and the resulting
        matrix in ``self.X``.

        Returns:
            The TF-IDF matrix (one row per document, at most 5000 features).
        """
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.X = self.vectorizer.fit_transform(self.df['processed_text'])
        return self.X

    def cluster_texts(self, n_clusters=5, random_state=42):
        """Cluster the TF-IDF vectors in ``self.X`` with k-means.

        When ``self.df`` has a ``category`` column, the adjusted Rand index
        between the clusters and those labels is printed.

        Args:
            n_clusters: Number of clusters to form.
            random_state: Seed forwarded to ``KMeans`` so the clustering (and
                the printed score) is reproducible; pass ``None`` for a
                different initialization on every call.

        Returns:
            Array with the cluster index assigned to every row.
        """
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        self.clusters = self.kmeans.fit_predict(self.X)

        # Compare against the ground-truth labels when they are available.
        if 'category' in self.df.columns:
            labels = pd.factorize(self.df['category'])[0]
            score = adjusted_rand_score(labels, self.clusters)
            print(f"\nClustering quality (ARI): {score:.3f}")

        return self.clusters

    def split_data(self, test_size=0.2, val_size=0.1):
        """Split ``self.X`` / ``df['category']`` into train, val and test.

        Args:
            test_size: Fraction of the whole dataset reserved for testing.
            val_size: Fraction of the whole dataset reserved for validation.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
        """
        # First carve off the test set ...
        X_temp, X_test, y_temp, y_test = train_test_split(
            self.X, self.df['category'], test_size=test_size, random_state=42)

        # ... then split the remainder. ``val_size`` is expressed relative to
        # the full dataset, so rescale it to the size of the remainder.
        val_ratio = val_size / (1 - test_size)
        X_train, X_val, y_train, y_val = train_test_split(
            X_temp, y_temp, test_size=val_ratio, random_state=42)

        print(f"\nData split:")
        print(f"Train: {X_train.shape[0]} samples")
        print(f"Val: {X_val.shape[0]} samples")
        print(f"Test: {X_test.shape[0]} samples")

        return X_train, X_val, X_test, y_train, y_val, y_test

# 2. Training a simplified GPT model
class SimpleGPTTrainer:
    """Minimal single-layer, single-head GPT-style language model in NumPy.

    The model is: token embedding + positional embedding -> one causal
    self-attention layer -> linear projection to vocabulary logits. It is
    trained on next-token prediction with plain gradient descent.

    Args:
        vocab_size: Number of distinct token ids.
        embedding_dim: Width of the embeddings and of the attention layer.
        max_seq_length: Longest input sequence ``forward`` accepts.
    """

    def __init__(self, vocab_size, embedding_dim, max_seq_length):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.max_seq_length = max_seq_length

        # Parameters start as small Gaussian noise (std 0.01).
        self.token_emb = np.random.randn(vocab_size, embedding_dim) * 0.01
        self.pos_emb = np.random.randn(max_seq_length, embedding_dim) * 0.01
        self.Wq = np.random.randn(embedding_dim, embedding_dim) * 0.01
        self.Wk = np.random.randn(embedding_dim, embedding_dim) * 0.01
        self.Wv = np.random.randn(embedding_dim, embedding_dim) * 0.01
        self.Wo = np.random.randn(embedding_dim, vocab_size) * 0.01

    def softmax(self, x):
        """Return the softmax of ``x`` along its last axis."""
        # Subtracting the row maximum does not change the result but keeps
        # exp() from overflowing.
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    def _attend(self, x, causal=True):
        """Run self-attention and also return the intermediates for backprop."""
        Q = np.dot(x, self.Wq)
        K = np.dot(x, self.Wk)
        V = np.dot(x, self.Wv)

        scores = np.dot(Q, K.T) / np.sqrt(self.embedding_dim)
        if causal:
            # Position i must not see positions > i: during training those
            # are exactly the tokens it is asked to predict.
            future = np.triu(np.ones(scores.shape, dtype=bool), k=1)
            scores = np.where(future, -np.inf, scores)
        weights = self.softmax(scores)
        return np.dot(weights, V), (Q, K, V, weights)

    def attention(self, x, causal=True):
        """Scaled dot-product self-attention over a ``(seq_len, dim)`` input.

        Args:
            x: Input embeddings, one row per position.
            causal: When true (default) each position attends only to itself
                and earlier positions.

        Returns:
            Array of the same shape as ``x``.
        """
        output, _ = self._attend(x, causal)
        return output

    def _forward_with_cache(self, token_ids):
        """Forward pass returning ``(logits, cache)`` for use by ``_backward``."""
        seq_len = len(token_ids)
        if seq_len > self.max_seq_length:
            raise ValueError(
                f"sequence of length {seq_len} exceeds "
                f"max_seq_length={self.max_seq_length}")
        x = self.token_emb[token_ids] + self.pos_emb[:seq_len]
        attended, (Q, K, V, weights) = self._attend(x)
        logits = np.dot(attended, self.Wo)
        return logits, (x, Q, K, V, weights, attended)

    def forward(self, token_ids):
        """Compute next-token logits for a sequence of token ids.

        Args:
            token_ids: Integer sequence no longer than ``max_seq_length``.

        Returns:
            ``(len(token_ids), vocab_size)`` array of logits.

        Raises:
            ValueError: If the sequence is longer than ``max_seq_length``.
        """
        logits, _ = self._forward_with_cache(token_ids)
        return logits

    def compute_loss(self, logits, targets):
        """Return the mean cross-entropy between ``logits`` and ``targets``."""
        # Log-softmax computed directly, so a vanishing probability never
        # reaches log(0).
        shifted = logits - np.max(logits, axis=-1, keepdims=True)
        log_probs = shifted - np.log(
            np.sum(np.exp(shifted), axis=-1, keepdims=True))
        loss = -log_probs[np.arange(len(targets)), targets].mean()
        return loss

    def _backward(self, targets, logits, cache):
        """Return the loss gradients w.r.t. the weights and the input ``x``."""
        x, Q, K, V, weights, attended = cache
        n = len(targets)

        # d(mean cross-entropy)/d(logits) = (softmax - one_hot) / n
        d_logits = self.softmax(logits)
        d_logits[np.arange(n), targets] -= 1.0
        d_logits /= n

        grad_Wo = attended.T @ d_logits
        d_attended = d_logits @ self.Wo.T

        d_weights = d_attended @ V.T
        d_V = weights.T @ d_attended

        # Row-wise softmax Jacobian; masked entries have weight 0 and hence
        # receive zero gradient.
        d_scores = weights * (
            d_weights - np.sum(d_weights * weights, axis=-1, keepdims=True))
        d_scores /= np.sqrt(self.embedding_dim)

        d_Q = d_scores @ K
        d_K = d_scores.T @ Q

        return {
            "Wq": x.T @ d_Q,
            "Wk": x.T @ d_K,
            "Wv": x.T @ d_V,
            "Wo": grad_Wo,
            "x": d_Q @ self.Wq.T + d_K @ self.Wk.T + d_V @ self.Wv.T,
        }

    def train_step(self, batch, learning_rate=0.01):
        """Run one gradient-descent step of next-token prediction.

        ``batch[:-1]`` is the input and ``batch[1:]`` the targets.

        Args:
            batch: Integer token ids; at least two are required.
            learning_rate: Step size of the parameter update.

        Returns:
            The loss measured before the update.

        Raises:
            ValueError: If the batch has fewer than two tokens or is longer
                than ``max_seq_length + 1``.
        """
        if len(batch) < 2:
            raise ValueError("batch must contain at least two tokens")
        inputs, targets = batch[:-1], batch[1:]

        logits, cache = self._forward_with_cache(inputs)
        loss = self.compute_loss(logits, targets)

        # All gradients are computed before any parameter is modified.
        grads = self._backward(targets, logits, cache)
        for name in ("Wq", "Wk", "Wv", "Wo"):
            param = getattr(self, name)
            param -= learning_rate * grads[name]
        # ufunc.at accumulates correctly when a token id occurs several
        # times in the batch (plain fancy-index subtraction would not).
        np.subtract.at(self.token_emb, inputs, learning_rate * grads["x"])
        self.pos_emb[:len(inputs)] -= learning_rate * grads["x"]

        return loss

    def train(self, data, epochs=10, batch_size=32, learning_rate=0.01):
        """Train on consecutive full windows of ``batch_size`` tokens.

        A trailing remainder shorter than ``batch_size`` is not used.

        Args:
            data: 1-D sequence of token ids.
            epochs: Number of passes over ``data``.
            batch_size: Tokens per training window (>= 2).
            learning_rate: Step size passed to ``train_step``.

        Returns:
            List with the average loss of every epoch.

        Raises:
            ValueError: If ``batch_size`` < 2 or ``data`` is shorter than
                ``batch_size``.
        """
        if batch_size < 2:
            raise ValueError("batch_size must be at least 2")
        # Start offsets of every full window, including the last one.
        starts = range(0, len(data) - batch_size + 1, batch_size)
        if not starts:
            raise ValueError(
                f"need at least batch_size={batch_size} tokens, "
                f"got {len(data)}")

        history = []
        for epoch in range(epochs):
            total_loss = 0.0
            for i in starts:
                total_loss += self.train_step(
                    data[i:i+batch_size], learning_rate)

            # Average over the batches that were actually processed.
            avg_loss = total_loss / len(starts)
            print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")
            history.append(avg_loss)
        return history

# Example usage of the full pipeline
if __name__ == "__main__":
    # Seed NumPy's global RNG so the randomly initialized GPT weights are the
    # same on every run.
    np.random.seed(42)

    # 1. Data processing
    processor = TextDataProcessor()

    # Loading a dataset (placeholder -- replace with a real dataset):
    # dataset_url = "https://example.com/text_dataset.csv"
    # df = processor.load_data(dataset_url)

    # Build a tiny synthetic dataset for the demo.
    data = {
        'text': [
            "This is a positive review about a product",
            "Negative experience with customer service",
            "The movie was great and actors performed well",
            "I didn't like the book, it was boring",
            "Excellent food and atmosphere at the restaurant"
        ],
        'category': ["positive", "negative", "positive", "negative", "positive"]
    }
    df = pd.DataFrame(data)
    processor.df = df

    # Text preprocessing
    processor.df['processed_text'] = processor.df['text'].apply(processor.preprocess_text)
    processor.analyze_data()

    # Vectorization
    X = processor.vectorize_text()

    # Clustering
    processor.cluster_texts(n_clusters=2)

    # Train / validation / test split
    X_train, X_val, X_test, y_train, y_val, y_test = processor.split_data()

    # 2. Data preparation for the GPT model
    # Tokenize every document once, then build the vocabulary from the sorted
    # set of tokens: iterating a plain set of strings yields a different order
    # (and therefore different token ids) from one interpreter run to the next.
    tokenized_docs = [text.split() for text in processor.df['processed_text']]
    vocab = {
        word: idx
        for idx, word in enumerate(
            sorted({token for tokens in tokenized_docs for token in tokens}))
    }
    # All documents are concatenated into one flat stream of token ids.
    sequences = [vocab[token] for tokens in tokenized_docs for token in tokens]

    # 3. GPT training
    gpt = SimpleGPTTrainer(
        vocab_size=len(vocab),
        embedding_dim=64,
        max_seq_length=20
    )

    print("\nTraining GPT model...")
    gpt.train(np.array(sequences), epochs=5, batch_size=8, learning_rate=0.01)


Data Analysis:
                                              text  category  \
0        This is a positive review about a product  positive   
1        Negative experience with customer service  negative   
2    The movie was great and actors performed well  positive   
3            I didn't like the book, it was boring  negative   
4  Excellent food and atmosphere at the restaurant  positive   

                         processed_text  
0               positive review product  
1  negative experience customer service  
2      movie great actor performed well  
3                didnt like book boring  
4  excellent food atmosphere restaurant  

Category distribution:
category
positive    3
negative    2
Name: count, dtype: int64

Clustering quality (ARI): 0.231

Data split:
Train: 3 samples
Val: 1 samples
Test: 1 samples

Training GPT model...
Epoch 1/5, Loss: 2.9957
Epoch 2/5, Loss: 2.9957
Epoch 3/5, Loss: 2.9957
Epoch 4/5, Loss: 2.9957
Epoch 5/5, Loss: 2.9957


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
