# Задачи
1. Самостоятельно реализовать BoW, TF-IDF
2. Решить задачу классификации с понижением размерности. Использовать самостоятельно реализованные модели из предыдущих ЛР.
3. Решить задачу мягкой кластеризации (ТМ) с помощью LDA

In [1]:
import re
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = text.split()
    return words

def create_bag_of_words(corpus):
    word_freq = {}
    for doc in corpus:
        words = preprocess_text(doc)
        for word in words:
            if word not in word_freq:
                word_freq[word] = 0
            word_freq[word] += 1

    unique_words = list(word_freq.keys())

    bag_of_words = []
    for doc in corpus:
        words = preprocess_text(doc)
        vector = [0] * len(unique_words)
        for i, word in enumerate(unique_words):
            if word in words:
                vector[i] = words.count(word)
        bag_of_words.append(vector)

    return bag_of_words, unique_words

documents = ["Великолепный сериал, который поможет успокоить нервы при любых стрессах и просто скрасит серые будни",
         "Пожалуй, если бы я посмотрел только первые пару сезонов этого сериала, я бы с легкой руки написал ему положительную рецензию",
         "В общем, если создатели этого сериала не вернут всё на круги своя, то рейтинги следующих сезонов будут становится все ниже и ниже, а зрительская аудитория будет все меньше и меньше."]



bag_of_words, unique_words = create_bag_of_words(documents)

print("Мешок слов:")
for vector in bag_of_words:
    print(vector)

print("\nУникальные слова:")
print(unique_words)


Мешок слов:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2]

Уникальные слова:
['великолепный', 'сериал', 'который', 'поможет', 'успокоить', 'нервы', 'при', 'любых', 'стрессах', 'и', 'просто', 'скрасит', 'серые', 'будни', 'пожалуй', 'если', 'бы', 'я', 'посмотрел', 'только', 'первые', 'пару', 'сезонов', 'этого', 'сериала', 'с', 'легкой', 'руки', 'написал', 'ему', 'положительную', 'рецензию', 'в', 'общем', 'создатели', 'не', 'вернут', 'всё', 'на', 'круги', 'своя', 'то', 'рейтинги', 'следующих', 'будут', 'становится', 'все', 'ниже', 'а', 'зрительская', 'аудитория',

In [2]:
import math

def calculate_tf(term, document):
    word_count = len(document.split())
    term_count = document.split().count(term)
    tf = term_count / word_count
    return tf

def calculate_idf(term, documents):
    document_count = len(documents)
    term_occurrences = sum(1 for document in documents if term in document)
    idf = math.log((document_count + 1) / (1 + term_occurrences)) + 1
    return idf

def calculate_tfidf(term, document, documents):
    tf = calculate_tf(term, document)
    idf = calculate_idf(term, documents)
    tfidf = tf * idf
    return tfidf

def calculate_tfidf_for_documents(documents):
    tfidf_documents = []
    for document in documents:
        tfidf_document = {}
        document_terms = document.split()
        for term in document_terms:
            tfidf_document[term] = calculate_tfidf(term, document, documents)
        tfidf_documents.append(tfidf_document)
    return tfidf_documents

documents =["Великолепный сериал, который поможет успокоить нервы при любых стрессах и просто скрасит серые будни",
         "Пожалуй, если бы я посмотрел только первые пару сезонов этого сериала, я бы с легкой руки написал ему положительную рецензию",
         "В общем, если создатели этого сериала не вернут всё на круги своя, то рейтинги следующих сезонов будут становится все ниже и ниже, а зрительская аудитория будет все меньше и меньше."]

tfidf_documents = calculate_tfidf_for_documents(documents)
for i, document in enumerate(tfidf_documents):
    print(f"TF-IDF for document {i+1}:")
    for term, tfidf in document.items():
        print(f"{term}: {tfidf}")
    print()

TF-IDF for document 1:
Великолепный: 0.12093908432571038
сериал,: 0.12093908432571038
который: 0.12093908432571038
поможет: 0.12093908432571038
успокоить: 0.12093908432571038
нервы: 0.12093908432571038
при: 0.12093908432571038
любых: 0.12093908432571038
стрессах: 0.12093908432571038
и: 0.07142857142857142
просто: 0.12093908432571038
скрасит: 0.12093908432571038
серые: 0.12093908432571038
будни: 0.12093908432571038

TF-IDF for document 2:
Пожалуй,: 0.08465735902799727
если: 0.06438410362258905
бы: 0.1287682072451781
я: 0.1287682072451781
посмотрел: 0.08465735902799727
только: 0.08465735902799727
первые: 0.08465735902799727
пару: 0.08465735902799727
сезонов: 0.06438410362258905
этого: 0.06438410362258905
сериала,: 0.08465735902799727
с: 0.05
легкой: 0.08465735902799727
руки: 0.08465735902799727
написал: 0.08465735902799727
ему: 0.08465735902799727
положительную: 0.08465735902799727
рецензию: 0.08465735902799727

TF-IDF for document 3:
В: 0.04292273574839269
общем,: 0.05643823935199818
ес

In [3]:
import pandas as pd
import numpy as np
def get_matrix(document):
    matrix = []
    result = {}
    for d in document:
        result.update(d)
    unique_words = list(result.keys())
    for words in document:
        vector = [0] * len(unique_words)
        for i, word in enumerate(unique_words):
            if word in words:
                vector[i] = result[word]
        matrix.append(vector)
    return matrix


f = get_matrix(tfidf_documents)
data = pd.DataFrame(f)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,55
0,0.120939,0.120939,0.120939,0.120939,0.120939,0.120939,0.120939,0.120939,0.120939,0.066667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,...,0.056438,0.112876,0.056438,0.056438,0.033333,0.056438,0.056438,0.056438,0.056438,0.056438


In [4]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('D:/ML/data/spam.csv', encoding='latin-1')
y = df["v1"]
X = list(df["v2"])

tfidf_documents = calculate_tfidf_for_documents(X)
for i, document in enumerate(tfidf_documents):
    print(f"TF-IDF for document {i+1}:")
    for term, tfidf in document.items():
        print(f"{term}: {tfidf}")
    print()

FileNotFoundError: [Errno 2] No such file or directory: 'D:/ML/data/spam.csv'

In [None]:
matr = get_matrix(tfidf_documents)
data = pd.DataFrame(matr)
data

In [None]:
maping = {
    "ham" : 1,
    "spam": 0
}

y = y.replace(maping)

# 2 часть


In [None]:
class MetricsClassification:
    @staticmethod
    def accuracy(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return len([x for x, y  in zip(y_true, predictions) if x  == y])/len(y_true)
    
    @staticmethod
    def confusion_matrix(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        TP = FP =TN =FN = 0
        for test, pred in zip(y_true, predictions):
            if (test == 1 and pred == 1):
                TP += 1 
            elif (test == 0 and pred == 0):
                TN += 1
            elif (test == 1 and pred == 0):
                FN += 1
            elif (test == 0 and pred == 1):
                FP += 1
        return [[TP, FP],
                         [FN, TN]]
    @staticmethod
    def precision( y_test, y_pred):
        matrix = MetricsClassification.confusion_matrix(y_test, y_pred)
        TP = matrix[0][0]
        FP = matrix[0][1]
        return TP/(TP + FP)
    @staticmethod
    def recall(y_test, y_pred):
        matrix = MetricsClassification.confusion_matrix(y_test, y_pred)
        TP = matrix[0][0]
        FN = matrix[1][0]
        return TP/(TP + FN)
    
    @staticmethod
    def f_score(y_test, y_pred):
        recall_score = MetricsClassification.recall(y_test, y_pred)
        precision_score = MetricsClassification.precision(y_test, y_pred)
        return 2*(recall_score * precision_score)/ (recall_score+precision_score)

In [None]:
def pca(X, num_components):
    # Центрирование данных
    X_meaned = X - np.mean(X, axis=0)
    print("1")
    # Вычисление ковариационной матрицы
    cov_matrix = np.cov(X_meaned, rowvar=False)
    print("2")
    # Вычисление собственных значений и собственных векторов
    eigen_values, eigen_vectors = np.linalg.eigh(cov_matrix)
    print("3")
    # Сортировка собственных значений в убывающем порядке
    sorted_index = np.argsort(eigen_values)[::-1]
    sorted_eigenvalues = eigen_values[sorted_index]
    sorted_eigenvectors = eigen_vectors[:, sorted_index]
    print("4")
    # Выбор нужного числа главных компонент
    eigenvector_subset = sorted_eigenvectors[:, 0:num_components]
    print("5")
    # Проецирование данных на главные компоненты
    X_reduced = np.dot(eigenvector_subset.transpose(), X_meaned.transpose()).transpose()
    print("6")
    return X_reduced

In [None]:
X = pca(data, 100)
print("pca complete...")

In [None]:
class KNN:
    def __init__(self, k):
        self.k = k
        
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        
    def predict(self, X):
        predictions = []
        for x in X:
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
            indices = np.argsort(distances)[:self.k]
            k_nearest_labels = np.array(self.y_train)[indices]
            most_common_label = np.bincount(k_nearest_labels).argmax()
            predictions.append(most_common_label)
        return predictions


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=4)
knn = KNN(10)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
print(MetricsClassification.accuracy(y_test, predictions))
print(MetricsClassification.confusion_matrix(y_test, predictions))
print(MetricsClassification.precision(y_test, predictions))
print(MetricsClassification.recall(y_test, predictions))
print(MetricsClassification.f_score(y_test, predictions))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

# Задание 3

In [None]:
from time import time
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

In [None]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [None]:
df = pd.read_csv('D:/ML/data/spam.csv', encoding='latin-1')
y = df["v1"]
X = df["v2"]
data_samples = X

In [None]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)

In [None]:
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=20, 
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(tf)

In [None]:
print_top_words(lda, tf_vectorizer.get_feature_names_out(), 10)