In [449]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MaxAbsScaler
import matplotlib.pyplot as plt
import numpy as np
import random
import csv
from collections import defaultdict
import re

random.seed(42)

def load_data(file_name,labels, max_labels=100):
    data = []
    labels_counter = {}
    with open(file_name, 'r', encoding="utf-8") as file:
        first = True
        for row in csv.reader(file):
            if not first:
                label = int(row[0])-1
                if label in labels:
                    data.append(((row[1]+' '+row[2]), label))
            else:
                first = not first
    random.shuffle(data)
    X, y = [], []
    for row in data:
        label = row[1]
        if label not in labels_counter.keys():
            labels_counter[label] = 0
        if labels_counter[label] < max_labels:
            labels_counter[label]+=1
            X.append(row[0])
            y.append(label)
    return X, y

X_doc_train, y_train = load_data(r".\train.csv", labels=(0,1))
X_doc_test, y_test = load_data(r".\test.csv", labels=(0,1), max_labels=80)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [450]:
class MyTfidfVectorizer:
    def __init__(self, token_pattern, min_df=1, max_df=1.0, stop_words=None):
        self.token_pattern = token_pattern
        self.min_df = min_df
        self.max_df = max_df
        self.vocabulary = None
        self.idf = None

    def _preprocess(self, texts):
        processed_texts = []
        for text in texts:
            text = text.lower()
            words = re.findall(r"\b\w+\b", text)
            processed_texts.append(words)
        return processed_texts

    def _build_vocabulary(self, processed_texts):
        doc_count = len(processed_texts)
        word_doc_freq = defaultdict(int)
        for words in processed_texts:
            unique_words = set(words)
            for word in unique_words:
                word_doc_freq[word] += 1
        
        filtered_words = [
            word for word, freq in word_doc_freq.items() 
            if freq >= 1 and freq <= doc_count
        ]
        self.vocabulary = {
            word: idx for idx, word in enumerate(sorted(filtered_words))
        }
        self.idf = np.zeros(len(self.vocabulary))
        for word, idx in self.vocabulary.items():
            doc_freq = word_doc_freq[word]
            self.idf[idx] = np.log((doc_count + 1) / (doc_freq + 1)) + 1

    def _compute_tf(self, words):
        total_words = len(words)
        if total_words == 0:
            return {}
        word_freq = defaultdict(int)
        for word in words:
            word_freq[word] += 1
        tf = {word: freq / total_words for word, freq in word_freq.items()}
        return tf
    def fit(self, texts):
        processed_texts = self._preprocess(texts)
        self._build_vocabulary(processed_texts)
        return self

    def transform(self, texts):
        processed_texts = self._preprocess(texts)
        n_docs = len(processed_texts)
        n_features = len(self.vocabulary)
        tfidf_matrix = np.zeros((n_docs, n_features))
        for doc_idx, words in enumerate(processed_texts):
            tf = self._compute_tf(words)
            for word, tf_value in tf.items():
                if word in self.vocabulary:
                    word_idx = self.vocabulary[word]
                    tfidf_matrix[doc_idx, word_idx] = tf_value * self.idf[word_idx]
        return tfidf_matrix

    def fit_transform(self, texts):
        return self.fit(texts).transform(texts)

In [451]:
vectorizer = MyTfidfVectorizer(token_pattern=r'\b[a-zA-Z]+\b')
X_train = vectorizer.fit_transform(X_doc_train)
X_test = vectorizer.transform(X_doc_test)

In [452]:
class MyMaxAbsScaler:
    def fit(self, X):
        self.max_abs = np.max(np.abs(X), axis=0)
        self.max_abs[self.max_abs == 0] = 1.0
        return self
    
    def transform(self, X):
        X_scaled = X / self.max_abs 
        return X_scaled
    
    def fit_transform(self, X):
        return self.fit(X).transform(X)

In [None]:
class MySVM:
    def __init__(self, C=1.0, n_iters=1000, lr=0.01, gamma=0.8, tol=1e-4,):
        self.C = C
        self.n_iters = n_iters
        self.lr = lr
        self.tol = tol
        self.scaler = MyMaxAbsScaler()
        self.gamma = gamma
    def _lr_scheduler(self, epoch):
        return self.lr*(self.gamma**epoch)

    def fit(self, X, y):
        X = self.scaler.fit_transform(X)
        n_features = X.shape[1]
        labels = np.where(y == 0, -1, 1)
        self.w =np.zeros(n_features)
        self.b = 0
        velocity_w = np.zeros_like(self.w)
        velocity_b = 0.0
        for epoch in range(self.n_iters):
            margins = labels * (np.dot(X, self.w) + self.b)
            support_vectors = margins < 1
            if np.any(support_vectors):
                grad_w = self.C * self.w - np.mean(
                    labels[support_vectors][:, np.newaxis] * X[support_vectors], axis=0
                )
                grad_b = -np.mean(labels[support_vectors])
            else:
                grad_w = self.C * self.w
                grad_b = 0
            w_prev = self.w.copy()
            b_prev = self.b
            lr = self._lr_scheduler(epoch)
            momentum = 0.9
            velocity_w = momentum * velocity_w + grad_w
            velocity_b = momentum * velocity_b + grad_b
            grad_w += momentum * velocity_w
            grad_b += momentum * velocity_b
            self.w -= lr * grad_w
            self.b -= lr * grad_b
            
            if np.linalg.norm(self.w - w_prev) < self.tol and abs(self.b - b_prev) < self.tol:
                break

    def predict(self, X):
        X = self.scaler.transform(X)
        approx = np.dot(X, self.w) + self.b
        return np.where(approx >= 0, 1, 0)

In [454]:
my_svm = MySVM()
my_svm.fit(X_train, y_train)
pred_my_svm = my_svm.predict(X_test)
print(classification_report(y_test, pred_my_svm))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        80
           1       1.00      0.99      0.99        80

    accuracy                           0.99       160
   macro avg       0.99      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160



In [455]:
sk_svm = LinearSVC()
sk_svm.fit(X_train, y_train)
pred_sk_svm = sk_svm.predict(X_test)
print(classification_report(y_test, pred_sk_svm))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98        80
           1       0.98      0.99      0.98        80

    accuracy                           0.98       160
   macro avg       0.98      0.98      0.98       160
weighted avg       0.98      0.98      0.98       160

