In [410]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import random
import csv
from collections import defaultdict
import re
import pandas as pd
import string
from sklearn.model_selection import train_test_split

def preprocess(text):
    removed_html = re.sub(r'<.*?>', '', text)
    removed_url = re.sub(r'https?://\S+|www\.\S+', '', removed_html)
    return removed_url.translate(str.maketrans('', '', string.punctuation))

df = pd.read_csv(r"IMDB Dataset.csv")
X, y = df["review"].apply(preprocess).to_numpy(), df["sentiment"].map({"positive": 1, "negative": 0}).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [411]:
class MyTfidfVectorizer:
    def __init__(
        self,
        token_pattern=r"(?u)\b\w\w+\b",
        max_features=None,
    ):
        self.token_pattern = token_pattern
        self.max_features = max_features
        self.vocabulary = None
        self.idf = None

    def _preprocess(self, texts):
        processed_texts = []
        for text in texts:
            text = text.lower()
            words = re.findall(self.token_pattern, text)
            processed_texts.append(words)
        return processed_texts

    def _build_vocabulary(self, processed_texts):
        doc_count = len(processed_texts)
        word_doc_freq = defaultdict(int)
        for words in processed_texts:
            unique_words = set(words)
            for word in unique_words:
                word_doc_freq[word] += 1
        words = list(word_doc_freq.keys())
        if self.max_features is not None and len(words) > self.max_features:
            words.sort(key=lambda word: word_doc_freq[word], reverse=True)
            words = words[: self.max_features]
        else:
            words.sort()
        self.vocabulary = {word: idx for idx, word in enumerate(words)}
        self.idf = {
            word: np.log((doc_count + 1) / (word_doc_freq[word] + 1)) + 1
            for word in self.vocabulary.keys()
        }

    def _compute_tf(self, words):
        total_words = len(words)
        if total_words == 0:
            return {}
        word_freq = defaultdict(int)
        for word in words:
            word_freq[word] += 1
        tf = {word: freq / total_words for word, freq in word_freq.items()}
        return tf

    def fit(self, texts):
        processed_texts = self._preprocess(texts)
        self._build_vocabulary(processed_texts)
        return self

    def transform(self, texts):
        processed_texts = self._preprocess(texts)
        n_docs = len(processed_texts)
        n_features = len(self.vocabulary)
        tfidf_matrix = np.zeros((n_docs, n_features))
        for doc_idx, words in enumerate(processed_texts):
            tf = self._compute_tf(words)
            for word, tf_value in tf.items():
                if word in self.vocabulary:
                    word_idx = self.vocabulary[word]
                    tfidf_matrix[doc_idx, word_idx] = tf_value * self.idf[word]
        return tfidf_matrix

    def fit_transform(self, texts):
        return self.fit(texts).transform(texts)

In [412]:
class MyCountVectorizer:
    def __init__(
        self,
        token_pattern=r"(?u)\b\w\w+\b",
        ngram_range=(1, 2),
        max_features=None,
    ):
        self.token_pattern = token_pattern
        self.max_features = max_features
        self.ngram_range = ngram_range
        self.vocabulary = None

    def _preprocess(self, texts):
        processed_texts = []
        for text in texts:
            text = text.lower()
            words = re.findall(self.token_pattern, text)
            ngrams_list = []
            for n in range(self.ngram_range[0], self.ngram_range[1] + 1):
                for i in range(len(words) - n + 1):
                    ngram = "_".join(words[i : i + n])
                    ngrams_list.append(ngram)
            processed_texts.append(ngrams_list)
        return processed_texts

    def _build_vocabulary(self, processed_texts):
        word_doc_freq = defaultdict(int)
        for words in processed_texts:
            unique_words = set(words)
            for word in unique_words:
                word_doc_freq[word] += 1
        words = list(word_doc_freq.keys())
        words.sort(key=lambda word: word_doc_freq[word], reverse=True)
        if self.max_features is not None and len(words) > self.max_features:
            words = words[: self.max_features]
        self.vocabulary = {word: idx for idx, word in enumerate(words)}

    def fit(self, texts):
        processed_texts = self._preprocess(texts)
        self._build_vocabulary(processed_texts)
        return self

    def transform(self, texts):
        processed_texts = self._preprocess(texts)
        n_docs = len(processed_texts)
        n_features = len(self.vocabulary)
        count_matrix = np.zeros((n_docs, n_features))
        for doc_idx, words in enumerate(processed_texts):
            for word in words:
                if word in self.vocabulary:
                    word_idx = self.vocabulary[word]
                    count_matrix[doc_idx, word_idx] += 1
        return count_matrix

    def fit_transform(self, texts):
        return self.fit(texts).transform(texts)

In [413]:
vectorizer = MyCountVectorizer(token_pattern=r'\b[a-zA-Z]+\b', max_features=5000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
X_train.shape

(40000, 5000)

In [414]:
class MyStandardScaler:
    def fit(self, X):
        self.mean = np.mean(X, axis=0)
        self.std = np.std(X, axis=0)
        self.std[self.std == 0] = 1.0
        return self
    
    def transform(self, X):
        return (X - self.mean) / self.std
    
    def fit_transform(self, X):
        return self.fit(X).transform(X)

In [415]:
class MySVM:
    def __init__(
        self,
        C=1.0,
        n_iters=1000,
        lr=0.01,
        momentum=0.9,
        gamma=0.8,
        tol=1e-4,
    ):
        self.C = C
        self.n_iters = n_iters
        self.lr = lr
        self.tol = tol
        self.scaler = MyStandardScaler()
        self.gamma = gamma
        self.momentum = momentum

    def _lr_scheduler(self, epoch):
        return self.lr * (self.gamma**epoch)

    def fit(self, X, y):
        X = self.scaler.fit_transform(X)
        n_samples, n_features = X.shape
        labels = np.where(y == 0, -1, 1)
        self.w = np.zeros(n_features)
        self.b = 0
        velocity_w = np.zeros_like(self.w)
        velocity_b = 0.0
        for epoch in range(self.n_iters):
            margins = labels * (np.dot(X, self.w) + self.b)
            support_vectors_indicator = (margins < 1).astype(float)
            grad_w = (
                self.w
                - self.C * np.dot(support_vectors_indicator * labels, X) / n_samples
            )
            grad_b = -self.C * np.sum(support_vectors_indicator * labels) / n_samples
            w_prev = self.w.copy()
            b_prev = self.b
            lr = self._lr_scheduler(epoch)
            velocity_w = self.momentum * velocity_w + grad_w
            velocity_b = self.momentum * velocity_b + grad_b
            self.w -= lr * velocity_w
            self.b -= lr * velocity_b

            if (
                np.linalg.norm(self.w - w_prev) < self.tol
                and abs(self.b - b_prev) < self.tol
            ):
                break

    def predict(self, X):
        X = self.scaler.transform(X)
        approx = np.dot(X, self.w) + self.b
        return np.where(approx >= 0, 1, 0)

In [416]:
my_svm = MySVM(lr=0.1, momentum=0.8, gamma=0.8)
my_svm.fit(X_train, y_train)
pred_my_svm = my_svm.predict(X_test)
print(classification_report(y_test, pred_my_svm))

              precision    recall  f1-score   support

           0       0.91      0.88      0.89      5000
           1       0.88      0.91      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [417]:
sk_svm = LinearSVC()
sk_svm.fit(X_train, y_train)
pred_sk_svm = sk_svm.predict(X_test)
print(classification_report(y_test, pred_sk_svm))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87      5000
           1       0.87      0.87      0.87      5000

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [418]:
X_test

array([[11.,  3.,  2., ...,  0.,  0.,  0.],
       [ 7.,  4.,  4., ...,  0.,  0.,  0.],
       [18.,  6.,  8., ...,  0.,  0.,  0.],
       ...,
       [15.,  6.,  1., ...,  0.,  0.,  0.],
       [ 9.,  5.,  7., ...,  0.,  0.,  0.],
       [ 2.,  1.,  3., ...,  0.,  0.,  0.]], shape=(10000, 5000))

In [422]:
my_svm.predict(vectorizer.transform(["The movie is not good"]))

array([1])

In [420]:
vectorizer.vocabulary

{'the': 0,
 'a': 1,
 'and': 2,
 'of': 3,
 'to': 4,
 'this': 5,
 'is': 6,
 'in': 7,
 'it': 8,
 'that': 9,
 'i': 10,
 'for': 11,
 'but': 12,
 'with': 13,
 'of_the': 14,
 'was': 15,
 'as': 16,
 'on': 17,
 'movie': 18,
 'not': 19,
 'have': 20,
 'be': 21,
 'are': 22,
 'one': 23,
 'film': 24,
 'in_the': 25,
 'you': 26,
 'at': 27,
 'all': 28,
 'an': 29,
 'its': 30,
 'by': 31,
 'from': 32,
 'like': 33,
 'so': 34,
 'who': 35,
 'his': 36,
 'out': 37,
 'if': 38,
 'just': 39,
 'about': 40,
 'they': 41,
 'or': 42,
 'has': 43,
 'he': 44,
 'some': 45,
 'what': 46,
 'good': 47,
 'and_the': 48,
 'there': 49,
 'is_a': 50,
 'more': 51,
 'when': 52,
 'this_movie': 53,
 'very': 54,
 'time': 55,
 'even': 56,
 'up': 57,
 'to_be': 58,
 'only': 59,
 'to_the': 60,
 'see': 61,
 'would': 62,
 'no': 63,
 'my': 64,
 'can': 65,
 'really': 66,
 'which': 67,
 'had': 68,
 'me': 69,
 'story': 70,
 'were': 71,
 'than': 72,
 'much': 73,
 'their': 74,
 'well': 75,
 'this_is': 76,
 'the_film': 77,
 'this_film': 78,
 'get': 