In [17]:
#laplace smoothing
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class NaiveBayesClassifierLaplace:
    def __init__(self, smoothing=True):
        self.smoothing = smoothing
        self.class_probs = defaultdict(float)
        self.word_probs = defaultdict(lambda: defaultdict(float))
        self.vocab = set()
        self.total_words = 0

    def fit(self, X, y):
        class_counts = Counter(y)
        word_counts = defaultdict(lambda: defaultdict(int))
        total_words_per_class = defaultdict(int)

        for text, label in zip(X, y):
            words = text.split()
            for word in words:
                self.vocab.add(word)
                word_counts[label][word] += 1
                total_words_per_class[label] += 1

        self.total_words = len(self.vocab)

        total_docs = len(X)
        for label, count in class_counts.items():
            self.class_probs[label] = count / total_docs

        for label, word_count in word_counts.items():
            total_words_in_class = total_words_per_class[label]
            for word in self.vocab:
                self.word_probs[label][word] = (word_count[word] + 1) / (total_words_in_class + self.total_words)

    def predict(self, text):
        words = text.split()
        scores = defaultdict(float)
        for label in self.class_probs:
            score = np.log(self.class_probs[label])
            for word in words:
                if word in self.vocab:
                    score += np.log(self.word_probs[label][word])
                else:
                    oov_prob = 1 / (self.total_words + len(self.vocab))
                    score += np.log(oov_prob if oov_prob > 0 else 1e-10) 
            scores[label] = score
        return max(scores, key=scores.get)

def load_data_from_csv(file_path):
    data = pd.read_csv(file_path, encoding='ISO-8859-1')
    
    X = data['text'].values  
    y = data['sentiment'].values  
    
    return X, y

file_path = 'C:/Projects/nlpass1/ass2/Sentiment Analysis Dataset.csv'
X, y = load_data_from_csv(file_path)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb_smooth = NaiveBayesClassifierLaplace(smoothing=True)

nb_smooth.fit(X_train, y_train)

y_pred = [nb_smooth.predict(text) for text in X_test]

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print(nb_smooth.predict("hitler "))


Accuracy: 0.7159
3


In [25]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class NaiveBayesClassifierWithUNK:
    def __init__(self):
        self.class_probs = defaultdict(float)
        self.word_probs = defaultdict(lambda: defaultdict(float))
        self.vocab = set()
        self.total_words = 0
        self.unk_prob = defaultdict(float)  

    def fit(self, X, y):
        class_counts = Counter(y)
        word_counts = defaultdict(lambda: defaultdict(int))
        total_words_per_class = defaultdict(int)

        for text, label in zip(X, y):
            words = text.split()
            for word in words:
                self.vocab.add(word)
                word_counts[label][word] += 1
                total_words_per_class[label] += 1

        self.total_words = len(self.vocab)

        total_docs = len(X)
        for label, count in class_counts.items():
            self.class_probs[label] = count / total_docs

        for label, word_count in word_counts.items():
            total_words_in_class = total_words_per_class[label]
            for word in self.vocab:
                self.word_probs[label][word] = word_count[word] / total_words_in_class if word_count[word] > 0 else 0.0

        for label in self.class_probs:
            total_words_in_class = total_words_per_class[label]
            self.unk_prob[label] = 1e-7 

    def predict(self, text):
        words = text.split()
        scores = defaultdict(float)
        for label in self.class_probs:
            score = np.log(self.class_probs[label])
            for word in words:
                if word in self.vocab:
                    prob = self.word_probs[label][word]
                    score += np.log(prob if prob > 0 else 1e-10)
                else:
                    score += np.log(self.unk_prob[label])
            scores[label] = score
        return max(scores, key=scores.get)

def load_data_from_csv(file_path):
    data = pd.read_csv(file_path, encoding='ISO-8859-1')
 
    X = data['text'].values 
    y = data['sentiment'].values 
    
    return X, y

file_path = 'C:/Projects/nlpass1/ass2/Sentiment Analysis Dataset.csv'
X, y = load_data_from_csv(file_path)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb_with_unk = NaiveBayesClassifierWithUNK()

nb_with_unk.fit(X_train, y_train)

y_pred = [nb_with_unk.predict(text) for text in X_test]

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print(nb_with_unk.predict("hitler "))  


Accuracy: 0.7121
3


In [26]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class NaiveBayesClassifierNoSmoothing:
    def __init__(self):
        self.class_probs = defaultdict(float)
        self.word_probs = defaultdict(lambda: defaultdict(float))
        self.vocab = set()
        self.total_words = 0

    def fit(self, X, y):
        class_counts = Counter(y)
        word_counts = defaultdict(lambda: defaultdict(int))
        total_words_per_class = defaultdict(int)

        for text, label in zip(X, y):
            words = text.split()
            for word in words:
                self.vocab.add(word)
                word_counts[label][word] += 1
                total_words_per_class[label] += 1

        self.total_words = len(self.vocab)

        total_docs = len(X)
        for label, count in class_counts.items():
            self.class_probs[label] = count / total_docs

        for label, word_count in word_counts.items():
            total_words_in_class = total_words_per_class[label]
            for word in self.vocab:
                self.word_probs[label][word] = word_count[word] / total_words_in_class if word_count[word] > 0 else 0.0

    def predict(self, text):
        words = text.split()
        scores = defaultdict(float)
        for label in self.class_probs:
            score = np.log(self.class_probs[label])
            for word in words:
                if word in self.vocab:
                    score += np.log(self.word_probs[label][word])
                else:
                    score += np.log(1e-10) 
            scores[label] = score
        return max(scores, key=scores.get)

def load_data_from_csv(file_path):
    data = pd.read_csv(file_path, encoding='ISO-8859-1')
    
    X = data['text'].values  
    y = data['sentiment'].values  
    
    return X, y

file_path = 'C:/Projects/nlpass1/ass2/Sentiment Analysis Dataset.csv'
X, y = load_data_from_csv(file_path)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb_no_smoothing = NaiveBayesClassifierNoSmoothing()

nb_no_smoothing.fit(X_train, y_train)

y_pred = [nb_no_smoothing.predict(text) for text in X_test]

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print(nb_no_smoothing.predict("hitler "))  # Example test


Accuracy: 0.6478
3


  score += np.log(self.word_probs[label][word])


Q2:

In [30]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

def load_data_from_csv(file_path):
    data = pd.read_csv(file_path, encoding='ISO-8859-1')
    X = data['text'].values
    y = data['sentiment'].values
    return X, y

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

file_path = 'C:/Projects/nlpass1/ass2/Sentiment Analysis Dataset.csv'
X, y = load_data_from_csv(file_path)
X = [preprocess_text(text) for text in X]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def get_tfidf_embeddings(texts, vectorizer=None):
    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=5000)
        tfidf_matrix = vectorizer.fit_transform(texts)
    else:
        tfidf_matrix = vectorizer.transform(texts)
    return tfidf_matrix.toarray(), vectorizer

def get_word2vec_embeddings(texts, embedding_size=100):
    tokenized_texts = [text.split() for text in texts]
    model = Word2Vec(sentences=tokenized_texts, vector_size=embedding_size, window=5, min_count=1, sg=0)
    
    def get_vector(text):
        words = text.split()
        word_vectors = [model.wv[word] for word in words if word in model.wv]
        if len(word_vectors) == 0:
            return np.zeros(embedding_size)
        return np.mean(word_vectors, axis=0)
    
    return np.array([get_vector(text) for text in texts])

X_train_tfidf, tfidf_vectorizer = get_tfidf_embeddings(X_train)
X_test_tfidf, _ = get_tfidf_embeddings(X_test, vectorizer=tfidf_vectorizer)

svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train_tfidf, y_train)

svm_tfidf_pred = svm_tfidf.predict(X_test_tfidf)
print("SVM with TF-IDF Accuracy:", accuracy_score(y_test, svm_tfidf_pred))

X_train_word2vec = get_word2vec_embeddings(X_train)
X_test_word2vec = get_word2vec_embeddings(X_test)

svm_word2vec = SVC(kernel='linear')
svm_word2vec.fit(X_train_word2vec, y_train)

svm_word2vec_pred = svm_word2vec.predict(X_test_word2vec)
print("SVM with Word2Vec Accuracy:", accuracy_score(y_test, svm_word2vec_pred))


SVM with TF-IDF Accuracy: 0.7634961439588689
SVM with Word2Vec Accuracy: 0.5501285347043702


In [34]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

def load_data_from_csv(file_path):
    data = pd.read_csv(file_path, encoding='ISO-8859-1')
    X = data['text'].values
    y = data['sentiment'].values
    return X, y

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

file_path = 'C:/Projects/nlpass1/ass2/Sentiment Analysis Dataset.csv'
X, y = load_data_from_csv(file_path)
X = [preprocess_text(text) for text in X]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def get_tfidf_embeddings(train_texts, test_texts, max_features=5000):
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_vectorizer.fit(train_texts)
    X_train_tfidf = tfidf_vectorizer.transform(train_texts).toarray()
    X_test_tfidf = tfidf_vectorizer.transform(test_texts).toarray()
    return X_train_tfidf, X_test_tfidf

def get_word2vec_embeddings(texts, embedding_size=100):
    tokenized_texts = [text.split() for text in texts]
    model = Word2Vec(sentences=tokenized_texts, vector_size=embedding_size, window=5, min_count=1, sg=0)
    
    def get_vector(text):
        words = text.split()
        word_vectors = [model.wv[word] for word in words if word in model.wv]
        if len(word_vectors) == 0:
            return np.zeros(embedding_size)
        return np.mean(word_vectors, axis=0)
    
    return np.array([get_vector(text) for text in texts])

X_train_tfidf, X_test_tfidf = get_tfidf_embeddings(X_train, X_test)

logreg_tfidf = LogisticRegression(max_iter=1000)
logreg_tfidf.fit(X_train_tfidf, y_train)

logreg_tfidf_pred = logreg_tfidf.predict(X_test_tfidf)
print("Logistic Regression with TF-IDF Accuracy:", accuracy_score(y_test, logreg_tfidf_pred))

X_train_word2vec = get_word2vec_embeddings(X_train)
X_test_word2vec = get_word2vec_embeddings(X_test)

logreg_word2vec = LogisticRegression(max_iter=1000)
logreg_word2vec.fit(X_train_word2vec, y_train)

logreg_word2vec_pred = logreg_word2vec.predict(X_test_word2vec)
print("Logistic Regression with Word2Vec Accuracy:", accuracy_score(y_test, logreg_word2vec_pred))


Logistic Regression with TF-IDF Accuracy: 0.7519280205655527
Logistic Regression with Word2Vec Accuracy: 0.5462724935732648
