In [None]:
# Model for determining whether an email is spam or not
# Name :- Kothamasu Jayachandra
# Roll Number :- 2110110293


import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
import numpy as np
import nltk
nltk.download('punkt')

# Read the combined e-mail corpus into a DataFrame
data = pd.read_csv("/content/drive/MyDrive/Nlp/combined_data.csv")

# Separate the raw message text (features) from the target labels
X, y = data["text"], data["label"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# a) Binary BoWs
# binary=True makes every feature a 0/1 presence flag instead of a count.
# The vocabulary is learned from the training texts only and then applied
# unchanged to the test texts.
binary_vectorizer = CountVectorizer(binary=True)
X_train_binary, X_test_binary = (
    binary_vectorizer.fit_transform(X_train),
    binary_vectorizer.transform(X_test),
)

# Fit a Random Forest on the binary features and score it on the held-out set
rf_binary = RandomForestClassifier().fit(X_train_binary, y_train)
y_pred_binary = rf_binary.predict(X_test_binary)
binary_accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy with Binary BoWs:", binary_accuracy)

# b) tf-idf BoWs
# Vocabulary and idf statistics are learned from the training texts only and
# then applied unchanged to the test texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

# Fit a Random Forest on the tf-idf features and score it on the held-out set
rf_tfidf = RandomForestClassifier().fit(X_train_tfidf, y_train)
y_pred_tfidf = rf_tfidf.predict(X_test_tfidf)
tfidf_accuracy = accuracy_score(y_test, y_pred_tfidf)
print("Accuracy with tf-idf BoWs:", tfidf_accuracy)

# c) Word vectors - Word2Vec
# Learn the embeddings from the training split only, so that no held-out test
# text influences the features the classifier is later evaluated on.
word_tokenized_text = [word_tokenize(text.lower()) for text in X_train]

# Passing `sentences` to the constructor already builds the vocabulary and
# trains the model; a separate `train()` call afterwards would train it a
# second time. The intended 10 epochs are therefore requested here directly.
word2vec_model = Word2Vec(
    sentences=word_tokenized_text,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=10,
)

# d) Word vectors - tf-idf weighted average
# Reuse the tf-idf vectorizer already fitted on the training data in section
# (b). Creating and fitting a second, identically configured vectorizer on the
# same texts would only repeat a full pass over the corpus and produce the
# same vocabulary and idf values.

# Get the feature names (words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Map each vocabulary word to its inverse-document-frequency (idf) weight.
# Only the idf part is stored here; the term-frequency part arises when the
# weights are summed once per token occurrence in a document.
word_to_tfidf = dict(zip(feature_names, tfidf_vectorizer.idf_))

def calculate_weighted_average(words, word_vectors, word_to_tfidf):
    """Return the weighted mean of the vectors of the known words in a document.

    Args:
        words: Iterable of tokens making up one document. Every occurrence
            contributes, so a repeated token is counted once per occurrence.
        word_vectors: Embedding lookup exposing ``vector_size``, membership
            tests (``in``) and item access by word.
        word_to_tfidf: Mapping from word to its scalar weight.

    Returns:
        A NumPy array of length ``word_vectors.vector_size``. Tokens missing
        from either lookup are ignored. When the accumulated weight is zero
        the un-normalised sum is returned (all zeros if nothing matched).
    """
    accumulator = np.zeros(word_vectors.vector_size)
    weight_sum = 0

    for token in words:
        # Skip tokens that lack either a weight or an embedding
        if token not in word_to_tfidf or token not in word_vectors:
            continue
        weight = word_to_tfidf[token]
        accumulator += word_vectors[token] * weight
        weight_sum += weight

    # Nothing to normalise by: hand back the sum as-is to avoid dividing by 0
    if weight_sum == 0:
        return accumulator

    accumulator /= weight_sum
    return accumulator


def _weighted_average_matrix(texts):
    """Stack one tf-idf weighted average word vector per text into an array."""
    return np.array([
        calculate_weighted_average(
            word_tokenize(doc.lower()), word2vec_model.wv, word_to_tfidf
        )
        for doc in texts
    ])


# Featurise the training documents (lower-cased, then tokenised with NLTK)
X_train_weighted_average = _weighted_average_matrix(X_train)

# Fit a Random Forest on the averaged word-vector features
rf_weighted_average = RandomForestClassifier().fit(X_train_weighted_average, y_train)

# Featurise the test documents the same way
X_test_weighted_average = _weighted_average_matrix(X_test)

# Score the classifier on the held-out set
y_pred_weighted_average = rf_weighted_average.predict(X_test_weighted_average)
weighted_average_accuracy = accuracy_score(y_test, y_pred_weighted_average)
print("Accuracy with tf-idf weighted average of word vectors:", weighted_average_accuracy)

# e) Document vectors using Doc2Vec
# Tokenise each split once; the same token lists feed both model training and
# vector inference below.
train_tokens = [word_tokenize(text.lower()) for text in X_train]
test_tokens = [word_tokenize(text.lower()) for text in X_test]

# Train Doc2Vec on the training split only, so that held-out test documents
# cannot shape the embedding space later used to score them. Each document is
# tagged with its position in the training split.
documents = [
    TaggedDocument(words=tokens, tags=[str(i)])
    for i, tokens in enumerate(train_tokens)
]
doc2vec_model = Doc2Vec(documents, vector_size=100, window=5, min_count=1, workers=4, epochs=10)

# Train Random Forest on Doc2Vec representations.
# Vectors are inferred for the training texts (rather than read back from the
# model) so train and test features are produced by the same procedure.
doc_vectors = [doc2vec_model.infer_vector(tokens) for tokens in train_tokens]
X_train_doc2vec = pd.DataFrame(doc_vectors)
rf_doc2vec = RandomForestClassifier()
rf_doc2vec.fit(X_train_doc2vec, y_train)

# Infer vectors for test data and predict
doc_vectors_test = [doc2vec_model.infer_vector(tokens) for tokens in test_tokens]
X_test_doc2vec = pd.DataFrame(doc_vectors_test)
y_pred_doc2vec = rf_doc2vec.predict(X_test_doc2vec)
doc2vec_accuracy = accuracy_score(y_test, y_pred_doc2vec)
print("Accuracy with Doc2Vec:", doc2vec_accuracy)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy with Binary BoWs: 0.9807070101857399
Accuracy with tf-idf BoWs: 0.982444577591372




Accuracy with tf-idf weighted average of word vectors: 0.984062312762133
Accuracy with Doc2Vec: 0.917974835230677
