In [None]:
pip uninstall tensorflow

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from collections import defaultdict
from tqdm import tqdm

# Fetch the NLTK resources used by word_tokenize, pos_tag and stopwords
for _resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(_resource)

# English stop words, kept in a set so membership tests are cheap
stop_words = set(stopwords.words('english'))

# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Define the word2vec model
class Word2Vec(torch.nn.Module):
    """Word2vec-style embedding model with a negative-sampling loss.

    The module owns two embedding tables of shape
    ``(vocab_size, embedding_size)``:

    * ``embedding`` -- looked up for the center word and the positive
      context words.
    * ``weights`` -- looked up for the negative samples.
    """

    def __init__(self, vocab_size, embedding_size):
        """Create both embedding tables.

        Args:
            vocab_size: Number of rows in each embedding table.
            embedding_size: Dimensionality of each embedding vector.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        # Define the embeddings for the words
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)

        # Define the weights for the negative sampling
        self.weights = torch.nn.Embedding(vocab_size, embedding_size)
        self.weights.weight.data = torch.randn(vocab_size, embedding_size)

    def forward(self, x, pos, neg):
        """Return the negative-sampling loss for one center word.

        Args:
            x: Index tensor for the center word. Its embedding is flattened
                to a single row, so it is expected to hold one index.
            pos: Index tensor of positive context words; their embeddings
                are averaged over dim 0.
            neg: Index tensor of negative samples; their embeddings are
                averaged over dim 0.

        Returns:
            A scalar tensor: the negated sum of ``log sigmoid(pos . center)``
            and ``log sigmoid(-(neg . center))``.
        """
        # Compute the embeddings for the center word
        center_emb = self.embedding(x).view(1, -1)

        # Compute the (averaged) embeddings for the context words.
        # NOTE(review): positive context words are looked up in the same
        # table as the center word, while negatives use `weights`; confirm
        # that this asymmetry is intended.
        pos_emb = self.embedding(pos).mean(dim=0, keepdim=True)
        neg_emb = self.weights(neg).mean(dim=0, keepdim=True)

        # Compute the dot products between the center and context words
        pos_dot = torch.matmul(pos_emb, center_emb.T)
        neg_dot = torch.matmul(neg_emb, center_emb.T)

        # logsigmoid is used instead of log(sigmoid(.)): the latter
        # underflows to log(0) = -inf for strongly negative scores, which
        # makes the loss (and its gradients) non-finite.
        log_sigmoid = torch.nn.functional.logsigmoid
        loss = torch.sum(log_sigmoid(pos_dot)) + torch.sum(log_sigmoid(-neg_dot))

        return -loss
    
# Define the dataset class for loading the tokenized and POS-tagged reviews
class ReviewDataset(Dataset):
    """Thin ``Dataset`` wrapper around an indexable collection of reviews."""

    def __init__(self, reviews):
        # Keep a reference to the caller's collection; nothing is copied.
        self.reviews = reviews

    def __getitem__(self, idx):
        """Return the review stored at position ``idx``."""
        item = self.reviews[idx]
        return item

    def __len__(self):
        """Return how many reviews the dataset holds."""
        size = len(self.reviews)
        return size
    
# Define a function for preparing the data for training the word2vec model
def prepare_data(reviews):
    """Tokenize reviews and map every remaining word to an integer index.

    Each review is lower-cased, tokenized with ``word_tokenize`` and stripped
    of the words in ``stop_words``. Indices are assigned in order of first
    appearance across the reviews.

    The previous implementation also ran ``pos_tag`` over every review and
    counted word frequencies, but neither the tags nor the counts influenced
    the returned values, so that work has been dropped; the output is
    unchanged.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A tuple ``(review_indices, word_list, word_to_idx)`` where
        ``review_indices`` holds one list of word indices per review,
        ``word_list`` lists the unique words in index order, and
        ``word_to_idx`` maps each word to its index.
    """
    # Tokenize the reviews into individual words and remove stop words
    tokens = [
        [word for word in word_tokenize(review.lower()) if word not in stop_words]
        for review in reviews
    ]

    # Assign the next free index to every word the first time it is seen
    word_to_idx = {}
    for review in tokens:
        for word in review:
            word_to_idx.setdefault(word, len(word_to_idx))

    # dicts preserve insertion order, so this list is already in index order
    word_list = list(word_to_idx)

    # Create a list of indices for each review
    review_indices = [[word_to_idx[word] for word in review] for review in tokens]

    return review_indices, word_list, word_to_idx

# Define the Word2Vec model
# NOTE(review): the Word2Vec class defined earlier in this file takes
# (vocab_size, embedding_size). The keyword arguments used here (sentences,
# size, window, min_count, workers, sg) do not match that constructor, and
# the .train(sentences=..., total_examples=..., epochs=...) call and the .wv
# attribute used below are not provided by that class either. This section
# looks like it targets a gensim-style Word2Vec API -- TODO confirm which
# implementation is intended and import it explicitly.
# NOTE(review): X_train_tokens is not defined in the visible part of this
# file; presumably a list of token lists for the training split -- verify.
model = Word2Vec(sentences=X_train_tokens, size=100, window=5, min_count=5, workers=4, sg=1)

# Train the Word2Vec model
model.train(sentences=X_train_tokens, total_examples=len(X_train_tokens), epochs=10)

# Retrieve the word vectors from the model
word_vectors = model.wv

# Get the vector for a specific word
print(word_vectors['good'])

# Get the most similar words to a specific word
print(word_vectors.most_similar('good'))

# Define a function to get the average word vector for a review
def get_average_word_vectors(reviews, word_vectors):
    """Average the vectors of the known words in each review.

    Args:
        reviews: Iterable of reviews, each an iterable of words.
        word_vectors: Object exposing ``vector_size``, membership tests
            (``word in word_vectors``) and lookup (``word_vectors[word]``).

    Returns:
        A NumPy array with one float32 row of length
        ``word_vectors.vector_size`` per review. A review with no known
        words yields an all-zero row.
    """
    dim = word_vectors.vector_size
    averaged = []
    for tokens in reviews:
        # Collect the vectors of the words present in the vocabulary
        known = [word_vectors[token] for token in tokens if token in word_vectors]

        # Sum in place into a float32 accumulator, in the original word order
        total = np.zeros((dim,), dtype="float32")
        for vector in known:
            total += vector

        # Leave the zero vector untouched when nothing was found
        if known:
            total /= len(known)
        averaged.append(total)
    return np.array(averaged)

# Turn each tokenized split into one averaged word vector per review
X_train_vectors = get_average_word_vectors(reviews=X_train_tokens, word_vectors=word_vectors)
X_test_vectors = get_average_word_vectors(reviews=X_test_tokens, word_vectors=word_vectors)

# Define a simple neural network model
class SimpleNN(torch.nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear.

    The layers are referenced through ``torch.nn`` because this file imports
    ``torch`` but never binds the bare name ``nn``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Build the two linear layers.

        Args:
            input_dim: Size of each input feature vector.
            hidden_dim: Width of the hidden layer.
            output_dim: Number of output scores per sample.
        """
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the unnormalized output scores for ``x``."""
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Instantiate the model
input_dim = word_vectors.vector_size
hidden_dim = 100
output_dim = len(np.unique(y_train))
model = SimpleNN(input_dim, hidden_dim, output_dim)

# Define the loss function and optimizer.
# `torch.nn` and `torch.utils.data` are referenced through the imported
# `torch` package because the bare names `nn` and `TensorDataset` are never
# imported in this file.
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
# NOTE(review): assumes y_train / y_test are NumPy integer arrays of class
# indices (CrossEntropyLoss expects integer class targets) -- TODO confirm.
# NOTE(review): `device` is defined near the top of the file but is not used
# here; model and batches stay on the default device.
num_epochs = 10
batch_size = 32
train_dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_train_vectors), torch.from_numpy(y_train)
)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report progress every 100 mini-batches
        if (i+1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")

# Evaluate the model on the test set
test_dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_test_vectors), torch.from_numpy(y_test)
)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_dataloader:
        outputs = model(inputs)
        # Predicted class = index of the largest score in each row
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")