In [None]:
class Word2VecEncoder:
    """Train a Word2Vec model on a text column and encode rows as averaged word vectors.

    The model is fitted on the training DataFrame only; the training and the
    test DataFrame are then both encoded with that model. ``Word2Vec`` and
    ``word_tokenize`` must be available at module level (presumably gensim's
    and NLTK's -- confirm against the file's imports).

    Typical use::

        encoder = Word2VecEncoder(train_df, test_df, "text")
        encoder.train_word2vec()
        encoder.encode_dataframes()
        train_encoded, test_encoded = encoder.get_encoded_dataframes()
    """

    def __init__(self, train_df, test_df, text_column: str, vector_size: int = 100,
                 window: int = 5, min_count: int = 5, sg: int = 0):
        """Store the data and the hyper-parameters; no training happens here.

        Args:
            train_df: DataFrame whose ``text_column`` is used to fit the model.
            test_df: DataFrame that is only encoded, never trained on.
            text_column: Name of the column holding the raw text.
            vector_size: Forwarded unchanged to ``Word2Vec``.
            window: Forwarded unchanged to ``Word2Vec``.
            min_count: Forwarded unchanged to ``Word2Vec``.
            sg: Forwarded unchanged to ``Word2Vec``.
        """
        self.train_df = train_df
        self.test_df = test_df
        self.text_column = text_column
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.sg = sg
        # Populated by train_word2vec() and encode_dataframes() respectively.
        self.model = None
        self.train_encoded_df = None
        self.test_encoded_df = None

    def preprocess_text(self, text):
        """Tokenize ``text`` into a list of word tokens.

        Non-string values (e.g. ``None`` or ``NaN`` coming from missing cells)
        produce an empty list instead of crashing the tokenizer, so a single
        missing row cannot abort training or encoding.
        """
        if not isinstance(text, str):
            return []
        return word_tokenize(text)

    def train_word2vec(self) -> None:
        """Fit ``self.model`` on the tokenized training text column."""
        tokenized = self.train_df[self.text_column].apply(self.preprocess_text)

        # Hand Word2Vec a plain list of token lists: it is independent of the
        # DataFrame index and can be iterated as many times as training needs.
        self.model = Word2Vec(
            tokenized.tolist(),
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            sg=self.sg,
        )

    def encode_text(self, text):
        """Encode ``text`` as the average of its known word vectors.

        Tokens that are not in the trained vocabulary are skipped.

        Returns:
            The averaged vector, or ``None`` when no token of ``text`` has a
            vector (including empty or missing text).

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Word2Vec model has not been trained. Call train_word2vec() first.")

        keyed_vectors = self.model.wv
        vectors = [
            keyed_vectors[token]
            for token in self.preprocess_text(text)
            if token in keyed_vectors
        ]

        if not vectors:
            # Nothing to average; callers must be prepared for a missing encoding.
            return None
        return sum(vectors) / len(vectors)

    def encode_dataframes(self) -> None:
        """Add a ``<text_column>_encoded`` column to copies of both DataFrames.

        The copies are stored on ``train_encoded_df`` / ``test_encoded_df``;
        the DataFrames passed to the constructor are left untouched.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError("Word2Vec model has not been trained. Call train_word2vec() first.")

        encoded_column = f'{self.text_column}_encoded'

        self.train_encoded_df = self.train_df.copy()
        self.train_encoded_df[encoded_column] = (
            self.train_encoded_df[self.text_column].apply(self.encode_text)
        )

        self.test_encoded_df = self.test_df.copy()
        self.test_encoded_df[encoded_column] = (
            self.test_encoded_df[self.text_column].apply(self.encode_text)
        )

    def get_encoded_dataframes(self):
        """Return ``(train_encoded_df, test_encoded_df)``.

        Both entries are ``None`` until ``encode_dataframes()`` has run.
        """
        return self.train_encoded_df, self.test_encoded_df