## Preprocessing

In [6]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m-:--:--[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


In [4]:
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/dianalaura/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dianalaura/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
import re
import emoji
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from unicodedata import normalize
import pandas as pd

In [9]:
test_dataset = pd.read_csv('/Users/dianalaura/Documents/MARIO PROJECT/marioproject/archivos/test.csv')
test_dataset.columns = ['Polarity', 'Title', 'Text']
test_dataset.head()

Unnamed: 0,Polarity,Title,Text
0,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
1,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
2,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
3,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...
4,1,DVD Player crapped out after one year,I also began having the incorrect disc problem...


In [10]:
train_dataset = pd.read_csv('/Users/dianalaura/Documents/MARIO PROJECT/marioproject/archivos/train.csv')
train_dataset.columns = ['Polarity', 'Title', 'Text']
train_dataset.head()

Unnamed: 0,Polarity,Title,Text
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...


In [11]:
def preprocess_and_clean_dataset(df):

    def preprocess_text(text):
        text = re.sub(r'http\S+|www\S+|@\w+|#', '', text)
        text = normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        text = text.lower()
        text = emoji.replace_emoji(text, replace='')
        text = re.sub(r'[^a-z\s]', '', text)

        tokens = word_tokenize(text)

        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]

        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

        return " ".join(tokens)

    df['Review'] = df['Title'].fillna('') + " " + df['Text'].fillna('')

    df['Review'] = df['Review'].apply(preprocess_text)

    df = df.drop_duplicates(subset='Review')

    df = df[['Polarity', 'Review']]
    return df

In [15]:
test_dataset = preprocess_and_clean_dataset(test_dataset)

In [16]:
train_dataset = preprocess_and_clean_dataset(train_dataset)

## DIANA - The Generative Model I


## 1. Perform pre processing (as needed) to train the phrase generator.

In [26]:
print("Columns available in the dataset:")
print(train_dataset.columns)


Columnas disponibles en el dataset:
Index(['Polarity', 'Review', 'inverted_polarity'], dtype='object')


In [27]:
def prepare_dataset(dataset):
    if 'Review' not in dataset.columns or 'Polarity' not in dataset.columns:
        raise ValueError("The dataset does not contain the necessary columns: 'Review' and 'Polarity'")
    
    # Reverse polarity
    dataset['inverted_polarity'] = dataset['Polarity'].map({1: 2, 2: 1})
    dataset['target_text'] = dataset['Review'].apply(lambda x: invert_sentiment_text(x))  # Reverse text
    return dataset

def invert_sentiment_text(text):
    replacements = {
        "good": "bad",
        "excellent": "terrible",
        "happy": "sad",
        # Add more replacements as needed
    }
    for word, replacement in replacements.items():
        text = text.replace(word, replacement)
    return text

# Apply preprocessing to the dataset
train_dataset = prepare_dataset(train_dataset)


In [28]:
print(train_dataset[['Review', 'target_text', 'Polarity', 'inverted_polarity']].head())


                                              Review  \
0  best soundtrack ever anyth im read lot review ...   
1  amaz soundtrack favorit music time hand intens...   
2  excel soundtrack truli like soundtrack enjoy v...   
3  rememb pull jaw floor hear youv play game know...   
4  absolut masterpiec quit sure actual take time ...   

                                         target_text  Polarity  \
0  best soundtrack ever anyth im read lot review ...         2   
1  amaz soundtrack favorit music time hand intens...         2   
2  excel soundtrack truli like soundtrack enjoy v...         2   
3  rememb pull jaw floor hear youv play game know...         2   
4  absolut masterpiec quit sure actual take time ...         2   

   inverted_polarity  
0                  1  
1                  1  
2                  1  
3                  1  
4                  1  


In [29]:
train_dataset.to_csv("train_preprocessed_inverted.csv", index=False)
print("Processed file saved as 'train_preprocessed_inverted.csv'")


Archivo procesado guardado como 'train_preprocessed_inverted.csv'


## 2. Create an encoder decoder generative model to convert the classes.

1. Prepare the Text (Tokenization and Sequences)


In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# ParAMETERS
max_vocab_size = 20000  # Maximum number of words in the vocabulary
max_seq_length = 100  # Maximum length of sequences

# Tokenizer for texts
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_dataset['Review'])  # Fit to original text

# Convert text to integer sequences
encoder_input_sequences = tokenizer.texts_to_sequences(train_dataset['Review'])
decoder_input_sequences = tokenizer.texts_to_sequences(train_dataset['target_text'])

# Padding (padding to have all sequences the same length)
encoder_input_data = pad_sequences(encoder_input_sequences, maxlen=max_seq_length, padding='post')
decoder_input_data = pad_sequences(decoder_input_sequences, maxlen=max_seq_length, padding='post')

# Create tags (expected decoder output)
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]  # Shift so that the output is aligned


Matplotlib is building the font cache; this may take a moment.


2. Define the Encoder-Decoder Model

In [31]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Model parameters
embedding_dim = 128  # Dimension of the embedding vector (word embeddings)
latent_dim = 256  #Latent space dimension (LSTM)

# Encoder
encoder_inputs = Input(shape=(max_seq_length,))
encoder_embedding = Embedding(max_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_seq_length,))
decoder_embedding = Embedding(max_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(max_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Complete model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compilation
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


3. Train the Model

In [32]:
# Training
batch_size = 64
epochs = 20

model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)


Epoch 1/20
[1m21806/44980[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m21:21:31[0m 3s/step - accuracy: 0.5943 - loss: 7.0658

4. Evaluate the model

In [None]:
# Function to generate inverted text
def generate_inverted_text(input_text):
    input_sequence = tokenizer.texts_to_sequences([input_text])
    input_sequence = pad_sequences(input_sequence, maxlen=max_seq_length, padding='post')
    
    states_value = encoder_lstm.predict(input_sequence)
    target_sequence = np.zeros((1, max_seq_length))
    target_sequence[0, 0] = tokenizer.word_index['<start>']  # Startup token
    
    stop_condition = False
    generated_text = ''
    while not stop_condition:
        output_tokens, h, c = decoder_lstm.predict([target_sequence] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word[sampled_token_index]
        
        if sampled_word == '<end>' or len(generated_text) > max_seq_length:
            stop_condition = True
        else:
            generated_text += ' ' + sampled_word
        
        target_sequence = np.zeros((1, max_seq_length))
        target_sequence[0, 0] = sampled_token_index
        states_value = [h, c]
    
    return generated_text


## 3. Build the proper datasets to create the Generative Model using the polarities.

In [None]:
import pandas as pd
import numpy as np

# Function to generate text using the trained model
def generate_text(input_text, tokenizer, encoder_model, decoder_model, max_seq_length):
    input_sequence = tokenizer.texts_to_sequences([input_text])
    input_sequence = pad_sequences(input_sequence, maxlen=max_seq_length, padding='post')
    
    # Get encoder states
    states_value = encoder_model.predict(input_sequence)
    
    # Initial sequence for the decoder
    target_sequence = np.zeros((1, max_seq_length))
    target_sequence[0, 0] = tokenizer.word_index['<start>']  # Startup token

    stop_condition = False
    generated_text = ""
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_sequence] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, "")
        
        if sampled_word == "<end>" or len(generated_text.split()) > max_seq_length:
            stop_condition = True
        else:
            generated_text += " " + sampled_word
        
        # Update the target sequence
        target_sequence = np.zeros((1, max_seq_length))
        target_sequence[0, 0] = sampled_token_index
        states_value = [h, c]
    
    return generated_text.strip()

# Create the test dataset with generated texts
def create_final_dataset(test_dataset, tokenizer, encoder_model, decoder_model, max_seq_length):
    test_dataset['generated_text'] = test_dataset['Review'].apply(
        lambda x: generate_text(x, tokenizer, encoder_model, decoder_model, max_seq_length)
    )
    return test_dataset

# Assume that you already have the model trained and the variables prepared
# test_dataset: Preprocessed test dataset
final_test_dataset = create_final_dataset(test_dataset, tokenizer, model, model, max_seq_length)

# Save the final dataset
final_test_dataset.to_csv("final_test_dataset_with_generated_text.csv", index=False)
print("Archivo final generado: 'final_test_dataset_with_generated_text.csv'")
