In [None]:
!pip install kaggle
from google.colab import files

# Move the kaggle.json file to the appropriate location
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d kritanjalijain/amazon-reviews

# Unzip the dataset
!unzip amazon-reviews.zip -d amazon_reviews

mv: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews
License(s): CC0-1.0
Downloading amazon-reviews.zip to /content
 99% 1.29G/1.29G [00:18<00:00, 164MB/s]
100% 1.29G/1.29G [00:18<00:00, 75.6MB/s]
Archive:  amazon-reviews.zip
  inflating: amazon_reviews/amazon_review_polarity_csv.tgz  
  inflating: amazon_reviews/test.csv  
  inflating: amazon_reviews/train.csv  


In [None]:
import pandas as pd

# Read the review CSV, which has no header row.
# NOTE: despite the `train_data` name, the file read here is test.csv.
column_names = ['polarity', 'title', 'text']
train_data = pd.read_csv("/content/amazon_reviews/test.csv", header=None)

# Attach readable column labels in place of the default integer ones.
train_data = train_data.set_axis(column_names, axis=1)

# Preview the first rows as a sanity check.
train_data.head()

Unnamed: 0,polarity,title,text
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [None]:
# Keep only the label and the review body; the title column is dropped.
train_data = train_data.loc[:, ['polarity', 'text']]
train_data.head()

Unnamed: 0,polarity,text
0,2,My lovely Pat has one of the GREAT voices of h...
1,2,Despite the fact that I have only played a sma...
2,1,I bought this charger in Jul 2003 and it worke...
3,2,Check out Maha Energy's website. Their Powerex...
4,2,Reviewed quite a bit of the combo players and ...


In [None]:
# Draw a class-balanced 10% subsample of the loaded reviews.

# Split the frame by label (polarity 2 -> positive, polarity 1 -> negative).
positive_samples = train_data.loc[train_data['polarity'].eq(2)]
negative_samples = train_data.loc[train_data['polarity'].eq(1)]

# Target size: 10% of all rows, divided evenly between the two classes.
sample_size = int(len(train_data) * 0.1)
half_sample_size = sample_size // 2

# Fixed seed so the same rows are drawn on every run.
positive_sample = positive_samples.sample(n=half_sample_size, random_state=42)
negative_sample = negative_samples.sample(n=half_sample_size, random_state=42)

# Stack both classes, then shuffle so the labels are interleaved.
sample_data = (
    pd.concat([positive_sample, negative_sample], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the balanced sample for later use.
sample_data.to_csv("balanced_train_sample.csv", index=False)

# Report the resulting size for verification.
print(f"Total sample size: {len(sample_data)}")

Total sample size: 40000


In [None]:
# Preview the shuffled, balanced sample.
sample_data.head(n=5)

Unnamed: 0,polarity,text
0,1,So much promise but so little effort! This rea...
1,2,Finally someone got it right; Blending virtuos...
2,1,I bought this card for my XFX 8800gtx and coul...
3,2,THIS IS A FUNNY MOVIE BUT NOT FOR CHILDREN DUE...
4,1,Let's take a few moments to talk about one of ...


In [None]:
!pip install nltk

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import pandas as pd

class TextPreprocessor:
    """Normalize raw review text for downstream modelling.

    Cleaning lowercases the text, deletes every character that is not an
    ASCII letter or whitespace, tokenizes with NLTK's ``word_tokenize`` and
    drops English stopwords.
    """

    def __init__(self):
        # Raw string: '\s' inside a plain string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        self.char_filter = re.compile(r'[^a-zA-Z\s]')
        # Requires the NLTK 'stopwords' corpus to have been downloaded.
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text: str) -> str:
        """
        Clean and preprocess a text string.

        Args:
            text (str): Text string to clean. Any non-string value (for
                example NaN from a DataFrame) is treated as an empty string.

        Returns:
            str: The remaining tokens joined by single spaces.
        """
        if not isinstance(text, str):
            text = ""

        # Convert to lowercase
        text = text.lower()

        # Remove special characters. They are deleted rather than replaced by
        # a space, so words separated only by punctuation are merged.
        text = self.char_filter.sub('', text)

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords and join back to string
        tokens = [token for token in tokens if token not in self.stop_words]

        return ' '.join(tokens)

    def preprocess_dataset(self, df: pd.DataFrame, columns=(2, 1)) -> pd.DataFrame:
        """
        Preprocess the entire dataset.
        Returns a new DataFrame with cleaned text; ``df`` itself is not modified.

        Args:
            df (pd.DataFrame): DataFrame to preprocess.
            columns: Labels of the columns to clean. Defaults to the integer
                labels ``2`` and ``1`` of a header-less frame; pass e.g.
                ``['title', 'text']`` when the columns have been renamed.

        Raises:
            KeyError: If a requested column is not present in ``df``.
        """
        processed_df = df.copy()
        for column in columns:
            processed_df[column] = processed_df[column].apply(self.clean_text)

        return processed_df

In [None]:
# Build the cleaner once and reuse it for every row.
preprocessor = TextPreprocessor()

# Clean the 'text' column (the only free-text column left in sample_data).
sample_data['text'] = sample_data['text'].map(preprocessor.clean_text)

# Persist the cleaned sample.
sample_data.to_csv("cleaned_sample_data.csv", index=False)

# Preview the cleaned rows for verification.
sample_data.head()

Unnamed: 0,polarity,text
0,1,much promise little effort really could done w...
1,2,finally someone got right blending virtuosity ...
2,1,bought card xfx gtx couldnt handle rma bought ...
3,2,funny movie children due profanitythe instant ...
4,1,lets take moments talk one worse movies histor...


In [None]:
# Positive reviews (polarity 2) become inputs; negative ones (polarity 1) targets.
input_texts = sample_data.loc[sample_data['polarity'] == 2, 'text']
target_texts = sample_data.loc[sample_data['polarity'] == 1, 'text']

# Pair the two series row by row: both indices are reset so that row k of the
# inputs sits next to row k of the targets.
paired_columns = {'input_texts': input_texts, 'target_texts': target_texts}
new_dataset = pd.DataFrame({
    name: series.reset_index(drop=True)
    for name, series in paired_columns.items()
})

# Write the paired dataset to disk for further use.
output_path = '/content/polarity_based_dataset.csv'
new_dataset.to_csv(output_path, index=False)

new_dataset.head()

Unnamed: 0,input_texts,target_texts
0,finally someone got right blending virtuosity ...,much promise little effort really could done w...
1,funny movie children due profanitythe instant ...,bought card xfx gtx couldnt handle rma bought ...
2,cindy second generation eventing portions seri...,lets take moments talk one worse movies histor...
3,book appears excellent beginning reader subjec...,product arrived filled water leaked valve trie...
4,reviewers feel like artist somehow owe politic...,sad guys get best hard rock award every awards...


In [None]:
def sample(preds, temperature=1.0):
    """
    Sample a token index from the predicted distribution using temperature scaling.

    Args:
        preds: 1-D array-like of non-negative scores/probabilities, one per token.
        temperature: Values below 1 sharpen the distribution and values above 1
            flatten it. A value of 0 or less selects the most likely token
            deterministically instead of dividing by zero.

    Returns:
        int: Index of the selected token.
    """
    preds = np.asarray(preds).astype('float64')

    # Greedy decoding: avoids a division by zero in the scaling step below.
    if temperature <= 0:
        return int(np.argmax(preds))

    # The small epsilon keeps log() finite for zero-probability entries.
    scaled = np.log(preds + 1e-9) / temperature
    # Shift by the maximum before exponentiating. Without it, a very small
    # temperature underflows every term to 0 and the normalization yields NaN.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)

    # One multinomial draw produces a one-hot vector; argmax recovers the index.
    probas = np.random.multinomial(1, probs, 1)
    return int(np.argmax(probas))

In [None]:
# Build the character vocabularies and sequence statistics for the model.

# Texts are truncated to this many characters.
max_length = 200

input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
# NOTE: num_samples is assigned here but the loop below never reads it, so
# every row of new_dataset is processed.
num_samples = 10000

for source_text, reply_text in zip(new_dataset['input_texts'],
                                   new_dataset['target_texts']):
    input_text = source_text[:max_length]
    # '\t' is prepended as the start marker and '\n' appended as the end marker.
    target_text = '\t' + reply_text[:max_length] + '\n'

    input_texts.append(input_text)
    target_texts.append(target_text)

    # Collect every distinct character seen on each side.
    input_characters.update(input_text)
    target_characters.update(target_text)

# Freeze the vocabularies in sorted order and derive the tensor dimensions.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

# Summary of what was built.
output_stats = {
    "Number of samples": len(input_texts),
    "Number of unique input tokens": num_encoder_tokens,
    "Number of unique output tokens": num_decoder_tokens,
    "Max sequence length for inputs": max_encoder_seq_length,
    "Max sequence length for outputs": max_decoder_seq_length,
}

print(output_stats)

{'Number of samples': 20000, 'Number of unique input tokens': 27, 'Number of unique output tokens': 29, 'Max sequence length for inputs': 200, 'Max sequence length for outputs': 202}


In [None]:
from __future__ import print_function
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

In [None]:
# Training hyperparameters read by the model-building and training code.
batch_size = 64  # Batch size for training.
epochs = 15  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space (passed to both LSTM layers).
# NOTE(review): no cell shown in this notebook reads num_samples, so it does
# not currently limit the training set — confirm whether a cap is intended.
num_samples = 10000  # Number of samples to train on.

In [None]:
# Map every character to its position in the one-hot vocabulary.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

# One-hot tensors indexed as [sample, timestep, character].
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

# Positions past the end of a text are padded with the space character.
input_pad_index = input_token_index[' ']
target_pad_index = target_token_index[' ']

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    # Pad from the text's own length, not from the leaked loop variable `t`.
    # For an empty text the loop above never runs, so `t` would still hold
    # the value left over from the previous sample.
    encoder_input_data[i, len(input_text):, input_pad_index] = 1.

    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one
            # timestep and does not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    decoder_input_data[i, len(target_text):, target_pad_index] = 1.
    # The target is shifted left by one, so its padding starts one step earlier.
    decoder_target_data[i, len(target_text) - 1:, target_pad_index] = 1.
# Training model: an encoder LSTM whose final states initialise a decoder
# LSTM, followed by a softmax over the target characters.

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Per-timestep probability distribution over the target characters.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training; validation_split=0.2 holds out 20% of the samples for validation.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)
# Save the trained model to 's2s.h5' in the working directory.
model.save('s2s.h5')

Epoch 1/15
[1m107/250[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m4:48[0m 2s/step - accuracy: 0.2625 - loss: 2.7678

In [None]:
# Inference mode (sampling) works as follows:
#   1) Encode the input and use the encoder states as the initial decoder state.
#   2) Run one decoder step with that state and the "start of sequence" token;
#      the output is the next target token.
#   3) Feed the produced token and the updated states back in, and repeat.

# Encoder half: maps an input sequence to its final [h, c] states.
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder half: the states are explicit inputs, so the trained decoder layers
# can be stepped one character at a time.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states])

# Index -> character lookups, used to turn sampled indices back into text.
reverse_input_char_index = {
    i: char for char, i in input_token_index.items()}
reverse_target_char_index = {
    i: char for char, i in target_token_index.items()}


def decode_sequence(input_seq, temperature=1.0):
    """
    Generate an output string for one encoded input sequence.

    The input is encoded once, then the decoder is stepped one character at a
    time, sampling each character with `sample` at the given temperature.

    Args:
        input_seq: One-hot encoded input passed to `encoder_model.predict`;
            the demo loop supplies a single-sample slice of
            `encoder_input_data`.
        temperature: Forwarded to `sample`.

    Returns:
        str: The generated characters, including the terminating '\n' when
        one was produced before the length cap was reached.
    """
    # verbose=0: predict() runs once per generated character, and the default
    # progress bar would flood the notebook output.
    # list(...) keeps the concatenation below valid if predict returns a tuple.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Start with a length-1 target sequence holding the start character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0)

        # Sample a token using temperature scaling
        sampled_token_index = sample(output_tokens[0, -1, :], temperature)
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: stop character or max length.
        if sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length:
            break

        # Feed the sampled character back in as the next decoder input.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Carry the decoder state forward to the next step.
        states_value = [h, c]

    return decoded_sentence


# Try decoding multiple sequences: decode the first 10 encoded inputs and
# print each generated text next to its source text.
for seq_index in range(10):  # Limit to 10 for simplicity
    # Slice (rather than index) so the leading sample axis is kept.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq, temperature=0.8)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)