In [1]:
import os
import tarfile
import urllib.request

# URL of the dataset
url = "https://www.cs.jhu.edu/~mdredze/datasets/sentiment/domain_sentiment_data.tar.gz"

# File name for the downloaded dataset
dataset_file = "domain_sentiment_data.tar.gz"

# Directory to extract the dataset
extract_dir = "./domain_sentiment_data"

# Step 1: Download the dataset
if not os.path.exists(dataset_file):
    print("Downloading dataset...")
    urllib.request.urlretrieve(url, dataset_file)
    print("Download completed!")

# Step 2: Extract the dataset
if not os.path.exists(extract_dir):
    print("Extracting dataset...")
    with tarfile.open(dataset_file, "r:gz") as tar:
        tar.extractall(path=extract_dir)
    print("Extraction completed!")

# Step 3: List the extracted files
print("Extracted files:")
for root, dirs, files in os.walk(extract_dir):
    for file in files:
        print(os.path.join(root, file))


Downloading dataset...
Download completed!
Extracting dataset...


  tar.extractall(path=extract_dir)


Extraction completed!
Extracted files:
./domain_sentiment_data\sorted_data_acl\books\negative.review
./domain_sentiment_data\sorted_data_acl\books\positive.review
./domain_sentiment_data\sorted_data_acl\dvd\negative.review
./domain_sentiment_data\sorted_data_acl\dvd\positive.review
./domain_sentiment_data\sorted_data_acl\dvd\unlabeled.review
./domain_sentiment_data\sorted_data_acl\electronics\negative.review
./domain_sentiment_data\sorted_data_acl\electronics\positive.review
./domain_sentiment_data\sorted_data_acl\electronics\unlabeled.review
./domain_sentiment_data\sorted_data_acl\kitchen_&_housewares\negative.review
./domain_sentiment_data\sorted_data_acl\kitchen_&_housewares\positive.review
./domain_sentiment_data\sorted_data_acl\kitchen_&_housewares\unlabeled.review


In [2]:
# Load a sample review file (e.g., books/positive.review)
file_path = "./domain_sentiment_data/sorted_data_acl/books/positive.review"

# Read and display the content
with open(file_path, "r", encoding="utf-8") as file:
    data = file.readlines()

# Display the first few lines
print("Sample data from positive reviews (books):")
print(data[:5])


Sample data from positive reviews (books):
['<review>\n', '<unique_id>\n', '0785758968:one_of_the_best_crichton_novels:joseph_m\n', '</unique_id>\n', '<asin>\n']


In [3]:
import os
import random

# Define paths to the review files
review_files = {
    "positive": [
        "./domain_sentiment_data/sorted_data_acl/books/positive.review",
        "./domain_sentiment_data/sorted_data_acl/electronics/positive.review",
        "./domain_sentiment_data/sorted_data_acl/kitchen_&_housewares/positive.review",
        "./domain_sentiment_data/sorted_data_acl/dvd/positive.review"
    ],
    "negative": [
        "./domain_sentiment_data/sorted_data_acl/books/negative.review",
        "./domain_sentiment_data/sorted_data_acl/electronics/negative.review",
        "./domain_sentiment_data/sorted_data_acl/kitchen_&_housewares/negative.review",
        "./domain_sentiment_data/sorted_data_acl/dvd/negative.review"
    ]
}

# Function to combine reviews into a single file
def combine_reviews(file_paths, output_file):
    combined_reviews = []
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            combined_reviews.extend(file.readlines())
    # Save combined reviews to a file
    with open(output_file, "w", encoding="utf-8") as file:
        file.writelines(combined_reviews)
    return combined_reviews

# Combine positive reviews
print("Combining positive reviews...")
positive_reviews = combine_reviews(review_files["positive"], "all_positive_reviews.txt")

# Combine negative reviews
print("Combining negative reviews...")
negative_reviews = combine_reviews(review_files["negative"], "all_negative_reviews.txt")

# Combine both positive and negative reviews randomly
print("Shuffling and combining positive and negative reviews...")
final_reviews = [(review.strip(), "positive") for review in positive_reviews] + \
                [(review.strip(), "negative") for review in negative_reviews]
random.shuffle(final_reviews)

# Save the final shuffled dataset to a file
with open("final_reviews.txt", "w", encoding="utf-8") as file:
    for review, label in final_reviews:
        file.write(f"{label}\t{review}\n")

print("Final combined dataset saved to 'final_reviews.txt'.")


Combining positive reviews...
Combining negative reviews...
Shuffling and combining positive and negative reviews...
Final combined dataset saved to 'final_reviews.txt'.


In [2]:
# Path to the final_reviews.txt file
file_path = "D:/Intelligent System/Final_group_Assignment/final_reviews.txt"

# Read and display the first 20 rows
print("First 20 rows of final_reviews.txt:")
with open(file_path, "r", encoding="utf-8") as file:
    for i, line in enumerate(file):
        if i < 20:  # Print only the first 20 lines
            print(line.strip())
        else:
            break



First 20 rows of final_reviews.txt:
positive	january 22 2006
positive	oppo opdv971h digital hdready upconverting dvd player electronics
negative	16 of 18
positive	july 26 2006
positive	9 of 9
negative	zojirushi cbaa10 sesame seed grinder 45 grams kitchen  housewares
negative	get fit kids vol 1  hustlebustle  move your muscles dvd kristi dear
positive	product desciption could be more specific
negative	october 9 2006
positive	evanston il usa
negative	decatur ga usa
positive	when star treks production crew started falling behind in both schedule and budgets during the first season they came up with the brilliant idea of using the unsold the cage pilot as the basis for a two part episode the result one of the most important and brilliant treks ever done
negative	the number  a completely different way to think about the rest of your life books lee eisenberg
positive	girl time a celebration of chick flicks bad hair days  and good friends books laura jensen walker
positive	i am very disappoin

In [None]:
import re

def preprocess_review(review):
    # Remove XML/HTML tags
    review = re.sub(r"<.*?>", "", review)
    # Remove non-alphanumeric characters except spaces
    review = re.sub(r"[^a-zA-Z0-9\s]", "", review)
    # Convert to lowercase
    review = review.lower()
    # Remove extra whitespace
    review = review.strip()
    return review

# Example usage
sample_review = "<review_text>This is an <b>amazing</b> product!</review_text>"
cleaned_review = preprocess_review(sample_review)
print("Cleaned Review:", cleaned_review)


In [None]:
# Path to the final_reviews.txt file
file_path = "/content/final_reviews.txt"

# Read and display the first 20 rows
print("First 20 rows of final_reviews.txt:")
with open(file_path, "r", encoding="utf-8") as file:
    for i, line in enumerate(file):
        if i < 20:  # Print only the first 20 lines
            print(line.strip())
        else:
            break


In [9]:
import re

def preprocess_review(review):
    # Remove XML/HTML tags
    review = re.sub(r"<.*?>", "", review)
    # Remove non-alphanumeric characters except spaces
    review = re.sub(r"[^a-zA-Z0-9\s]", "", review)
    # Convert to lowercase
    review = review.lower()
    # Remove numerical entries and short strings
    if len(review.split()) > 2:  # Retain only reviews with more than 2 words
        return review.strip()
    return None


In [10]:
import random

# Combine reviews for all products
def load_and_clean_reviews(file_paths):
    reviews = []
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                cleaned = preprocess_review(line)
                if cleaned:  # Only add cleaned reviews
                    reviews.append(cleaned)
    return reviews

# Define paths for positive and negative reviews
positive_paths = [
    "./domain_sentiment_data/sorted_data_acl/books/positive.review",
    "./domain_sentiment_data/sorted_data_acl/electronics/positive.review",
    "./domain_sentiment_data/sorted_data_acl/kitchen_&_housewares/positive.review",
    "./domain_sentiment_data/sorted_data_acl/dvd/positive.review",
]

negative_paths = [
    "./domain_sentiment_data/sorted_data_acl/books/negative.review",
    "./domain_sentiment_data/sorted_data_acl/electronics/negative.review",
    "./domain_sentiment_data/sorted_data_acl/kitchen_&_housewares/negative.review",
    "./domain_sentiment_data/sorted_data_acl/dvd/negative.review",
]

# Load and clean positive and negative reviews
print("Cleaning positive reviews...")
positive_reviews = load_and_clean_reviews(positive_paths)
print("Cleaning negative reviews...")
negative_reviews = load_and_clean_reviews(negative_paths)

# Combine and shuffle the reviews
print("Combining and shuffling reviews...")
final_reviews = [(review, "positive") for review in positive_reviews] + \
                [(review, "negative") for review in negative_reviews]
random.shuffle(final_reviews)

# Save the final dataset to a file
with open("final_reviews.txt", "w", encoding="utf-8") as file:
    for review, label in final_reviews:
        file.write(f"{label}\t{review}\n")

print("Final cleaned dataset saved to 'final_reviews.txt'.")


Cleaning positive reviews...
Cleaning negative reviews...
Combining and shuffling reviews...
Final cleaned dataset saved to 'final_reviews.txt'.


In [11]:
# Path to the final_reviews.txt file
file_path = "final_reviews.txt"

# Display the first 20 rows
print("First 20 rows of final_reviews.txt:")
with open(file_path, "r", encoding="utf-8") as file:
    for i, line in enumerate(file):
        if i < 20:  # Print only the first 20 lines
            print(line.strip())
        else:
            break


First 20 rows of final_reviews.txt:
positive	january 22 2006
positive	oppo opdv971h digital hdready upconverting dvd player electronics
negative	16 of 18
positive	july 26 2006
positive	9 of 9
negative	zojirushi cbaa10 sesame seed grinder 45 grams kitchen  housewares
negative	get fit kids vol 1  hustlebustle  move your muscles dvd kristi dear
positive	product desciption could be more specific
negative	october 9 2006
positive	evanston il usa
negative	decatur ga usa
positive	when star treks production crew started falling behind in both schedule and budgets during the first season they came up with the brilliant idea of using the unsold the cage pilot as the basis for a two part episode the result one of the most important and brilliant treks ever done
negative	the number  a completely different way to think about the rest of your life books lee eisenberg
positive	girl time a celebration of chick flicks bad hair days  and good friends books laura jensen walker
positive	i am very disappoin

In [12]:
def clean_and_rebuild_dataset_v5(positive_paths, negative_paths, output_file):
    import random

    def load_clean_reviews(paths):
        reviews = []
        for path in paths:
            with open(path, "r", encoding="utf-8") as file:
                for line in file:
                    cleaned = preprocess_review(line)
                    if cleaned:  # Keep only valid reviews
                        reviews.append(cleaned)
        return reviews

    # Clean positive and negative reviews
    print("Cleaning positive reviews...")
    positive_reviews = load_clean_reviews(positive_paths)
    print("Cleaning negative reviews...")
    negative_reviews = load_clean_reviews(negative_paths)

    # Combine and shuffle
    print("Combining and shuffling reviews...")
    final_reviews = [(review, "positive") for review in positive_reviews] + \
                    [(review, "negative") for review in negative_reviews]
    random.shuffle(final_reviews)

    # Save to file
    with open(output_file, "w", encoding="utf-8") as file:
        for review, label in final_reviews:
            file.write(f"{label}\t{review}\n")

    print(f"Final dataset saved to {output_file}")

# Define paths
positive_paths = [
    "./domain_sentiment_data/sorted_data_acl/books/positive.review",
    "./domain_sentiment_data/sorted_data_acl/electronics/positive.review",
    "./domain_sentiment_data/sorted_data_acl/kitchen_&_housewares/positive.review",
    "./domain_sentiment_data/sorted_data_acl/dvd/positive.review",
]

negative_paths = [
    "./domain_sentiment_data/sorted_data_acl/books/negative.review",
    "./domain_sentiment_data/sorted_data_acl/electronics/negative.review",
    "./domain_sentiment_data/sorted_data_acl/kitchen_&_housewares/negative.review",
    "./domain_sentiment_data/sorted_data_acl/dvd/negative.review",
]

# Process and save
clean_and_rebuild_dataset_v5(positive_paths, negative_paths, "final_reviews_cleaned_v5.txt")


Cleaning positive reviews...
Cleaning negative reviews...
Combining and shuffling reviews...
Final dataset saved to final_reviews_cleaned_v5.txt


In [13]:
print("First 20 rows of final_reviews_cleaned_v5.txt:")
with open("final_reviews_cleaned_v5.txt", "r", encoding="utf-8") as file:
    for i, line in enumerate(file):
        if i < 20:
            print(line.strip())
        else:
            break


First 20 rows of final_reviews_cleaned_v5.txt:
positive	controls takes getting used to
positive	we bought a model home for custom built homes and there was alot of heavy traffic foot in the house not to mention we have pets in the house
negative	first of all evolution in the sense of common descent is not impossible even idiot superstars michael behe stephen meyer and jonathan wells have admitted as much see a summary of their recent testimony in the kansas biology curriculum hearings in the evolutioncreationism forum at the west virginia gazette wvgazettemail put forums after com so any idiot who thinks all that bafflegab about irreducible complexity ic and the nonsense in this film is the death knell of evolution is simply ignorant  man evolved from nonhuman ancestors get over it
negative	april 11 2006
positive	serge j van steenkiste
negative	2 of 4
positive	november 16 2006
positive	july 21 2006
negative	theresa clare islandgrrrl
negative	i thought id come away with useful informati

In [4]:
# Clean the file to ensure each line has exactly two fields
cleaned_lines = []
file_path = "D:/Intelligent System/Final_group_Assignment/final_reviews_cleaned_v5.txt"
cleaned_file_path = "D:/Intelligent System/Final_group_Assignment/final_reviews_cleaned_v5_fixed.txt"

with open(file_path, "r", encoding="utf-8") as file:
    for i, line in enumerate(file):
        # Split the line into fields based on tabs
        fields = line.strip().split("\t")
        if len(fields) == 2:  # Keep only valid lines
            cleaned_lines.append(line)
        else:
            print(f"Skipping invalid line {i + 1}: {line.strip()}")

# Save the cleaned dataset
with open(cleaned_file_path, "w", encoding="utf-8") as file:
    file.writelines(cleaned_lines)

print(f"Cleaned dataset saved to {cleaned_file_path}")


Skipping invalid line 1458: negative	6	the next brother is sentenced to be suffocated  hes shovelled into an oven
Skipping invalid line 1943: negative	4	volume is a bit underpowered will not knock your socks off
Skipping invalid line 3571: negative	ok i take my words back i found a solution to the connection problem with windows xp download a patch 	windowsxpkb884020x86enuexe solving wpa issues from microsoft then everything goes as it should be
Skipping invalid line 4164: negative	3	there is no support at all from the dlo website it just lets me download a pdf of the manual or to buy more accessories  i did not even bother contacting them due to all the problems   i am not impressed
Skipping invalid line 5513: negative	2	nice look to the unit
Skipping invalid line 8702: negative	3	welllit displays
Skipping invalid line 10195: negative	3	cannot search the phonebook if you sorted it alphabetically and
Skipping invalid line 10621: negative	4	easy to understand voice prompts when accessin

In [7]:
import pandas as pd

# Reload the cleaned dataset
data = pd.read_csv(
    "D:/Intelligent System/Final_group_Assignment/final_reviews_cleaned_v5_fixed.txt",
    sep="\t",
    header=None,
    names=["label", "review"]
)

# Display the first few rows
print(data.head())
print(f"Dataset size: {data.shape}")


      label                                             review
0  positive                     controls takes getting used to
1  positive  we bought a model home for custom built homes ...
2  negative  first of all evolution in the sense of common ...
3  negative                                      april 11 2006
4  positive                             serge j van steenkiste
Dataset size: (53915, 2)


In [9]:
import pandas as pd
import re

# Load the dataset
file_path = "D:/Intelligent System/Final_group_Assignment/final_reviews_cleaned_v5_fixed.txt"
data = pd.read_csv(file_path, sep="\t", header=None, names=["label", "review"])

# Preprocessing function
def preprocess_text(text):
    # Remove punctuation
    text = re.sub(r"[^\w\s]", "", text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = text.strip()
    return text

# Apply preprocessing to reviews
data["review"] = data["review"].apply(preprocess_text)

# Map labels to numerical values (encode labels)
data["label"] = data["label"].map({"positive": 1, "negative": 0})

# Display sample data
print(data.head())


   label                                             review
0      1                     controls takes getting used to
1      1  we bought a model home for custom built homes ...
2      0  first of all evolution in the sense of common ...
3      0                                      april 11 2006
4      1                             serge j van steenkiste


In [10]:
# Remove reviews with fewer than 5 words or more than 200 words
data = data[data["review"].apply(lambda x: 5 <= len(x.split()) <= 200)]

# Display the shape of the dataset after outlier removal
print("Dataset shape after outlier removal:", data.shape)


Dataset shape after outlier removal: (29836, 2)


In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the reviews
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(data["review"])
word_index = tokenizer.word_index

# Convert reviews to sequences
sequences = tokenizer.texts_to_sequences(data["review"])

# Pad or truncate the sequences to a fixed length
max_length = 100  # Fixed length for all sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")

# Display sample padded data
print("Sample padded sequence:", padded_sequences[0])


Sample padded sequence: [2079  346  284  131    5    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [25]:
from sklearn.model_selection import train_test_split

# Define input (X) and output (y)
X = padded_sequences
y = data["label"].values

# Split into training (80%), validation (10%), and test (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display shapes of the splits
print("Training set size:", X_train.shape)
print("Validation set size:", X_val.shape)
print("Test set size:", X_test.shape)


Training set size: (23868, 100)
Validation set size: (2984, 100)
Test set size: (2984, 100)


In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define the model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_length),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(128, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid")  # Binary classification (positive/negative)
])

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=32
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")


Epoch 1/5




[1m746/746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 134ms/step - accuracy: 0.5015 - loss: 0.6939 - val_accuracy: 0.4946 - val_loss: 0.6932
Epoch 2/5
[1m746/746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 126ms/step - accuracy: 0.5080 - loss: 0.6917 - val_accuracy: 0.5017 - val_loss: 0.6973
Epoch 3/5
[1m746/746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 131ms/step - accuracy: 0.5162 - loss: 0.6767 - val_accuracy: 0.5111 - val_loss: 0.7060
Epoch 4/5
[1m746/746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 130ms/step - accuracy: 0.5392 - loss: 0.6498 - val_accuracy: 0.5097 - val_loss: 0.7368
Epoch 5/5
[1m746/746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 115ms/step - accuracy: 0.5446 - loss: 0.6414 - val_accuracy: 0.5097 - val_loss: 0.8045
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 53ms/step - accuracy: 0.5051 - loss: 0.8341
Test Accuracy: 0.51


In [28]:
# Check class distribution
print(data["label"].value_counts())


label
1    15037
0    14799
Name: count, dtype: int64


In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer

# Assuming 'model' is your trained LSTM model

# Save the trained LSTM model
model.save('sentiment_lstm_model.h5')  # This will save the model to a file
print("Model saved successfully!")




Model saved successfully!


In [30]:
import joblib

# Assuming 'tokenizer' is the tokenizer you used during training
joblib.dump(tokenizer, 'tokenizer.pkl')  # Save the tokenizer to a file
print("Tokenizer saved successfully!")


Tokenizer saved successfully!


In [33]:
pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import joblib
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import ipywidgets as widgets

# Load the trained LSTM model
model = load_model('sentiment_lstm_model.h5')

# Load the saved tokenizer
tokenizer = joblib.load('tokenizer.pkl')

# Define MAX_SEQUENCE_LENGTH (same as used during training)
MAX_SEQUENCE_LENGTH = 100  # Adjust to your training config

# Define the prediction function
def predict_sentiment(sentence):
    # Preprocess the sentence using the same tokenizer and padding as during training
    sequence = tokenizer.texts_to_sequences([sentence])  # Convert sentence to token sequence
    padded_sequence = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)  # Pad the sequence to the correct length

    # Predict sentiment using the LSTM model
    prediction = model.predict(padded_sequence)

    # If prediction > 0.5, consider it as Positive, else Negative
    if prediction > 0.5:
        return "Positive Review"
    else:
        return "Negative Review"

# Create an input field for the user
input_field = widgets.Text(
    value='',
    placeholder='Type a sentence here',
    description='Review:',
    disabled=False
)

# Define an output widget to display the result
output = widgets.Output()
button = widgets.Button(description="Predict")


# Function to handle prediction and display result
def on_button_click(b):
    with output:
        output.clear_output()
        if input_field.value.strip():
            prediction = predict_sentiment(input_field.value)
            print(f"Prediction: {prediction}")
        else:
            print("Please enter a review.")

button.on_click(on_button_click)

# Display the widgets
display(input_field,button, output)




Text(value='', description='Review:', placeholder='Type a sentence here')

Button(description='Predict', style=ButtonStyle())

Output()