<a href="https://colab.research.google.com/github/Mmabatho/AI-For-Software-Engineeering-Week-3/blob/brian/Amazon_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mounting Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Creating path

In [2]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
bittlingmayer_amazonreviews_path = kagglehub.dataset_download('bittlingmayer/amazonreviews')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/bittlingmayer/amazonreviews?dataset_version_number=7...


100%|██████████| 493M/493M [00:05<00:00, 98.3MB/s]

Extracting files...





Data source import complete.


In [6]:
# Checking the path
bittlingmayer_amazonreviews_path

'/root/.cache/kagglehub/datasets/bittlingmayer/amazonreviews/versions/7'

In [7]:
# Show files in the path
print(os.listdir(bittlingmayer_amazonreviews_path))

['train.ft.txt.bz2', 'test.ft.txt.bz2']


### Loading necessary libraries

In [16]:
import spacy
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import os
import bz2

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


### Loading file from the Drive

In [9]:
train_path = '/content/drive/MyDrive/amazonreviews_data/train.ft.txt.bz2'
test_path = '/content/drive/MyDrive/amazonreviews_data/test.ft.txt.bz2'

def load_data(file_path):
    texts = []
    labels = []
    with bz2.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # if the format is __label__X followed by the review text
            label = int(line[9]) - 1  # Adjust label to be 0 or 1 (for __label__1 and __label__2)
            text = line[11:].strip()
            texts.append(text)
            labels.append(label)
    return pd.DataFrame({'text': texts, 'label': labels})

In [10]:
train_df = load_data(train_path)
test_df = load_data(test_path)

print("Train DataFrame head:")
print(train_df.head())
print("\nTest DataFrame head:")
print(test_df.head())

Train DataFrame head:
                                                text  label
0  Stuning even for the non-gamer: This sound tra...      1
1  The best soundtrack ever to anything.: I'm rea...      1
2  Amazing!: This soundtrack is my favorite music...      1
3  Excellent Soundtrack: I truly like this soundt...      1
4  Remember, Pull Your Jaw Off The Floor After He...      1

Test DataFrame head:
                                                text  label
0  Great CD: My lovely Pat has one of the GREAT v...      1
1  One of the best game music soundtracks - for a...      1
2  Batteries died within a year ...: I bought thi...      0
3  works fine, but Maha Energy is better: Check o...      1
4  Great for the non-audiophile: Reviewed quite a...      1


In [12]:
def preprocess_text(text):
    """
    Preprocesses the input text by converting to lowercase, removing punctuation and digits,
    splitting into words, removing stop words, and applying lemmatization.

    Args:
        text (str): The input text to be preprocessed.

    Returns:
        str: The preprocessed text.
    """
    # Convert the text to lowercase
    text = text.lower()

    # Remove punctuation and digits
    text = re.sub(r'[^\w\s]', '', text).strip()   # Remove punctuation
    text = re.sub(r'\d+', '', text).strip()        # Remove digits

    # Split the text into words
    words = text.split()

    # Remove stop words
    stop_words = set(stopwords.words('english'))

    # Remove both general stop words and false-meaning words
    words = [word for word in words if word not in stop_words]

    # Apply lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

In [17]:
# Cleaning the text by applying the preprocess_text function to each review in the 'review' column
train_df['text'] = train_df['text'].apply(preprocess_text)  # Clean the text in the train DataFrame
train_df.head()

Unnamed: 0,text,label
0,stuning even nongamer sound track beautiful pa...,1
1,best soundtrack ever anything im reading lot r...,1
2,amazing soundtrack favorite music time hand in...,1
3,excellent soundtrack truly like soundtrack enj...,1
4,remember pull jaw floor hearing youve played g...,1


### Named Entity Recognition (NER)

In [None]:
# Load the English spaCy model
nlp = spacy.load("en_core_web_sm")

products = []
brands = []

# Use spaCy's faster pipe method
for doc in nlp.pipe(train_df['text'], batch_size=50):
    product_entities = []
    brand_entities = []

    for ent in doc.ents:
        if ent.label_ in ["PRODUCT", "NORP", "WORK_OF_ART"]:
            product_entities.append(ent.text)
        if ent.label_ == "ORG":
            brand_entities.append(ent.text)
            product_entities.append(ent.text)  # ORG treated as both product and brand if desired

    products.append(", ".join(product_entities))
    brands.append(", ".join(brand_entities))

In [None]:
# Create the new DataFrame
product_brand_df = pd.DataFrame({'products': products, 'brands': brands})

# Display the head of the new DataFrame
print(product_brand_df.head())