<a href="https://colab.research.google.com/github/Mmabatho/AI-For-Software-Engineeering-Week-3/blob/brian/Amazon_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Downloading Amazon Review Data from Kaggle

In [1]:
# Kaggle data-source import cell: run this first.
# It fetches the 'bittlingmayer/amazonreviews' dataset through kagglehub and
# stores the value returned by dataset_download in
# `bittlingmayer_amazonreviews_path`. Later cells read that variable to locate
# the train/test files, so keep this cell unless the path is defined another way.
# NOTE: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing.
import kagglehub
bittlingmayer_amazonreviews_path = kagglehub.dataset_download('bittlingmayer/amazonreviews')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/bittlingmayer/amazonreviews?dataset_version_number=7...


100%|██████████| 493M/493M [00:07<00:00, 69.2MB/s]

Extracting files...





Data source import complete.


In [8]:
# Display the value returned by kagglehub.dataset_download so the dataset
# location can be confirmed before the files are read.
bittlingmayer_amazonreviews_path

'/root/.cache/kagglehub/datasets/bittlingmayer/amazonreviews/versions/7'

### Importing necessary libraries

In [9]:
import spacy
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import os
import bz2

In [7]:
# List the entries of the downloaded dataset directory; the file names shown
# here are the ones joined onto the path when the data is loaded.
print(os.listdir(bittlingmayer_amazonreviews_path))

['test.ft.txt.bz2', 'train.ft.txt.bz2']


### Loading the datasets into DataFrames

In [10]:
# Extracting data from the file paths
def extractData(filename):
    """Load a bz2-compressed, fastText-style labelled text file into a DataFrame.

    Every line is expected to look like ``__label__<digits> <text>``: the
    first space separates the label token from the review text.

    Parameters
    ----------
    filename : str or path-like
        Path to the ``.bz2`` file; it is read as UTF-8 text.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``label`` (integer parsed from the label token) and
        ``text`` (everything after the first space).

    Raises
    ------
    ValueError
        If a line's first token contains no ``__label__<digits>`` pattern.
        The message names the offending line so the bad record can be found.

    Notes
    -----
    Lines that do not contain a space after stripping (blank lines included)
    are skipped, as before.
    """
    label_pattern = re.compile(r'__label__(\d+)')
    labels = []
    texts = []
    with bz2.open(filename, 'rt', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            parts = line.strip().split(' ', 1)
            if len(parts) != 2:
                continue
            raw_label, text = parts
            # Parse the label while reading: this avoids keeping every raw
            # label string in memory and running a second regex pass over the
            # whole column, and it lets a malformed label be reported with
            # its line number instead of surfacing later as a NaN-cast error.
            match = label_pattern.search(raw_label)
            if match is None:
                raise ValueError(
                    f"{filename}: line {line_number}: "
                    f"expected a '__label__<digits>' token, got {raw_label!r}"
                )
            labels.append(int(match.group(1)))
            texts.append(text)
    df = pd.DataFrame({'label': labels, 'text': texts}, columns=['label', 'text'])
    # Explicit cast keeps the label column integer-typed even when the file
    # yields no rows.
    df['label'] = df['label'].astype(int)
    return df

# Build both archive paths from the downloaded dataset directory, then parse
# each archive into its own DataFrame.
train_file, test_file = (
    os.path.join(bittlingmayer_amazonreviews_path, archive_name)
    for archive_name in ('train.ft.txt.bz2', 'test.ft.txt.bz2')
)
train_df, test_df = extractData(train_file), extractData(test_file)

# Preview the first rows of each split.
print("Train DataFrame head:", train_df.head(), sep="\n")
print("\nTest DataFrame head:", test_df.head(), sep="\n")

Train DataFrame head:
   label                                               text
0      2  Stuning even for the non-gamer: This sound tra...
1      2  The best soundtrack ever to anything.: I'm rea...
2      2  Amazing!: This soundtrack is my favorite music...
3      2  Excellent Soundtrack: I truly like this soundt...
4      2  Remember, Pull Your Jaw Off The Floor After He...

Test DataFrame head:
   label                                               text
0      2  Great CD: My lovely Pat has one of the GREAT v...
1      2  One of the best game music soundtracks - for a...
2      1  Batteries died within a year ...: I bought thi...
3      2  works fine, but Maha Energy is better: Check o...
4      2  Great for the non-audiophile: Reviewed quite a...


### Data Preliminaries

In [15]:
# Summarise the training DataFrame (row count, column dtypes, memory usage).
# Only train_df is inspected in this cell.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   label   int64 
 1   text    object
dtypes: int64(1), object(1)
memory usage: 54.9+ MB


In [11]:
# Class balance check: number of training reviews for each label value.
train_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,1800000
1,1800000


In [12]:
# Class balance check: number of test reviews for each label value.
test_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,200000
1,200000


In [14]:
# Count rows of the training DataFrame that repeat an earlier row in every
# column (label and text). Only train_df is checked here, not test_df.
train_df.duplicated().sum()

np.int64(0)

### Named Entity Recognition (NER)

In [16]:
# Load the "en_core_web_sm" spaCy pipeline once at module level;
# extract_entities below reuses this `nlp` object for every review.
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    """Collect the PRODUCT and ORG entity mentions found in ``text``.

    The text is run through the module-level ``nlp`` pipeline. The result maps
    each entity label that occurred ("PRODUCT" and/or "ORG") to the list of
    matched strings in document order; labels with no matches are absent, so
    a text without such entities yields an empty dict.
    """
    wanted_labels = ("PRODUCT", "ORG")  # ORG often captures brand names
    grouped = {}
    for entity in nlp(text).ents:
        if entity.label_ not in wanted_labels:
            continue
        grouped.setdefault(entity.label_, []).append(entity.text)
    return grouped

# Run NER on a random subset only: pushing every training review through the
# spaCy pipeline might be slow. The fixed random_state keeps the subset
# reproducible.
sample_size = 20000
train_df_sample = (
    train_df
    .sample(n=sample_size, random_state=42)
    # assign() returns a new frame, so the sampled rows are never modified in place.
    .assign(entities=lambda sampled: sampled['text'].apply(extract_entities))
)

print("\nSample DataFrame with extracted entities:")
print(train_df_sample.head())


Sample DataFrame with extracted entities:
         label                                               text  \
2079998      1  Expensive Junk: This product consists of a pie...   
1443106      1  Toast too dark: Even on the lowest setting, th...   
3463669      2  Excellent imagery...dumbed down story: I enjoy...   
2914699      1  Are we pretending everyone is married?: The au...   
1603231      1  Not worth your time: Might as well just use a ...   

                              entities  
2079998  {'ORG': ['Velcro', 'Amazon']}  
1443106         {'ORG': ['Cuisinart']}  
3463669                             {}  
2914699                             {}  
1603231                             {}  
