<a href="https://colab.research.google.com/github/MissMercyKN/AI-Tools/blob/main/AmazonReviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import files
files.upload()  # Upload your kaggle.json here


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"arvinemoraa","key":"d02d3fed4197c8d193b3c7ae7f0dd62e"}'}

In [4]:
import os

os.makedirs("/root/.kaggle", exist_ok=True)
!mv kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json


In [5]:
!pip install kaggle
!kaggle datasets download -d bittlingmayer/amazonreviews -p amazon_data --unzip


Dataset URL: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
License(s): unknown
Downloading amazonreviews.zip to amazon_data
 99% 488M/493M [00:05<00:00, 100MB/s] 
100% 493M/493M [00:05<00:00, 90.0MB/s]


In [6]:
import os
print(os.listdir('amazon_data'))
# Expect a CSV like: reviews.csv, or similar


['test.ft.txt.bz2', 'train.ft.txt.bz2']


In [9]:
import pandas as pd
import os # Import os again for clarity within this cell

# Adjust filename as needed based on the output of the previous cell
# Assuming the output of os.listdir('amazon_data') was ['test.ft.txt.bz2', 'train.ft.txt.bz2']
# Replace 'test.ft.txt.bz2' with the actual filename from the output if different
file_list = os.listdir('amazon_data')
if file_list:
    # Assuming we want to read 'test.ft.txt.bz2' or the first file if it's different
    csv_filename = 'test.ft.txt.bz2' # Or file_list[0] if the first file is the target
    filepath = f"amazon_data/{csv_filename}"

    try:
        # Attempt to read the file assuming a space delimiter and two columns (label and text)
        # and handling potential quoting issues.
        # Using `engine='python'` can sometimes be more robust for tricky formats
        # but might be slower for large files. Let's try the C engine first.
        df = pd.read_csv(filepath,
                         sep=' ',        # Try space as a delimiter
                         header=None,      # No header row
                         names=['label', 'review_text'], # Assign column names
                         usecols=[0, 1],   # Only expect two columns
                         quotechar=None,   # No quote character
                         quoting=3,        # Disable quoting (csv.QUOTE_NONE)
                         on_bad_lines='warn' # Warn about bad lines instead of failing
                        )
        print("Columns:", df.columns)
        print(df.shape)
        print(df.head())

    except ParserError as e:
        print(f"ParserError: {e}")
        print(f"Could not parse {csv_filename} as a simple space-separated file with 2 columns.")
        # If the above fails, the file format might be more complex.
        # A more robust approach for label-text data might be to read line by line
        # and split the label from the text at the first space.

        print(f"Attempting to read {csv_filename} line by line...")

        data = []
        with open(filepath, 'rt', encoding='utf-8') as f:
            # Since it's bz2 compressed, pandas should handle decompression.
            # If reading line by line manually, you would need a bz2 opener:
            # import bz2
            # with bz2.open(filepath, 'rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= 1000: # Read only first 1000 lines to inspect
                    break
                parts = line.strip().split(' ', 1) # Split only at the first space
                if len(parts) == 2:
                    label, review_text = parts
                    data.append({'label': label, 'review_text': review_text})
                else:
                    # Handle lines that don't fit the expected format
                    print(f"Skipping line {i+1} due to unexpected format: {line.strip()}")

        if data:
            df = pd.DataFrame(data)
            print("\nSuccessfully parsed data line by line (sample):")
            print("Columns:", df.columns)
            print(df.shape)
            print(df.head())
        else:
             print("\nCould not parse any lines from the file.")


else:
    print("No files found in the 'amazon_data' directory.")

Columns: Index(['label', 'review_text'], dtype='object')
(400000, 2)
        label review_text
0  __label__2       Great
1  __label__2         One
2  __label__1   Batteries
3  __label__2       works
4  __label__2       Great


In [10]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Use more reviews (adjust to your dataset's actual column name, e.g., 'review_body', 'review', etc.)
# Corrected column name from 'review' to 'review_text'
sample_reviews = df['review_text'].dropna().astype(str).tolist()[:100]

# Rule-based keywords
positive = {"love", "great", "excellent", "amazing", "fantastic", "good", "best", "awesome", "perfect"}
negative = {"bad", "worst", "disappointed", "terrible", "awful", "poor", "hate", "broke", "useless"}

# Loop through reviews
for i, review in enumerate(sample_reviews, 1):
    doc = nlp(review)

    # Named Entity Recognition: focus on brand/product/company
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ("ORG", "PRODUCT")]

    # Rule-based sentiment analysis
    tokens = {token.text.lower() for token in doc}
    if tokens & positive:
        sentiment = "Positive"
    elif tokens & negative:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"

    # Display result
    print(f"\n🔹 Review #{i}")
    print(f"📝 Text: {review[:150]}{'...' if len(review) > 150 else ''}")
    print(f"🏷️ Entities: {entities if entities else 'None'}")
    print(f"📊 Sentiment: {sentiment}")


🔹 Review #1
📝 Text: Great
🏷️ Entities: None
📊 Sentiment: Positive

🔹 Review #2
📝 Text: One
🏷️ Entities: None
📊 Sentiment: Neutral

🔹 Review #3
📝 Text: Batteries
🏷️ Entities: None
📊 Sentiment: Neutral

🔹 Review #4
📝 Text: works
🏷️ Entities: None
📊 Sentiment: Neutral

🔹 Review #5
📝 Text: Great
🏷️ Entities: None
📊 Sentiment: Positive

🔹 Review #6
📝 Text: DVD
🏷️ Entities: None
📊 Sentiment: Neutral

🔹 Review #7
📝 Text: Incorrect
🏷️ Entities: None
📊 Sentiment: Neutral

🔹 Review #8
📝 Text: DVD
🏷️ Entities: None
📊 Sentiment: Neutral

🔹 Review #9
📝 Text: Unique
🏷️ Entities: None
📊 Sentiment: Neutral

🔹 Review #10
📝 Text: Not
🏷️ Entities: None
📊 Sentiment: Neutral

🔹 Review #11
📝 Text: Great
🏷️ Entities: None
📊 Sentiment: Positive

🔹 Review #12
📝 Text: Not!:
🏷️ Entities: None
📊 Sentiment: Neutral

🔹 Review #13
📝 Text: A
🏷️ Entities: None
📊 Sentiment: Neutral

🔹 Review #14
📝 Text: TRULY
🏷️ Entities: [('TRULY', 'ORG')]
📊 Sentiment: Neutral

🔹 Review #15
📝 Text: didn't
🏷️ Entities: None
📊 Sentimen