#OPTIMIZING THE ENTIRE PROCESS



In [None]:
import csv
import gzip
import json
import logging
import re

import numpy as np
import pandas as pd
import torch
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Configure logging
logging.basicConfig(filename="reviews.log", level=logging.INFO)

# File paths
input_file = "/content/drive/MyDrive/Cell_Phones_and_Accessories.jsonl.gz"
output_file = "raw_reviews.csv"

# CSV headers
csv_headers = ["parent_asin", "rating", "text"]

# Process JSONL file and write to CSV
with gzip.open(input_file, "rt", encoding='utf-8') as f, open(output_file, "w", newline="", encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
    writer.writeheader()

    for i, line in enumerate(f):
        try:
            # Parse JSON line
            review = json.loads(line.strip())

            # Extract relevant fields
            parent_asin = review.get("parent_asin", "")
            rating = review.get("rating", 0)
            text = review.get("text", "")

            # Write to CSV
            writer.writerow({
                "parent_asin": parent_asin,
                "rating": rating,
                "text": text
            })

        except json.JSONDecodeError as e:
            logging.error(f"Line {i}: JSON parsing error - {str(e)}")
            continue
        except Exception as e:
            logging.error(f"Line {i}: Error - {str(e)}")
            continue

print(f"Saved to {output_file}")

# Copy the output CSV to Google Drive
!cp raw_reviews.csv /content/drive/MyDrive/raw_reviews.csv
print("Copied to /content/drive/MyDrive/raw_reviews.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved to raw_reviews.csv
Copied to /content/drive/MyDrive/raw_reviews.csv


#Metadata

In [None]:
# Set up logging
logging.basicConfig(filename="metadata.log", level=logging.INFO)

# File paths
metadata_file = "/content/drive/MyDrive/meta_Cell_Phones_and_Accessories.jsonl.gz"
output_file = "metadata1.csv"

# Define CSV headers
csv_headers = ["parent_asin", "title", "price", "average_rating", "rating_number"]

# Price threshold to filter smartphones
PRICE_THRESHOLD = 100.0

# Process metadata
with gzip.open(metadata_file, "rt") as f, open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
    writer.writeheader()
    for i, line in enumerate(f):
        try:
            metadata = json.loads(line.strip())

            # Get and clean the raw price
            raw_price = str(metadata.get("price", "")).strip()
            match = re.search(r"[\d,.]+", raw_price)
            if match:
                try:
                    price = float(match.group(0).replace(",", ""))
                except ValueError:
                    price = 0.0
            else:
                price = 0.0

            # Filter by price
            if price >= PRICE_THRESHOLD:
                writer.writerow({
                    "parent_asin": metadata.get("parent_asin", ""),
                    "title": metadata.get("title", ""),
                    "price": price,
                    "average_rating": metadata.get("average_rating", 0.0),
                    "rating_number": metadata.get("rating_number", 0)
                })
        except Exception as e:
            logging.error(f"Metadata line {i}: Error - {str(e)}")
            continue

print(f"Saved smartphones >= ${PRICE_THRESHOLD} to {output_file}")

# Save to Google Drive
!cp metadata1.csv /content/drive/MyDrive/metadata.csv
print("Copied to /content/drive/MyDrive/metadata.csv")


Saved smartphones >= $100.0 to metadata1.csv
Copied to /content/drive/MyDrive/metadata.csv


FileNotFoundError: [Errno 2] No such file or directory: 'metadata.csv'

#Merging Reviews With Metadata

In [None]:
# File paths
reviews_output_file = "/content/drive/MyDrive/raw_reviews.csv"
metadata_file = "/content/drive/MyDrive/metadata.csv"
merged_output_file = "merged_reviews_metadata.csv"

# Merge raw_reviews.csv with metadata.csv
reviews_df = pd.read_csv(reviews_output_file)
metadata_df = pd.read_csv(metadata_file)
merged_df = pd.merge(reviews_df, metadata_df, on="parent_asin", how="inner")
merged_df.to_csv(merged_output_file, index=False)

# Copy merged CSV to Google Drive
!cp {merged_output_file} /content/drive/MyDrive/{merged_output_file}

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Preprocessing and Sentiment Analysis (TextBlob)


In [None]:
# Install dependencies
!pip install textblob contractions nltk
import contractions

# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# File paths
input_file = "/content/drive/MyDrive/merged_reviews_metadata.csv"
output_file = "dss_data_smartphones_main.csv"

# Define CSV headers for output
csv_headers = [
    "parent_asin", "avg_rating", "review_count",
    "camera_sentiment", "camera_mentions",
    "battery_sentiment", "battery_mentions",
    "display_sentiment", "display_mentions",
    "performance_sentiment", "performance_mentions",
    "build_sentiment", "build_mentions",
    "storage_sentiment", "storage_mentions"
]

# Define feature synonyms
feature_synonyms = {
    "camera": ["photo", "picture", "lens", "photography", "image", "shot", "capture"],
    "battery": ["power", "charge", "life", "juice", "lasts", "drain"],
    "display": ["screen", "touchscreen", "resolution", "panel", "visual"],
    "performance": ["speed", "fast", "lag", "processor", "quick"],
    "build": ["design", "material", "durability", "quality", "sturdy", "strong"],
    "storage": ["memory", "space", "capacity", "gb", "expandable"]
}

# Preprocessing function
# The stop-word set and lemmatizer are built on first use and then reused, instead of
# being rebuilt for every review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop_words, lemmatizer) pair, creating it on the first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stop_words"] = set(stopwords.words('english'))
        _PREPROCESS_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES["stop_words"], _PREPROCESS_RESOURCES["lemmatizer"]


def preprocess_text(text):
    """Normalise one review for keyword matching and sentiment scoring.

    Steps: expand contractions, lowercase, strip URLs, replace every character
    that is not a-z or whitespace with a space, collapse whitespace, drop English
    stop words (feature names and their synonyms are always kept), lemmatize.

    Args:
        text: Raw review text. Anything that is not a non-blank string
            (None, NaN, numbers, "") yields "".

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    text = contractions.fix(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    stop_words, lemmatizer = _get_preprocess_resources()

    # Feature vocabulary is exempt from stop-word removal so mentions are never lost
    feature_words = set(feature_synonyms)
    for synonyms in feature_synonyms.values():
        feature_words.update(synonyms)

    tokens = [word for word in text.split() if word not in stop_words or word in feature_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

# Sentiment analysis function
def feature_sentiment(texts, feature):
    """Average TextBlob polarity over the texts that mention ``feature``.

    A text counts as a mention when it contains, as a case-insensitive substring,
    the feature name or any synonym listed for it in ``feature_synonyms``.

    Args:
        texts: Iterable of review strings.
        feature: Feature name; also used as a key into ``feature_synonyms``.

    Returns:
        ``(mean_polarity, mention_count)``; ``(0, 0)`` when nothing matches.
    """
    keywords = [feature] + feature_synonyms.get(feature, [])

    polarities = []
    for candidate in texts:
        lowered = candidate.lower()
        if any(keyword.lower() in lowered for keyword in keywords):
            polarities.append(TextBlob(candidate).sentiment.polarity)

    if not polarities:
        return (0, 0)
    return (sum(polarities) / len(polarities), len(polarities))

# Group every review's rating and cleaned text under its product id
review_summaries = {}
df = pd.read_csv(input_file)
for _, row in df.iterrows():
    product_id = row["parent_asin"]
    review_rating = float(row.get("rating", 0))
    cleaned_text = preprocess_text(row.get("text", ""))

    # First review of a product creates its bucket; later ones append to it
    bucket = review_summaries.setdefault(product_id, {"ratings": [], "texts": []})
    bucket["ratings"].append(review_rating)
    bucket["texts"].append(cleaned_text)

# One output row per product: rating stats plus a sentiment/mentions pair per feature
with open(output_file, "w", newline="", encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
    writer.writeheader()
    for parent_asin, summary in review_summaries.items():
        # The aggregates are also stored back on the summary dict
        summary["avg_rating"] = sum(summary["ratings"]) / len(summary["ratings"])
        summary["review_count"] = len(summary["ratings"])

        product_row = {
            "parent_asin": parent_asin,
            "avg_rating": summary["avg_rating"],
            "review_count": summary["review_count"],
        }
        # Same feature order as the csv_headers columns
        for feature_name in ("camera", "battery", "display", "performance", "build", "storage"):
            sentiment, mentions = feature_sentiment(summary["texts"], feature_name)
            product_row[f"{feature_name}_sentiment"] = sentiment
            product_row[f"{feature_name}_mentions"] = mentions

        writer.writerow(product_row)

print(f"Saved to {output_file}")

# Save to Google Drive
!cp dss_data_smartphones_main.csv.csv /content/drive/MyDrive/dss_data_smartphones_main.csv
print("Copied to /content/drive/MyDrive/dss_data_smartphones_main.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Saved to dss_data_smartphones_main.csv
cp: cannot stat 'dss_data_smartphones_main.csv.csv': No such file or directory
Copied to /content/drive/MyDrive/dss_data_smartphones_main.csv


# Sentiment Analysis (RoBERTa)

In [None]:
# Inputs on Drive: the merged review/metadata rows and the per-product summary CSV
# that the TextBlob section copies there
reviews_file = "/content/drive/MyDrive/merged_reviews_metadata.csv"
summaries_metadata_file = "/content/drive/MyDrive/dss_data_smartphones_main.csv"

# Presumably the destination for the RoBERTa-augmented summaries - TODO confirm,
# nothing in the visible cells writes to it
output_metadata_file = "/content/drive/MyDrive/dss_data_smartphones_main_roberta.csv"


Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

2025-05-04 19:58:37,388 - BERTopic - Embedding - Transforming documents to embeddings.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

2025-05-04 20:00:01,758 - BERTopic - Embedding - Completed ✓
2025-05-04 20:00:01,759 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-04 20:01:58,141 - BERTopic - Dimensionality - Completed ✓
2025-05-04 20:01:58,145 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-04 20:55:07,072 - BERTopic - Cluster - Completed ✓
2025-05-04 20:55:07,106 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-04 20:55:10,607 - BERTopic - Representation - Completed ✓


Saved topic-enhanced CSV to /content/drive/MyDrive/dss_data_smartphones_bert.csv
Saved BERTopic model to /content/drive/MyDrive/bertopic_model1


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: content/drive/MyDrive/bertopic_model1 (deflated 14%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load the RoBERTa model and tokenizer.
# The transformers classes are imported here because the calls below need them on a
# fresh kernel.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Single source of truth for the checkpoint so tokenizer and model cannot drift apart
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
# Same feature vocabulary as the TextBlob section: feature name -> related words
feature_synonyms = dict(
    camera=["photo", "picture", "lens", "photography", "image", "shot", "capture"],
    battery=["power", "charge", "life", "juice", "lasts", "drain"],
    display=["screen", "touchscreen", "resolution", "panel", "visual"],
    performance=["speed", "fast", "lag", "processor", "quick"],
    build=["design", "material", "durability", "quality", "sturdy", "strong"],
    storage=["memory", "space", "capacity", "gb", "expandable"],
)

In [None]:
def get_token_length(text):
    """Return how many token ids ``tokenizer`` produces for ``text``, special tokens included."""
    return len(tokenizer.encode(text, add_special_tokens=True))

def predict_sentiment(text, max_tokens=512):
    """Score ``text`` with the RoBERTa sentiment model.

    Args:
        text: The sentence or passage to score.
        max_tokens: Longest encoded length (special tokens included) that is
            scored. Longer inputs are skipped rather than truncated.

    Returns:
        ``None`` when the encoded text exceeds ``max_tokens``; otherwise the
        probability-weighted score ``sum(p_i * label_i)`` with labels
        negative -> -1, neutral -> 0, positive -> 1.
    """
    # Tokenize once and reuse the encoding for both the length check and the forward
    # pass. No truncation: over-long inputs are rejected below instead.
    inputs = tokenizer(text, return_tensors="pt", truncation=False)
    if inputs["input_ids"].shape[1] > max_tokens:
        return None

    # Inference only, so no gradients are needed
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=1).numpy()[0]

    # Convert to a sentiment score: negative (0) -> -1, neutral (1) -> 0, positive (2) -> 1
    labels = np.array([-1, 0, 1])
    return np.sum(probs * labels)

In [None]:
def extract_feature_sentiment(review_text, feature_keywords):
    """Mean RoBERTa sentiment of the sentences in a review that mention a feature.

    Args:
        review_text: Full review text; it is split into sentences with ``sent_tokenize``.
        feature_keywords: Lowercase keywords; a sentence is relevant when its
            lowercased form contains any of them as a substring.

    Returns:
        The mean of ``predict_sentiment`` over the relevant sentences, ignoring
        sentences it skipped (returned ``None``). Returns 0 when no sentence is
        relevant or every relevant sentence was skipped.
    """
    scores = []
    for sentence in sent_tokenize(review_text):
        lowered = sentence.lower()
        if not any(keyword in lowered for keyword in feature_keywords):
            continue
        score = predict_sentiment(sentence)
        # None means the sentence was over the token limit
        if score is not None:
            scores.append(score)

    return np.mean(scores) if scores else 0

In [None]:
# Read the merged reviews and coerce "text" to plain strings (missing reviews become "")
reviews_df = pd.read_csv(reviews_file).assign(
    text=lambda frame: frame["text"].fillna("").astype(str)
)

In [None]:
# Keyword lists are derived from feature_synonyms. The previous loop iterated over
# `features`, which is not defined before this cell and is bound to a plain list by
# the plotting section, so `.items()` could not work. As in the TextBlob
# feature_sentiment(), the feature name itself counts as a keyword.
feature_keywords = {
    feature: [feature] + synonyms for feature, synonyms in feature_synonyms.items()
}

# Step 1: Extract sentiment for each feature per review
for feature, keywords in feature_keywords.items():
    print(f"Processing sentiment for {feature}...")
    reviews_df[f"{feature}_sentiment_roberta"] = reviews_df["text"].apply(
        # kw=keywords binds the current list to this lambda
        lambda review_text, kw=keywords: extract_feature_sentiment(review_text, kw)
    )

# Step 2: Aggregate sentiment scores per product (parent_asin)
# NOTE(review): reviews that never mention a feature contribute a 0 to this mean,
# whereas the TextBlob path averages over mentioning reviews only - confirm intended.
sentiment_columns = [f"{feature}_sentiment_roberta" for feature in feature_keywords]
product_sentiments = reviews_df.groupby("parent_asin")[sentiment_columns].mean().reset_index()

# Step 3: Calculate mentions per feature
# Counts are attached by parent_asin rather than by row position, so they cannot be
# misaligned with the sentiment rows.
lowercase_text = reviews_df["text"].str.lower()
for feature, keywords in feature_keywords.items():
    mentions_flag = lowercase_text.apply(
        lambda review, kw=keywords: any(keyword in review for keyword in kw)
    )
    mentions_per_product = mentions_flag.groupby(reviews_df["parent_asin"]).sum()
    product_sentiments[f"{feature}_mentions_roberta"] = (
        product_sentiments["parent_asin"].map(mentions_per_product).fillna(0).astype(int)
    )

# NOTE(review): product_sentiments is not written to disk in the visible cells, while
# the plotting section reads "dss_data_smartphones_main_roberta1.csv" - confirm where
# that file is produced.

#Plotting the Graphs for Analysis

In [None]:
# Plots for analysing the TextBlob sentiment columns.
# pyplot and seaborn are imported here because this section needs them on a fresh kernel.
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
csv_file = "/content/drive/MyDrive/dss_data_smartphones_main_roberta1.csv"
df = pd.read_csv(csv_file).fillna(0)
# The five most-reviewed products are used for the bar charts
top_products = df.nlargest(5, 'review_count')
features = ['camera', 'battery', 'display', 'performance', 'build', 'storage']

# 1-2. Bar charts for the top 5 products: sentiment per feature, then mentions per feature.
# Each spec is (column suffix, value/axis label, title, output path).
bar_chart_specs = [
    ('_sentiment', 'Sentiment', 'Sentiment per Feature (Top 5 Products)',
     '/content/drive/MyDrive/sentiment_per_feature.png'),
    ('_mentions', 'Mentions', 'Mentions per Feature (Top 5 Products)',
     '/content/drive/MyDrive/mentions_per_feature.png'),
]
for suffix, value_name, title, out_path in bar_chart_specs:
    plt.figure(figsize=(10, 5))
    value_cols = [f'{feat}{suffix}' for feat in features]
    melted = top_products.melt(id_vars=['parent_asin'], value_vars=value_cols,
                               var_name='Feature', value_name=value_name)
    # Strip the suffix so the legend shows bare feature names
    melted['Feature'] = melted['Feature'].str.replace(suffix, '')
    sns.barplot(data=melted, x='parent_asin', y=value_name, hue='Feature')
    plt.title(title)
    plt.xlabel('Product')
    plt.ylabel(value_name)
    plt.xticks(rotation=45)
    plt.legend(title='Feature', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(out_path)
    plt.show()

# 3. Scatter plots: sentiment vs. average rating, one facet per feature
sentiment_columns = [f'{feat}_sentiment' for feat in features]

# Melt the DataFrame to long format
df_melted = df[['avg_rating', 'review_count'] + sentiment_columns].melt(
    id_vars=['avg_rating', 'review_count'],
    value_vars=sentiment_columns,
    var_name='Sentiment Feature',
    value_name='Sentiment Score'
)
g = sns.FacetGrid(df_melted, col='Sentiment Feature', col_wrap=3, height=4, sharey=True)
g.map_dataframe(
    sns.scatterplot,
    x='avg_rating',
    y='Sentiment Score',
    size='review_count',  # marker size scales with the number of reviews
    sizes=(50, 500),
    alpha=0.6
)
g.set_titles('{col_name}')
g.set_axis_labels('Average Rating', 'Sentiment Score')
g.fig.suptitle('Sentiment_TextBlob vs. Average Rating Across Features', y=1.05)
g.tight_layout()
g.fig.savefig('/content/drive/MyDrive/all_sentiments_vs_rating_facet_scatter.png')
plt.show()

In [None]:
# Plots for the RoBERTa sentiment columns.
# Relies on df, top_products and features from the TextBlob plotting cell.
# Every figure is saved under a "_roberta" file name: the previous paths were identical
# to the TextBlob ones, so running this cell overwrote the TextBlob figures on Drive.

# 1-2. Bar charts for the top 5 products: sentiment per feature, then mentions per feature.
# Each spec is (column suffix, value/axis label, title, output path).
bar_chart_specs = [
    ('_sentiment_roberta', 'Sentiment', 'Sentiment per Feature (Top 5 Products)',
     '/content/drive/MyDrive/sentiment_per_feature_roberta.png'),
    ('_mentions_roberta', 'Mentions', 'Mentions per Feature (Top 5 Products)',
     '/content/drive/MyDrive/mentions_per_feature_roberta.png'),
]
for suffix, value_name, title, out_path in bar_chart_specs:
    plt.figure(figsize=(10, 5))
    value_cols = [f'{feat}{suffix}' for feat in features]
    melted = top_products.melt(id_vars=['parent_asin'], value_vars=value_cols,
                               var_name='Feature', value_name=value_name)
    # Strip the suffix so the legend shows bare feature names
    melted['Feature'] = melted['Feature'].str.replace(suffix, '')
    sns.barplot(data=melted, x='parent_asin', y=value_name, hue='Feature')
    plt.title(title)
    plt.xlabel('Product')
    plt.ylabel(value_name)
    plt.xticks(rotation=45)
    plt.legend(title='Feature', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(out_path)
    plt.show()

# 3. Scatter plots: sentiment vs. average rating, one facet per feature
sentiment_columns = [f'{feat}_sentiment_roberta' for feat in features]

# Melt the DataFrame to long format
df_melted = df[['avg_rating', 'review_count'] + sentiment_columns].melt(
    id_vars=['avg_rating', 'review_count'],
    value_vars=sentiment_columns,
    var_name='Sentiment Feature',
    value_name='Sentiment Score'
)

g = sns.FacetGrid(df_melted, col='Sentiment Feature', col_wrap=3, height=4, sharey=True)
g.map_dataframe(
    sns.scatterplot,
    x='avg_rating',
    y='Sentiment Score',
    size='review_count',  # marker size scales with the number of reviews
    sizes=(50, 500),
    alpha=0.6
)
g.set_titles('{col_name}')
g.set_axis_labels('Average Rating', 'Sentiment Score')
# Title names the model actually plotted here (it previously said "Sentiment_TextBlob")
g.fig.suptitle('Sentiment_RoBERTa vs. Average Rating Across Features', y=1.05)
g.tight_layout()
g.fig.savefig('/content/drive/MyDrive/all_sentiments_vs_rating_facet_scatter_roberta.png')
plt.show()