# **1. Data Cleaning & Pre-Processing**

**Import All Libraries**

In [None]:
import re
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer

**Read and Load the Dataset**

In [None]:
#Load the CSV Path
df = pd.read_csv('/content/merged_data.csv')

In [None]:
df.head()

Unnamed: 0,category,brand,model,review_date,main_review_title,review_text,review_stars
0,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 23 September 2024,Good quality,Very good and comfortable fit in the ears. It ...,5.0 out of 5 stars
1,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 20 September 2024,Waste of money,sound quality is bad really bad,1.0 out of 5 stars
2,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 21 September 2024,This is real or fake product I don’t no sound ...,,2.0 out of 5 stars
3,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 21 September 2024,Missing magnetic case,"Sound is amazing, noise cancellation working g...",4.0 out of 5 stars
4,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 30 September 2024,Good,Nice product,5.0 out of 5 stars


**Exploratory Data Analysis**

In [None]:
#Shape of the Data
df.shape

(3867, 7)

In [None]:
#Show all the Column names
df.columns

Index(['category', 'brand', 'model', 'review_date', 'main_review_title',
       'review_text', 'review_stars'],
      dtype='object')

In [None]:
#Total rows in the data
print('length of data is', len(df))

length of data is 3867


In [None]:
#Information of Data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3867 entries, 0 to 3866
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   category           3867 non-null   object
 1   brand              3867 non-null   object
 2   model              3867 non-null   object
 3   review_date        3867 non-null   object
 4   main_review_title  3759 non-null   object
 5   review_text        3790 non-null   object
 6   review_stars       3867 non-null   object
dtypes: object(7)
memory usage: 211.6+ KB


In [None]:
print('Count of columns in the data is:  ', len(df.columns))
print('Count of rows in the data is:  ', len(df))

Count of columns in the data is:   7
Count of rows in the data is:   3867


**Data Preprocessing**

**1. Lower case**

In [None]:
# df["main_review_title"] = df["main_review_title"].str.lower()
# df.head()

In [None]:
# df["review_text"] = df["review_text"].str.lower()
# df.sample(5)

In [None]:
# Lower-case both free-text columns. Uses the pandas .str accessor, so
# missing values stay missing instead of raising.
df = df.assign(
    main_review_title=df["main_review_title"].str.lower(),
    review_text=df["review_text"].str.lower()
)

In [None]:
df.head()

Unnamed: 0,category,brand,model,review_date,main_review_title,review_text,review_stars
0,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 23 September 2024,good quality,very good and comfortable fit in the ears. it ...,5.0 out of 5 stars
1,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 20 September 2024,waste of money,sound quality is bad really bad,1.0 out of 5 stars
2,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 21 September 2024,this is real or fake product i don’t no sound ...,,2.0 out of 5 stars
3,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 21 September 2024,missing magnetic case,"sound is amazing, noise cancellation working g...",4.0 out of 5 stars
4,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 30 September 2024,good,nice product,5.0 out of 5 stars


**2. Remove Punctuation**

In [None]:
import string
import re

In [None]:
# Function to clean text by removing punctuation but keeping numbers
def clean_text(text):
    """Return ``text`` lower-cased with every punctuation character removed.

    Letters, digits, underscores and whitespace are kept. Any other character
    is deleted outright (not replaced by a space), so "don't" becomes "dont"
    and "budget-friendly" becomes "budgetfriendly".

    Parameters
    ----------
    text : object
        Cell value to clean. Missing values (None / NaN / pd.NA) yield ''.
        Non-string values are converted with ``str()`` before cleaning, so a
        stray numeric cell no longer raises ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The cleaned, lower-cased text ('' for missing input).
    """
    if not isinstance(text, str):
        # Only non-strings can be missing; check before coercing so NaN does
        # not turn into the literal string "nan".
        if pd.isna(text):
            return ''
        text = str(text)
    # Remove punctuation but keep numbers
    return re.sub(r'[^\w\s]', '', text).lower()

In [None]:
# Apply the cleaning function to both columns
df = df.assign(
    main_review_title=df["main_review_title"].apply(clean_text),
    review_text=df["review_text"].apply(clean_text)
)

In [None]:
df.head()

Unnamed: 0,category,brand,model,review_date,main_review_title,review_text,review_stars
0,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 23 September 2024,good quality,very good and comfortable fit in the ears it d...,5.0 out of 5 stars
1,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 20 September 2024,waste of money,sound quality is bad really bad,1.0 out of 5 stars
2,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 21 September 2024,this is real or fake product i dont no sound q...,,2.0 out of 5 stars
3,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 21 September 2024,missing magnetic case,sound is amazing noise cancellation working go...,4.0 out of 5 stars
4,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 30 September 2024,good,nice product,5.0 out of 5 stars


In [None]:
# df.to_csv('cleaned_data.csv', index=False)

**3. Lemmatization**

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
# Function to lemmatize words
def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Every token is passed through the module-level ``lemmatizer`` and the
    results are re-joined with single spaces.
    """
    tokens = text.split()
    lemmas = map(lemmatizer.lemmatize, tokens)
    return " ".join(lemmas)

In [None]:
# Apply the cleaning and lemmatization functions to both columns
df = df.assign(
    main_review_title=df["main_review_title"].apply(clean_text).apply(lemmatize_words),
    review_text=df["review_text"].apply(clean_text).apply(lemmatize_words)
)

In [None]:
df.head()

Unnamed: 0,category,brand,model,review_date,main_review_title,review_text,review_stars
0,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 23 September 2024,good quality,very good and comfortable fit in the ear it do...,5.0 out of 5 stars
1,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 20 September 2024,waste of money,sound quality is bad really bad,1.0 out of 5 stars
2,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 21 September 2024,this is real or fake product i dont no sound q...,,2.0 out of 5 stars
3,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 21 September 2024,missing magnetic case,sound is amazing noise cancellation working go...,4.0 out of 5 stars
4,Airbuds,Apple,AirPods 4 Wireless Earbuds,Reviewed in India on 30 September 2024,good,nice product,5.0 out of 5 stars


**4. Spelling Correction**

In [None]:
# !pip install pyspellchecker

In [None]:
# from spellchecker import SpellChecker

In [None]:
# Initialize the spell checker
# spell = SpellChecker()

In [None]:
# # Function to correct spellings
# def correct_spellings(text):
#     if pd.isna(text) or text is None:
#         return [""]
#     corrected_text = []
#     misspelled_words = spell.unknown(text.split())
#     for word in text.split():
#         if word in misspelled_words:
#             corrected_text.append(spell.correction(word))
#         else:
#             corrected_text.append(word)
#     return corrected_text




In [None]:
# # Apply spelling correction to both columns
# df = df.assign(
#     main_review_title=df["main_review_title"]
#         .apply(correct_spellings),
#     review_text=df["review_text"]
#         .apply(correct_spellings)
# )

In [None]:
# To view the updated DataFrame
# df.head()

In [None]:
# # Convert lists into strings
# df['main_review_title'] = df['main_review_title'].apply(
#     lambda x: " ".join(str(i) for i in x) if isinstance(x, list) else ""
# )

In [None]:
# df['review_text'] = df['review_text'].apply(
#     lambda x: " ".join(str(i) for i in x) if isinstance(x, list) else ""
# )


In [None]:
# df.head()

In [None]:
#combine review title and review text
df['combined'] = df['main_review_title'] + ' ' + df['review_text']
df.sample(5)

Unnamed: 0,category,brand,model,review_date,main_review_title,review_text,review_stars,combined
13,Airbuds,Apple,Apple AirPods Pro,Reviewed in India on 21 July 2023,an immersive audio experience,a an avid music lover and tech enthusiast i re...,5.0 out of 5 stars,an immersive audio experience a an avid music ...
976,Airbuds,Samsung,Samsung Galaxy Buds2 Pro,Reviewed in India on 12 August 2024,best earphone in this segment,buy it dont think much it worth every penny ch...,5.0 out of 5 stars,best earphone in this segment buy it dont thin...
2562,Smartphones,Samsung,Samsung Galaxy S24 5G AI Smartphone,Reviewed in India on 10 April 2024,phone review,phone s24 is excellent in a much a being sleek...,4.0 out of 5 stars,phone review phone s24 is excellent in a much ...
603,Airbuds,One Plus,OnePlus Nord Buds 2,Reviewed in India on 27 September 2024,oneplus nord bud 2r a budgetfriendly audio gem,oneplus nord bud 2r review build quality excel...,5.0 out of 5 stars,oneplus nord bud 2r a budgetfriendly audio gem...
2293,Smartphones,Oneplus,Oneplus 12R,Reviewed in India on 8 August 2024,value for money,i got the oneplus 12r 16gb ram 256storage for ...,4.0 out of 5 stars,value for money i got the oneplus 12r 16gb ram...


In [None]:
#Save clean data
df.to_csv('cleaned_data.csv', index=False)

In [None]:
# #WORD CLOUD CODE

# from wordcloud import WordCloud
# from wordcloud import STOPWORDS

# # Wordcloud with positive tweets
# positive_tweets = df['tweet'][df["sentiment"] == 'POS']
# stop_words = ["https", "co", "RT"] + list(STOPWORDS)
# positive_wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white", stopwords = stop_words).generate(str(positive_tweets))
# plt.figure()
# plt.title("Positive Tweets - Wordcloud")
# plt.imshow(positive_wordcloud, interpolation="bilinear")
# plt.axis("off")
# plt.show()

# # Wordcloud with negative tweets
# negative_tweets = df['tweet'][df["sentiment"] == 'NEG']
# stop_words = ["https", "co", "RT"] + list(STOPWORDS)
# negative_wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white", stopwords = stop_words).generate(str(negative_tweets))
# plt.figure()
# plt.title("Negative Tweets - Wordcloud")
# plt.imshow(negative_wordcloud, interpolation="bilinear")
# plt.axis("off")
# plt.show()


In [None]:
# cleaned_df = df.copy()

In [None]:
# cleaned_df.head()

# **2. Sentiment Analysis**

**Hugging Face Models**

In [None]:
!pip install transformers



**Hugging Face API Key**

In [None]:
# Set your Hugging Face API key
# os.environ['HUGGINGFACE_API_KEY'] = 'your_huggingface_api_key_here'

In [None]:
import os

In [None]:
# Load the Hugging Face token without hardcoding it in the notebook.
# NOTE: a real token was previously committed in this cell; it should be
# revoked in the Hugging Face account settings and replaced with a new one.
from getpass import getpass

hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    # Prompt without echoing so the secret never lands in the cell output
    hf_token = getpass("Hugging Face token: ")
    os.environ['HF_TOKEN'] = hf_token

# Confirm the token is available without printing its value
print("HF_TOKEN set:", bool(hf_token))

[redacted]


**1. distilbert/distilbert-base-uncased-finetuned-sst-2-english**

In [None]:
# Use a pipeline
from transformers import pipeline

pipe = pipeline("text-classification", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



In [None]:
# df = pd.read_csv('/content/cleaned_data.csv')

In [None]:
# Run model 1 over every combined review and collect its label and score.
results = []      # raw pipeline output for each review
sentiments = []   # predicted label for each review
scores = []       # score of the predicted label for each review

# Iterate over the column itself instead of a hard-coded row count with manual
# counters and label-based df['combined'][i] lookups, so this works for any
# number of rows and any index. Missing text becomes '' so slicing cannot fail.
# The old `label == 'nan'` branch is dropped: it ran after the label had
# already been appended, so had it ever fired the lists would have gone out of
# step with the rows.
for review in df['combined'].fillna('').astype(str):
    # Keep only the first 512 *characters* (not tokens). This pipeline was
    # created without truncation, so also ask the tokenizer to truncate in
    # case those characters still exceed the model's token limit.
    result = pipe(review[:512], truncation=True)
    results.append(result)
    sentiments.append(result[0]['label'])
    scores.append(result[0]['score'])

# The lists are in row order, so positional assignment lines up with df
df['model_1_sentiment'] = sentiments
df['model_1_probability'] = scores

# Display the updated DataFrame
print(df.head())

  category  brand                       model  \
0  Airbuds  Apple  AirPods 4 Wireless Earbuds   
1  Airbuds  Apple  AirPods 4 Wireless Earbuds   
2  Airbuds  Apple  AirPods 4 Wireless Earbuds   
3  Airbuds  Apple  AirPods 4 Wireless Earbuds   
4  Airbuds  Apple  AirPods 4 Wireless Earbuds   

                              review_date  \
0  Reviewed in India on 23 September 2024   
1  Reviewed in India on 20 September 2024   
2  Reviewed in India on 21 September 2024   
3  Reviewed in India on 21 September 2024   
4  Reviewed in India on 30 September 2024   

                                   main_review_title  \
0                                       good quality   
1                                     waste of money   
2  this is real or fake product i dont no sound q...   
3                              missing magnetic case   
4                                               good   

                                         review_text        review_stars  \
0  very good and comf

**2. cardiffnlp/twitter-roberta-base-sentiment-latest**

In [None]:
from transformers import pipeline

pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest",truncation=True,max_length=512)

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



In [None]:
# Run model 2 over every combined review and collect its label and score.
results = []      # raw pipeline output for each review
sentiments = []   # predicted label for each review
scores = []       # score of the predicted label for each review

# Iterate over the column itself instead of a hard-coded row count with manual
# counters and label-based df['combined'][i] lookups, so this works for any
# number of rows and any index. Missing text becomes '' so the pipeline always
# receives a string.
# The old `label == 'nan'` branch is dropped: it ran after the label had
# already been appended, so had it ever fired the lists would have gone out of
# step with the rows.
for review in df['combined'].fillna('').astype(str):
    # The full review is passed in; this pipeline was created with
    # truncation=True and max_length=512, so the tokenizer caps the length.
    result = pipe(review)
    results.append(result)
    sentiments.append(result[0]['label'])
    scores.append(result[0]['score'])

# The lists are in row order, so positional assignment lines up with df
df['model_2_sentiment'] = sentiments
df['model_2_probability'] = scores

# Display the updated DataFrame
print(df.head())

  category  brand                       model  \
0  Airbuds  Apple  AirPods 4 Wireless Earbuds   
1  Airbuds  Apple  AirPods 4 Wireless Earbuds   
2  Airbuds  Apple  AirPods 4 Wireless Earbuds   
3  Airbuds  Apple  AirPods 4 Wireless Earbuds   
4  Airbuds  Apple  AirPods 4 Wireless Earbuds   

                              review_date  \
0  Reviewed in India on 23 September 2024   
1  Reviewed in India on 20 September 2024   
2  Reviewed in India on 21 September 2024   
3  Reviewed in India on 21 September 2024   
4  Reviewed in India on 30 September 2024   

                                   main_review_title  \
0                                       good quality   
1                                     waste of money   
2  this is real or fake product i dont no sound q...   
3                              missing magnetic case   
4                                               good   

                                         review_text        review_stars  \
0  very good and comf

**3. finiteautomata/bertweet-base-sentiment-analysis**

In [None]:
from transformers import pipeline

pipe = pipeline("text-classification", model="finiteautomata/bertweet-base-sentiment-analysis",truncation=True)

config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [None]:
# Run model 3 over every combined review and collect its label and score.
results = []      # raw pipeline output for each review
sentiments = []   # predicted label for each review
scores = []       # score of the predicted label for each review

# Iterate over the column itself instead of a hard-coded row count with manual
# counters and label-based df['combined'][i] lookups, so this works for any
# number of rows and any index. Missing text becomes '' so slicing cannot fail.
# The old `label == 'nan'` branch is dropped: it ran after the label had
# already been appended, so had it ever fired the lists would have gone out of
# step with the rows.
for review in df['combined'].fillna('').astype(str):
    # Keep only the first 512 *characters* (not tokens); the pipeline was
    # created with truncation=True, which handles the token limit.
    result = pipe(review[:512])
    results.append(result)
    sentiments.append(result[0]['label'])
    scores.append(result[0]['score'])

# The lists are in row order, so positional assignment lines up with df
df['model_3_sentiment'] = sentiments
df['model_3_probability'] = scores

# Display the updated DataFrame
print(df.head())

  category  brand                       model  \
0  Airbuds  Apple  AirPods 4 Wireless Earbuds   
1  Airbuds  Apple  AirPods 4 Wireless Earbuds   
2  Airbuds  Apple  AirPods 4 Wireless Earbuds   
3  Airbuds  Apple  AirPods 4 Wireless Earbuds   
4  Airbuds  Apple  AirPods 4 Wireless Earbuds   

                              review_date  \
0  Reviewed in India on 23 September 2024   
1  Reviewed in India on 20 September 2024   
2  Reviewed in India on 21 September 2024   
3  Reviewed in India on 21 September 2024   
4  Reviewed in India on 30 September 2024   

                                   main_review_title  \
0                                       good quality   
1                                     waste of money   
2  this is real or fake product i dont no sound q...   
3                              missing magnetic case   
4                                               good   

                                         review_text        review_stars  \
0  very good and comf

**4. mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis**

In [None]:
from transformers import pipeline

pipe = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",truncation=True)

config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



In [None]:
# Run model 4 over every combined review and collect its label and score.
results = []      # raw pipeline output for each review
sentiments = []   # predicted label for each review
scores = []       # score of the predicted label for each review

# Iterate over the column itself instead of a hard-coded row count with manual
# counters and label-based df['combined'][i] lookups, so this works for any
# number of rows and any index. Missing text becomes '' so slicing cannot fail.
# The old `label == 'nan'` branch is dropped: it ran after the label had
# already been appended, so had it ever fired the lists would have gone out of
# step with the rows.
for review in df['combined'].fillna('').astype(str):
    # Keep only the first 512 *characters* (not tokens); the pipeline was
    # created with truncation=True, which handles the token limit.
    result = pipe(review[:512])
    results.append(result)
    sentiments.append(result[0]['label'])
    scores.append(result[0]['score'])

# The lists are in row order, so positional assignment lines up with df
df['model_4_sentiment'] = sentiments
df['model_4_probability'] = scores

# Display the updated DataFrame
print(df.head())

  category  brand                       model  \
0  Airbuds  Apple  AirPods 4 Wireless Earbuds   
1  Airbuds  Apple  AirPods 4 Wireless Earbuds   
2  Airbuds  Apple  AirPods 4 Wireless Earbuds   
3  Airbuds  Apple  AirPods 4 Wireless Earbuds   
4  Airbuds  Apple  AirPods 4 Wireless Earbuds   

                              review_date  \
0  Reviewed in India on 23 September 2024   
1  Reviewed in India on 20 September 2024   
2  Reviewed in India on 21 September 2024   
3  Reviewed in India on 21 September 2024   
4  Reviewed in India on 30 September 2024   

                                   main_review_title  \
0                                       good quality   
1                                     waste of money   
2  this is real or fake product i dont no sound q...   
3                              missing magnetic case   
4                                               good   

                                         review_text        review_stars  \
0  very good and comf

**5. siebert/sentiment-roberta-large-english**

In [None]:
from transformers import pipeline

pipe = pipeline("text-classification", model="siebert/sentiment-roberta-large-english",truncation=True)

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



In [None]:
# Run model 5 over every combined review and collect its label and score.
results = []      # raw pipeline output for each review
sentiments = []   # predicted label for each review
scores = []       # score of the predicted label for each review

# Iterate over the column itself instead of a hard-coded row count with manual
# counters and label-based df['combined'][i] lookups, so this works for any
# number of rows and any index. Missing text becomes '' so slicing cannot fail.
# The old `label == 'nan'` branch is dropped: it ran after the label had
# already been appended, so had it ever fired the lists would have gone out of
# step with the rows.
for review in df['combined'].fillna('').astype(str):
    # Keep only the first 512 *characters* (not tokens); the pipeline was
    # created with truncation=True, which handles the token limit.
    result = pipe(review[:512])
    results.append(result)
    sentiments.append(result[0]['label'])
    scores.append(result[0]['score'])

# The lists are in row order, so positional assignment lines up with df
df['model_5_sentiment'] = sentiments
df['model_5_probability'] = scores

# Display the updated DataFrame
print(df.head())

**Sentiment Analysis Data Processing**

In [None]:
# import pandas as pd

In [None]:
# df = pd.read_csv('/content/sentiment_analysis.csv')

In [None]:
# df.head()

In [None]:
# Per-model translation tables from each model's label spelling to the shared
# NEGATIVE / NEUTRAL / POSITIVE vocabulary.
model_2_mapping = dict(negative='NEGATIVE', neutral='NEUTRAL', positive='POSITIVE')

model_3_mapping = dict(NEU='NEUTRAL', POS='POSITIVE', NEG='NEGATIVE')

model_4_mapping = dict(negative='NEGATIVE', neutral='NEUTRAL', positive='POSITIVE')

model_5_mapping = dict(negative='NEGATIVE', neutral='NEUTRAL', positive='POSITIVE')

# Replace sentiment values in each model's column
for sentiment_column, label_mapping in [
    ('model_2_sentiment', model_2_mapping),
    ('model_3_sentiment', model_3_mapping),
    ('model_4_sentiment', model_4_mapping),
    ('model_5_sentiment', model_5_mapping),
]:
    df[sentiment_column] = df[sentiment_column].replace(label_mapping)

# Display the modified DataFrame
df.head()


In [None]:
# df.to_csv('sentiment_analysis.csv', index=False)

In [None]:
# import pandas as pd

In [None]:
# df = pd.read_csv('/content/sentiment_analysis_category.csv')

In [None]:
# df.head()

In [None]:
# Define a function to get the majority sentiment
def majority_sentiment(row):
    """Return the most frequent value in ``row``.

    ``row.mode()`` can contain several values when labels tie; ``[0]`` takes
    the first of them.

    NOTE(review): pandas appears to return modes in sorted order, which would
    make ties resolve alphabetically (NEGATIVE before NEUTRAL before
    POSITIVE) — confirm this tie-break is intended.
    """
    return row.mode()[0]  # Get the most frequent value

# Apply the function across the specified columns
df['majority_sentiment'] = df[['model_1_sentiment', 'model_2_sentiment',
                                 'model_3_sentiment', 'model_4_sentiment',
                                 'model_5_sentiment']].apply(majority_sentiment, axis=1)

# Display the modified DataFrame
df.head()

In [None]:
# Save the Sentiment Analysis CSV.
# A later cell reloads '/content/sentiment_analysis.csv', so this write has to
# run for a fresh Restart & Run All to succeed (assumes the notebook's working
# directory is /content, as the other paths in this notebook do).
df.to_csv('sentiment_analysis.csv', index=False)

In [None]:
# import pandas as pd

In [None]:
# df = pd.read_csv('/content/sentiment_analysis_mode.csv')

In [None]:
# df.head()

**Make the CSV Based on Model & Sentiment Group**

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/content/sentiment_analysis.csv')

In [None]:
# One row per (model, majority_sentiment): all review text joined with spaces
# plus the number of reviews in that group.
by_model_and_sentiment = df.groupby(['model', 'majority_sentiment'])
grouped = by_model_and_sentiment.agg(
    concatenated_text=('combined', ' '.join),
    total_reviews=('combined', 'count'),
)
grouped = grouped.reset_index()

# One row per model: how many reviews fall under each majority label.
by_model = df.groupby('model')
sentiment_counts = by_model.agg(
    total_positive_reviews=('majority_sentiment', lambda x: (x == 'POSITIVE').sum()),
    total_negative_reviews=('majority_sentiment', lambda x: (x == 'NEGATIVE').sum()),
    total_neutral_reviews=('majority_sentiment', lambda x: (x == 'NEUTRAL').sum()),
)
sentiment_counts = sentiment_counts.reset_index()

# Attach the per-model totals to every (model, sentiment) row.
merge_df = pd.merge(grouped, sentiment_counts, on='model', how='left')

# Keep only the desired columns, in presentation order.
output_columns = [
    'model',
    'majority_sentiment',
    'total_reviews',
    'total_positive_reviews',
    'total_negative_reviews',
    'total_neutral_reviews',
    'concatenated_text',
]
merge_df = merge_df[output_columns]

# Display the final DataFrame
print(merge_df)


                                  model majority_sentiment  total_reviews  \
0                Noise ColorFit Ultra 3           NEGATIVE              1   
1                Noise ColorFit Ultra 3           POSITIVE             49   
2                     Noise Pulse 4 Max           NEGATIVE              2   
3                     Noise Pulse 4 Max            NEUTRAL              1   
4                     Noise Pulse 4 Max           POSITIVE             47   
..                                  ...                ...            ...   
233            realme Smart Watch 2 Pro            NEUTRAL              6   
234            realme Smart Watch 2 Pro           POSITIVE             76   
235  realme Techlife Smart Watch SZ100            NEGATIVE             20   
236  realme Techlife Smart Watch SZ100             NEUTRAL              3   
237  realme Techlife Smart Watch SZ100            POSITIVE             27   

     total_positive_reviews  total_negative_reviews  total_neutral_reviews 

In [None]:
#save the csv file
# merge_df.to_csv('sentiment_analysis_final.csv', index=False)

In [None]:
merge_df.shape

(238, 7)

In [None]:
merge_df['concatenated_text'][0]

'little bit disappointed reduced one star because of design the strip are little bit narrow a per dail design so it look like a woman design and one watchface is not available which is displayed on first picture'

# **3. Make the Summarization Model**

In [None]:
# Install necessary libraries
!pip install langchain_groq



In [None]:
!pip install langchain



In [None]:
import os
from langchain_groq import ChatGroq
from langchain.chains import LLMChain
from langchain import PromptTemplate

In [None]:
# Set your GROQ API key
# SECURITY: the key must never be written into the notebook, because it is saved with
# the file and with the cell output. Any key that was previously hard-coded here should
# be treated as compromised and rotated.
from getpass import getpass

# Use the key already present in the environment; otherwise ask for it without echoing.
if not os.environ.get('GROQ_API_KEY'):
    os.environ['GROQ_API_KEY'] = getpass('Enter your GROQ API key: ')

# Initialize the ChatGroq model
api_key = os.getenv("GROQ_API_KEY")
# Report only whether a key is available; never print the secret itself.
print('GROQ_API_KEY loaded:', bool(api_key))

<redacted: API key removed from the saved cell output>


In [None]:
# #select llm model
# llm = ChatGroq(groq_api_key=api_key, model="Gemma2-9b-It")

In [None]:
# # Sample article text
# ARTICLE = """
# apple product thats all go with the flow if you want to show off buy it if you are an existing android customer never buy an apple product untill and unless u have security trust issues product is good but amazon mobile return facility is not good during the return agent refused to take as he is saying that the iemi data is not matching i asked the currier boy to show iemi which i have filled and same he didnt show it to match also no where in the order summary it is being recorded as return failed and extra amount taken from customerit appears only lucrative offer in return is being given to attract the customer only sam good battery backup no good long time  all that glitters is not gold its a good phone but dont think that its perfect it has its own problems battery battery backup not goodfinger print sensor needed its an iphone so far  very slow to setupheating a lot during setuprest is as usual review after 1 years 1 display  452 durability  353 battery 254 iphone11  iphone6  iphone14as we are addicted to apple we are using iphone else no reason to buy this mobile its apple after all but dont buy from amazon excellent product have been using apple for 8 years nowamazon reviewdont buy from amazonwanted to write down as i regretted every second of this purchasing experience pathetic delivery experience amazon took orders more that they could handle and the delivery associates cannot keep up the system needs more transparency if it was going to take time that could be mentioned and particular slot of 34 hours could be given instead of wasting 2 full days of my time the whole system is either designed in a stupid way or a fraudulent way but not botha product worth 70k was roaring on the streets and no one knew where customer support telling its with delivery associate delivery associates telling they have not been assigned this consignment customer support just keeping the calls on indefinite hold as they had no other answers satisfied with the product received 
but has some negative opinion i am completely happy with the product however the things which was unsatisfied is that it took more than a week to deliver even after paying in full before delivery and also the bill which was provided after opening was halfy printed as it appears to be half cutted may be printing error the main parts of billing information was not present i hope the next order will avoid any kind of error in future  performance average phone happy with the phone but unhappy with amazon and vendor unhappy because no cost emi still charging gst  interest which was useless decision to buy the phone from the amazon a great phone for minimalist like me i prefer to have some privacy in life i think apple suite is bit more private than google it respects user privacy if you are not using third party apps my options were stock android without google something like lineageos or iphone i tried lineageos but there was always something breaking in my phone it couldnt become my daily driver i wish most of the things in android were open source like maps and some other proprietary google apps in the current state i was left with choosing iphone for long i had debated with myself about buying iphone or not i was using oneplus 7t for last 35 years the phone stopped getting updates and had many bugs the software changed a lot i hated it all these oem betraying consumers made me switch to iphone in the end i cant trust any other brand to have the consistency like apple has thats my take in the phone market i wish there was a consistent bloat free android brand with a great camera great buy i dont recommend to buy  iphone because prices keeps on dropping buy this in sale only to get maximum benefit i bought it for 70k now it is 53 k so loss kya hi bole evening think is good but price is not stable all time
# """

In [None]:
# # Template for summarizing Amazon reviews
# summary_template = """
# Write a concise summary of the following Amazon text reviews. The summary should be between 60 to 70 words:
# Reviews: {reviews}
# """

# # Create a PromptTemplate for summarization
# summary_prompt = PromptTemplate(input_variables=['reviews'], template=summary_template)

In [None]:
# # Create an LLMChain for summarization
# summary_chain = LLMChain(llm=llm, prompt=summary_prompt)

# # Run the summarization
# summary = summary_chain.run({'reviews': ARTICLE})
# print("Summary:")
# print(summary)

In [None]:
import os
from langchain_groq import ChatGroq
from langchain.schema import Document
from langchain.chains import LLMChain, load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate

In [None]:
# Sample article text (Amazon reviews)
# One block of lower-cased, punctuation-free review text about an iPhone purchase.
# It is split into chunks and summarized in the following cells as a trial run of the
# summarization chain.
ARTICLE = """
apple product thats all go with the flow if you want to show off buy it if you are an existing android customer never buy an apple product untill and unless u have security trust issues product is good but amazon mobile return facility is not good during the return agent refused to take as he is saying that the iemi data is not matching i asked the currier boy to show iemi which i have filled and same he didnt show it to match also no where in the order summary it is being recorded as return failed and extra amount taken from customerit appears only lucrative offer in return is being given to attract the customer only sam good battery backup no good long time  all that glitters is not gold its a good phone but dont think that its perfect it has its own problems battery battery backup not goodfinger print sensor needed its an iphone so far  very slow to setupheating a lot during setuprest is as usual review after 1 years 1 display  452 durability  353 battery 254 iphone11  iphone6  iphone14as we are addicted to apple we are using iphone else no reason to buy this mobile its apple after all but dont buy from amazon excellent product have been using apple for 8 years nowamazon reviewdont buy from amazonwanted to write down as i regretted every second of this purchasing experience pathetic delivery experience amazon took orders more that they could handle and the delivery associates cannot keep up the system needs more transparency if it was going to take time that could be mentioned and particular slot of 34 hours could be given instead of wasting 2 full days of my time the whole system is either designed in a stupid way or a fraudulent way but not botha product worth 70k was roaring on the streets and no one knew where customer support telling its with delivery associate delivery associates telling they have not been assigned this consignment customer support just keeping the calls on indefinite hold as they had no other answers satisfied with the product received 
but has some negative opinion i am completely happy with the product however the things which was unsatisfied is that it took more than a week to deliver even after paying in full before delivery and also the bill which was provided after opening was halfy printed as it appears to be half cutted may be printing error the main parts of billing information was not present i hope the next order will avoid any kind of error in future  performance average phone happy with the phone but unhappy with amazon and vendor unhappy because no cost emi still charging gst  interest which was useless decision to buy the phone from the amazon a great phone for minimalist like me i prefer to have some privacy in life i think apple suite is bit more private than google it respects user privacy if you are not using third party apps my options were stock android without google something like lineageos or iphone i tried lineageos but there was always something breaking in my phone it couldnt become my daily driver i wish most of the things in android were open source like maps and some other proprietary google apps in the current state i was left with choosing iphone for long i had debated with myself about buying iphone or not i was using oneplus 7t for last 35 years the phone stopped getting updates and had many bugs the software changed a lot i hated it all these oem betraying consumers made me switch to iphone in the end i cant trust any other brand to have the consistency like apple has thats my take in the phone market i wish there was a consistent bloat free android brand with a great camera great buy i dont recommend to buy  iphone because prices keeps on dropping buy this in sale only to get maximum benefit i bought it for 70k now it is 53 k so loss kya hi bole evening think is good but price is not stable all time
"""

In [None]:
# Step 1: chunk the sample text
# Wrap the raw ARTICLE string in a Document so the splitter can process it.
sample_document = Document(page_content=ARTICLE)

# 500-character chunks that share a 50-character overlap with their neighbour
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
docs = text_splitter.split_documents([sample_document])

In [None]:
# Step 2: Create prompt templates
# Prompt for summarizing each chunk (the "map" step of the map-reduce chain).
# The review text is fenced by a matching <text> ... </text> pair; the fence was
# previously opened twice and never closed.
chunks_prompt = """
Write a concise summary of the following Amazon text reviews.
Ensure the final summary is around 70 words.
<text>
{text}
</text>
"""

# `{text}` is the only placeholder in the template.
map_prompt_template = PromptTemplate(input_variables=['text'], template=chunks_prompt)

In [None]:
# Final prompt to summarize all chunks into a cohesive summary (the "combine" step).
# The chunk summaries are fenced by a matching <text> ... </text> pair; the fence was
# previously opened twice and never closed.
final_prompt = '''
Based on the summarized reviews, provide an overall summary with the following:
Ensure the final summary is around 70 to 80 words.
Show only text summary.
<text>
{text}
</text>
'''

# `{text}` is the only placeholder in the template.
final_prompt_template = PromptTemplate(input_variables=['text'], template=final_prompt)

In [None]:
# Step 3: build the Map-Reduce summarization chain

# Groq-hosted chat model used for both the map and the combine step
llm = ChatGroq(model="Gemma2-9b-It", groq_api_key=api_key)

# Each chunk is summarized with the map prompt; the partial summaries are then
# merged with the combine prompt.
summary_chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    combine_prompt=final_prompt_template,
    map_prompt=map_prompt_template,
    verbose=False,
)


In [None]:
# Step 4: Run the summarization
# `Chain.run` is deprecated in LangChain; `invoke` is the supported entry point.
# The summarize chain receives its documents under "input_documents" and returns
# the final summary under "output_text".
output = summary_chain.invoke({"input_documents": docs})["output_text"]
# print("Final Summary:")
# print(output)

  output = summary_chain.run(docs)


In [None]:
# import pandas as pd

In [None]:
# df = pd.read_csv('/content/sentiment_analysis_final (5).csv')

In [None]:
# df.head()

In [None]:
# summary_chain = load_summarize_chain(
#     llm=llm,
#     chain_type="map_reduce",
#     map_prompt=map_prompt_template,
#     combine_prompt=final_prompt_template,
#     verbose=False
# )


In [None]:
# Step 5: Apply summarization to the DataFrame
def summarize_text(text):
    """Summarize one block of review text with the map-reduce chain.

    Parameters
    ----------
    text : str
        Concatenated review text for one row.

    Returns
    -------
    str
        The chain's summary, or an empty string when `text` is missing
        (None/NaN), not a string, or contains only whitespace. In those cases
        no LLM call is made.
    """
    # Nothing to summarize: skip the splitter and the (slow, billable) LLM calls.
    if not isinstance(text, str) or not text.strip():
        return ""

    docs = text_splitter.split_documents([Document(page_content=text)])
    # `Chain.run` is deprecated; `invoke` takes the documents under
    # "input_documents" and returns the summary under "output_text".
    result = summary_chain.invoke({"input_documents": docs})
    return result["output_text"]

# Summarize every row's concatenated review text (one chain run per row)
merge_df['summary'] = merge_df['concatenated_text'].map(summarize_text)



Token indices sequence length is longer than the specified maximum sequence length for this model (1681 > 1024). Running this sequence through the model will result in indexing errors

KeyboardInterrupt



In [None]:
# Strip the bold "Overall Summary:" heading (and the blank line after it) that the
# model sometimes emits, then trim surrounding whitespace.
overall_heading = r'\*\*Overall Summary:\*\*\n\n?'
without_heading = merge_df['summary'].str.replace(overall_heading, '', regex=True)
merge_df['summary'] = without_heading.str.strip()


In [None]:
# Preview the first rows of merge_df, now including the cleaned 'summary' column
merge_df.head()

In [None]:
# List the columns currently present in merge_df
merge_df.columns

In [None]:
# Attach each (model, majority_sentiment) group's summary to the review-level DataFrame.
# A 'summary' column left over from an earlier run of this cell is dropped first;
# otherwise pandas would produce 'summary_x'/'summary_y' and the later selection of
# the 'summary' column by name would fail.
summary_lookup = merge_df[['model', 'majority_sentiment', 'summary']]
df = (
    df.drop(columns=['summary'], errors='ignore')
      .merge(summary_lookup, on=['model', 'majority_sentiment'], how='left')
)

# Display the updated df
df.head()

# **4. Add Model Description using LLM**

**1. Smartphone Prompt**

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [None]:
  ## Smartphone prompt: fixed system instructions plus the model name as the user question
  mobile_system_message = """You are a helpful mobile information assistant.
          Please provide a brief overview of the specified mobile model, including key features, specifications (like RAM and ROM combinations),cameras,display, available colors, and unique selling points.
          Respond in 100 to 120 words, ensuring clarity and relevance in one line.
          Avoid mentioning any related models.
          """
  mobile_prompt = ChatPromptTemplate.from_messages(
      [
          ("system", mobile_system_message),
          ("user", "Question: {question}"),
      ]
  )

In [None]:
# Pipeline for smartphone descriptions: prompt -> Groq chat model -> plain string
output_parser = StrOutputParser()
llm = ChatGroq(groq_api_key=api_key, model="Gemma2-9b-It")
chain = mobile_prompt | llm | output_parser

In [None]:
# model = 'iphone 14'
# print(chain.invoke({"question": model}).strip())


In [None]:
# import pandas as pd

In [None]:
# df = pd.read_csv('/content/cleaned_data.csv')

In [None]:
# df.columns

In [None]:
# df.head()

In [None]:
# Build a one-column frame holding each distinct smartphone model once
is_smartphone = df['category'] == 'Smartphones'
unique_df = df.loc[is_smartphone, ['model']].drop_duplicates().reset_index(drop=True)

print(unique_df)

In [None]:
# Function to get model info
def get_model_info(model, llm_chain=None):
    """Ask the LLM pipeline for a short description of a product model.

    Parameters
    ----------
    model : str
        Model name, sent to the pipeline as the ``question`` input.
    llm_chain : optional
        Object exposing ``invoke({"question": ...})`` and returning a string.
        When omitted, the module-level ``chain`` is used, so existing
        single-argument callers behave as before. Passing it explicitly avoids
        depending on whichever category chain was bound to ``chain`` last.

    Returns
    -------
    str
        The pipeline's answer with leading and trailing whitespace removed.
    """
    # Resolve the global only when no chain was supplied.
    active_chain = chain if llm_chain is None else llm_chain
    return active_chain.invoke({"question": model}).strip()

In [None]:
# Query the LLM once per distinct model and store the answer in 'model_info'
unique_df['model_info'] = unique_df['model'].map(get_model_info)

print(unique_df)

In [None]:
# Copy each smartphone model's description onto its review rows.
# Mapping by model (instead of DataFrame.merge) keeps this cell re-runnable: a second
# merge would add a suffixed 'model_info_unique' column rather than update 'model_info'.
smartphone_info = df['model'].map(unique_df.set_index('model')['model_info'])
if 'model_info' in df.columns:
    # Keep descriptions that are already present; fill only the gaps.
    df['model_info'] = df['model_info'].fillna(smartphone_info)
else:
    df['model_info'] = smartphone_info
df.head()

**2. Laptop Prompt**

In [None]:
# Laptop prompt: fixed system instructions plus the model name as the user question
laptop_system_message = """You are a helpful laptop information assistant.
        Please provide a brief overview of the specified laptop model, including key features, specifications (such as processor, RAM, and storage), display details, battery life, and available colors.
        Respond in 100 to 120 words, ensuring clarity and relevance in one line.
        Avoid mentioning any related models.
        """
laptop_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", laptop_system_message),
        ("user", "Question: {question}"),
    ]
)


In [None]:
# Pipeline for laptop descriptions: prompt -> Groq chat model -> plain string
output_parser = StrOutputParser()
llm = ChatGroq(groq_api_key=api_key, model="Gemma2-9b-It")
chain = laptop_prompt | llm | output_parser

In [None]:
# Build a one-column frame holding each distinct laptop model once
is_laptop = df['category'] == 'Laptop'
unique_df = df.loc[is_laptop, ['model']].drop_duplicates().reset_index(drop=True)
unique_df

In [None]:
# Generate a description for every laptop model and record it in df['model_info'].
# unique_df only holds the 'model' column at this point, so the descriptions are
# requested here (skipped if a previous run of this cell already added them).
if 'model_info' not in unique_df.columns:
    unique_df['model_info'] = unique_df['model'].apply(get_model_info)

# A plain merge is not used: when df already carries 'model_info', merged values would
# land in a suffixed 'model_info_unique' column that the final export never reads.
laptop_info = df['model'].map(unique_df.set_index('model')['model_info'])
if 'model_info' in df.columns:
    # Keep descriptions that are already present; fill only the gaps.
    df['model_info'] = df['model_info'].fillna(laptop_info)
else:
    df['model_info'] = laptop_info
df.head()

**3. Airbuds Prompt**

In [None]:
# Earbuds prompt: fixed system instructions plus the model name as the user question
earbuds_system_message = """You are a helpful earbuds information assistant.
        Please provide a brief overview of the specified earbuds model, including key features, specifications (such as battery life, connectivity options, and sound quality), noise cancellation capabilities, and available colors.
        Respond in 100 to 120 words, ensuring clarity and relevance in one line.
        Avoid mentioning any related models.
        """
earbuds_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", earbuds_system_message),
        ("user", "Question: {question}"),
    ]
)

In [None]:
# Pipeline for earbuds descriptions: prompt -> Groq chat model -> plain string
output_parser = StrOutputParser()
llm = ChatGroq(groq_api_key=api_key, model="Gemma2-9b-It")
chain = earbuds_prompt | llm | output_parser

In [None]:
# Build a one-column frame holding each distinct 'Airbuds' model once
is_airbuds = df['category'] == 'Airbuds'
unique_df = df.loc[is_airbuds, ['model']].drop_duplicates().reset_index(drop=True)
unique_df

In [None]:
# Generate a description for every earbuds model and record it in df['model_info'].
# unique_df only holds the 'model' column at this point, so the descriptions are
# requested here (skipped if a previous run of this cell already added them).
if 'model_info' not in unique_df.columns:
    unique_df['model_info'] = unique_df['model'].apply(get_model_info)

# A plain merge is not used: when df already carries 'model_info', merged values would
# land in a suffixed 'model_info_unique' column that the final export never reads.
airbuds_info = df['model'].map(unique_df.set_index('model')['model_info'])
if 'model_info' in df.columns:
    # Keep descriptions that are already present; fill only the gaps.
    df['model_info'] = df['model_info'].fillna(airbuds_info)
else:
    df['model_info'] = airbuds_info
df.head()

**4. Smartwatch Prompt**

In [None]:
# Smartwatch prompt: fixed system instructions plus the model name as the user question
smartwatch_system_message = """You are a helpful smartwatch information assistant.
        Please provide a brief overview of the specified smartwatch model, including key features, specifications (such as battery life, display type, and health tracking capabilities), compatibility with devices, available apps, and design options.
        Respond in 100 to 120 words, ensuring clarity and relevance in one line.
        Avoid mentioning any related models.
        """
smartwatch_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", smartwatch_system_message),
        ("user", "Question: {question}"),
    ]
)

In [None]:
# Pipeline for smartwatch descriptions: prompt -> Groq chat model -> plain string
output_parser = StrOutputParser()
llm = ChatGroq(groq_api_key=api_key, model="Gemma2-9b-It")
chain = smartwatch_prompt | llm | output_parser

In [None]:
# Build a one-column frame holding each distinct smartwatch model once
is_smartwatch = df['category'] == 'Smartwatches'
unique_df = df.loc[is_smartwatch, ['model']].drop_duplicates().reset_index(drop=True)
unique_df

In [None]:
# Generate a description for every smartwatch model and record it in df['model_info'].
# unique_df only holds the 'model' column at this point, so the descriptions are
# requested here (skipped if a previous run of this cell already added them).
if 'model_info' not in unique_df.columns:
    unique_df['model_info'] = unique_df['model'].apply(get_model_info)

# A plain merge is not used: when df already carries 'model_info', merged values would
# land in a suffixed 'model_info_unique' column that the final export never reads.
smartwatch_info = df['model'].map(unique_df.set_index('model')['model_info'])
if 'model_info' in df.columns:
    # Keep descriptions that are already present; fill only the gaps.
    df['model_info'] = df['model_info'].fillna(smartwatch_info)
else:
    df['model_info'] = smartwatch_info
df.head()

# **5. Make Final CSV**

In [None]:
# Preview the enriched DataFrame before the final columns are selected
df.head()

In [None]:
# Inspect the column names before renaming
df.columns

In [None]:
# Give the combined-text and sentiment columns their export names
column_renames = {'combined': 'review_text_combined', 'majority_sentiment': 'sentiment'}
df = df.rename(columns=column_renames)

In [None]:
# Check the column names after renaming
df.columns

In [None]:
# Columns written to the final dataset, in output order
final_columns = [
    'category',
    'brand',
    'model',
    'model_info',
    'review_text_combined',
    'review_stars',
    'sentiment',
    'summary',
]
final_df = df.loc[:, final_columns]

final_df.head()

In [None]:
# Write the final dataset to 'final_data.csv' in the working directory, without the index column
final_df.to_csv('final_data.csv', index=False)