In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time

def _search_page_url(search_query, page_num):
    """Build the amazon.in search URL for one results page."""
    # Local import keeps this block runnable without touching the cell's imports.
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' (as before) and also escapes characters
    # such as '&' or '#' that would otherwise corrupt the query string.
    return f"https://www.amazon.in/s?k={quote_plus(search_query)}&page={page_num}&ref=nb_sb_noss"


def _parse_search_result(product):
    """Turn one search-result tag into a CSV row dict, or None if it is unusable."""
    name_tag = product.find('span', {'class': 'a-text-normal'})
    link_tag = product.find('a', {'class': 'a-link-normal'})
    # Some result cards lack a title or link; skip them instead of raising
    # AttributeError/TypeError halfway through the scrape.
    if name_tag is None or link_tag is None or not link_tag.get('href'):
        return None

    image_tag = product.find('img', {'class': 's-image'})
    price_tag = product.find('span', {'class': 'a-price-whole'})
    rating_tag = product.find('span', {'class': 'a-icon-alt'})
    # NOTE(review): this takes the first 'a-size-base' span in the card and
    # presumes it is the rating count - confirm against the live markup.
    rating_count_tag = product.find('span', {'class': 'a-size-base'})

    return {
        'Product Name': name_tag.text.strip(),
        'Image URL': image_tag.get('src') if image_tag is not None else None,
        'Product URL': 'https://www.amazon.in' + link_tag['href'],
        'Price': price_tag.text.strip() if price_tag is not None else None,
        'Rating': rating_tag.text.split()[0] if rating_tag is not None else None,
        'Rating Count': rating_count_tag.text if rating_count_tag is not None else None,
    }


def scrape_amazon(search_query, max_results, output_path='amazon_products.csv',
                  delay_seconds=20, timeout=30):
    """Scrape amazon.in search results for a query into a CSV file.

    Pages are fetched one after another until ``max_results`` rows have been
    written, a page yields no results, or a request fails.

    Args:
        search_query: Search phrase, e.g. ``'Galaxy S23'``.
        max_results: Maximum number of product rows to write.
        output_path: CSV file to (over)write. Defaults to the original
            ``'amazon_products.csv'``.
        delay_seconds: Pause between page requests, in seconds.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        None. Progress is printed and results are written to ``output_path``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.0.0 Safari/537.36",
    }
    fieldnames = ['Product Name', 'Image URL', 'Product URL', 'Price', 'Rating', 'Rating Count']

    products_count = 0
    page_num = 1

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        while products_count < max_results:
            current_page = _search_page_url(search_query, page_num)
            print(current_page)
            try:
                response = requests.get(current_page, headers=headers, timeout=timeout)
            except requests.RequestException as exc:
                print(f'Failed to fetch the page: {exc}')
                break
            if response.status_code != 200:
                print('Failed to fetch the page.')
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            products = soup.find_all('div', {'data-component-type': 's-search-result'})
            if not products:
                # Without this check an empty page (end of results or a bot
                # wall) would be re-requested forever.
                print("No more pages available.")
                break

            for product in products:
                record = _parse_search_result(product)
                if record is None:
                    continue
                writer.writerow(record)
                products_count += 1
                if products_count >= max_results:
                    break

            if products_count >= max_results:
                # Quota reached: no need to wait before finishing.
                break

            page_num += 1
            print(f"Scraping next page: {_search_page_url(search_query, page_num)}")
            time.sleep(delay_seconds)  # Adding a delay to avoid overwhelming the server

    print(f'Scraping completed. Collected {products_count} results. Check {output_path} file for results.')

# Example run: edit the search phrase and the row limit below, then execute the cell.
search_term = 'Galaxy S23'  # phrase submitted to the Amazon search page
max_results_to_collect = 20  # upper bound on the number of products written to the CSV
scrape_amazon(search_query=search_term, max_results=max_results_to_collect)





https://www.amazon.in/s?k=Galaxy+S23&page=1&ref=nb_sb_noss
Scraping next page: https://www.amazon.in/s?k=Galaxy+S23&page=1&ref=nb_sb_noss
https://www.amazon.in/s?k=Galaxy+S23&page=2&ref=nb_sb_noss
Scraping next page: https://www.amazon.in/s?k=Galaxy+S23&page=2&ref=nb_sb_noss
Scraping completed. Collected 20 results. Check amazon_products.csv file for results.


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
import time

def _text_or_none(tag):
    """Return the stripped text of a BeautifulSoup tag, or None when the tag is missing."""
    return tag.get_text(strip=True) if tag else None


def _reviews_csv_filename(product_title):
    """Derive the per-product CSV file name from the product title."""
    import re  # local import: the cell's import list does not include re

    stem = product_title.replace(' ', '_')
    # Titles may contain characters that are path separators or are reserved
    # on common filesystems; replace them so open() cannot fail or write
    # outside the working directory.
    stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '-', stem)
    return f"{stem}_reviews.csv"


def scrape_reviews_to_csv(product_url, row):
    """Scrape the reviews shown on one product page into ``<title>_reviews.csv``.

    Args:
        product_url: URL of the product page to fetch.
        row: The product's row from the listing CSV. Currently unused; kept
            so existing callers keep working.

    Returns:
        None. On success a CSV with one row per review block is written to
        the working directory, followed by a 20 second pause. Failures are
        reported with a printed message.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4472.130 Safari/537.39'
    }

    try:
        # A timeout keeps a single stalled connection from hanging the whole loop.
        response = requests.get(product_url, headers=headers, timeout=30)
    except requests.RequestException:
        print(f"Failed to fetch reviews for {product_url}")
        return
    if response.status_code != 200:
        print(f"Failed to fetch reviews for {product_url}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    title_tag = soup.find('span', {'id': 'productTitle'})
    if not title_tag:
        # Previously this case was skipped silently, hiding pages that did
        # not contain the expected product markup.
        print(f"No product title found for {product_url}")
        return
    product_title = title_tag.get_text(strip=True)

    reviews = []
    for review_block in soup.find_all('div', {'data-hook': 'review'}):
        review_content = _text_or_none(review_block.find('span', {'data-hook': 'review-body'}))
        reviews.append({
            'Product URL': product_url,
            'User name': _text_or_none(review_block.find('span', {'class': 'a-profile-name'})),
            'Rating': _text_or_none(review_block.find('i', {'data-hook': 'review-star-rating'})),
            'Review_date': _text_or_none(review_block.find('span', {'data-hook': 'review-date'})),
            'Review_title': _text_or_none(review_block.find('a', {'data-hook': 'review-title'})),
            'Review_content': review_content,
            # Review_size is the character length of the review body
            'Review_size': len(review_content) if review_content else None,
        })

    # Create a CSV file for each product
    fieldnames = ['Product URL', 'User name', 'Rating', 'Review_date', 'Review_title', 'Review_content', 'Review_size']
    with open(_reviews_csv_filename(product_title), 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(reviews)
    time.sleep(20)  # pause between product pages to avoid hammering the server

# Read the main CSV file with product URLs
# (this is the 'amazon_products.csv' file written by scrape_amazon)
data = pd.read_csv('amazon_products.csv')

# Loop through each product URL and scrape reviews to create separate CSV files
for index, row in data.iterrows():
    # Progress output: row number, then the URL about to be fetched
    print(index)
    product_url = row['Product URL']
    print(product_url)
    # Writes one '<title>_reviews.csv' per product
    scrape_reviews_to_csv(product_url, row)



0
https://www.amazon.in/Samsung-Galaxy-Phantom-Black-Storage/dp/B0BY8PRH1Y/ref=ice_ac_b_dpb?keywords=Galaxy+S23&qid=1702290066&sr=8-1
1
https://www.amazon.in/Samsung-Galaxy-Cream-256GB-Storage/dp/B0BTYVTMT6/ref=sr_1_2?keywords=Galaxy+S23&qid=1702290066&sr=8-2
2
https://www.amazon.in/Samsung-Galaxy-Lavender-256GB-Storage/dp/B0BRSL2XWP/ref=sr_1_3?keywords=Galaxy+S23&qid=1702290066&sr=8-3
3
https://www.amazon.in/Samsung-Galaxy-Cream-128GB-Storage/dp/B0BT9F9SJJ/ref=sr_1_4?keywords=Galaxy+S23&qid=1702290066&sr=8-4
4
https://www.amazon.in/Samsung-Galaxy-Green-256GB-Storage/dp/B0BTYX1RP4/ref=sr_1_5?keywords=Galaxy+S23&qid=1702290066&sr=8-5
5
https://www.amazon.in/Samsung-Galaxy-Green-256GB-Storage/dp/B0BT9DVZLZ/ref=sr_1_6?keywords=Galaxy+S23&qid=1702290066&sr=8-6
6
https://www.amazon.in/SAMSUNG-Galaxy-S23-Graphite-Storage/dp/B0CJXQX3MB/ref=sr_1_7?keywords=Galaxy+S23&qid=1702290066&sr=8-7
7
https://www.amazon.in/Samsung-Galaxy-Ultra-Green-Storage/dp/B0BTYWFXKC/ref=sr_1_8?keywords=Galaxy+S23&qi

In [None]:
import pandas as pd
import os

# Directory where the individual review CSV files are stored
reviews_directory = '/content'  # Replace with the directory containing individual CSV files

# Collect one DataFrame per product file and concatenate once at the end.
# DataFrame.append is deprecated and was removed in pandas 2.0; it also
# re-copied the accumulated frame on every iteration.
review_frames = []

# Loop through each file in the directory
for filename in os.listdir(reviews_directory):
    if not filename.endswith("_reviews.csv"):
        continue

    # Read individual CSV file
    file_path = os.path.join(reviews_directory, filename)
    print(file_path)
    df = pd.read_csv(file_path)

    # Extract product name from the file name.
    # NOTE: this reverses the space -> underscore substitution used when the
    # file was written, so a title that itself contained '_' is not
    # recovered exactly.
    product_name = filename.split('_reviews.csv')[0].replace('_', ' ')

    # Add 'Product Name' column to the DataFrame
    df['Product Name'] = product_name

    review_frames.append(df)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no review files were found.
all_reviews = pd.concat(review_frames, ignore_index=True) if review_frames else pd.DataFrame()

# Save combined reviews to a new CSV file
all_reviews.to_csv('combined.csv', index=False)


/content/Samsung_Galaxy_S23_5G_(Green,_8GB_Ram,_256GB_Storage)_reviews.csv
/content/Samsung_Galaxy_S23_FE_5G_(Purple,_8GB,_128GB_Storage)_reviews.csv
/content/Samsung_Galaxy_S23_FE_5G_(Graphite,_8GB,_256GB_Storage)_reviews.csv
/content/SAMSUNG_Galaxy_S23_FE_5G_(Mint_256_GB_Storage)_(8_GB_RAM)_reviews.csv
/content/Samsung_Galaxy_S23_5G_(Cream,_8GB,_128GB_Storage)_reviews.csv
/content/Samsung_Galaxy_S23_5G_(Green,_8GB,_128GB_Storage)_reviews.csv
/content/Samsung_Galaxy_S23_5G_(Green,_8GB,_256GB_Storage)_reviews.csv
/content/Samsung_Galaxy_S23_Ultra_5G_(Phantom_Black,_12GB,_512GB_Storage)_reviews.csv
/content/Samsung_Galaxy_S23_5G_(Cream,_8GB_Ram,_256GB_Storage)_reviews.csv
/content/SAMSUNG_Galaxy_S23_Ultra_5G_(Green,_12GB_RAM,_512GB_Storage)_reviews.csv
/content/Samsung_Galaxy_S23_FE_5G_(Graphite,_8GB,_128GB_Storage)_reviews.csv
/content/SAMSUNG_Galaxy_S23_FE_5G_(Graphite_128_GB_Storage)_(8_GB_RAM)_reviews.csv
/content/Samsung_Galaxy_S23_5G_(Lavender,_8GB,_256GB_Storage)_reviews.csv
/con

  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews = all_reviews.append(df, ignore_index=True)
  all_reviews 

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import PorterStemmer

# Download NLTK data (if not already downloaded)
nltk.download('punkt')  # tokenizer data needed by word_tokenize
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('averaged_perceptron_tagger')  # tagger model needed by pos_tag

# Read the combined reviews CSV file
# ('combined.csv' is the merged per-product review file written earlier in this notebook)
reviews_data = pd.read_csv('combined.csv')

# Lazily filled cache so the stop-word list and stemmer are built once rather
# than once per review row.
_PREPROCESS_RESOURCES = {}


# Function for data preprocessing
def preprocess_review(row):
    """Tokenize, stop-word filter, POS-tag and stem one review.

    Args:
        row: Mapping/Series with 'Review_content' and 'Product Name' entries.

    Returns:
        dict with the product name, the original review value, the raw
        tokens, the tokens left after stop-word removal, the POS tags of
        those filtered tokens, and their Porter stems.
    """
    sentence = row['Review_content']
    product_name = row['Product Name']

    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    stop_words = _PREPROCESS_RESOURCES['stop_words']
    stemmer = _PREPROCESS_RESOURCES['stemmer']

    # Reviews without body text are read back from the CSV as NaN (a float);
    # word_tokenize raises on non-strings, so treat them as empty text.
    text = '' if pd.isna(sentence) else str(sentence)

    # Tokenization
    tokens = word_tokenize(text)

    # Removing Stop Words (case-insensitive match, original casing kept)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # POS Tagging (performed on the filtered tokens, not the full sentence)
    pos_tags = pos_tag(filtered_tokens)

    # Stemming
    stemmed_words = [stemmer.stem(word) for word in filtered_tokens]

    return {
        'Product Name': product_name,
        'Original Sentence': sentence,
        'Tokens': tokens,
        'Filtered Tokens (after removing stop words)': filtered_tokens,
        'POS Tags': pos_tags,
        'Stemmed Words': stemmed_words
    }

# Run the preprocessing row by row; each row yields one dict of results
processed_reviews = reviews_data.apply(preprocess_review, axis=1)

# Build a DataFrame from the per-row dicts and fix the column order in the same step
processed_reviews_df = pd.DataFrame(processed_reviews.tolist())[
    [
        'Product Name',
        'Original Sentence',
        'Tokens',
        'Filtered Tokens (after removing stop words)',
        'POS Tags',
        'Stemmed Words',
    ]
]

# Persist the processed reviews
processed_reviews_df.to_csv('processed.csv', index=False)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
import pandas as pd
import nltk

# Download NLTK data (if not already downloaded)
nltk.download('punkt')  # tokenizer data needed by nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model needed by nltk.pos_tag

# Read the combined reviews CSV file
# (re-read from disk so this cell does not depend on earlier in-memory state)
reviews_data = pd.read_csv('combined.csv')

# Function for negative polarity identification
def identify_negative_polarities(sentence):
    """Find negation words followed by adjectives (NOA) or verbs (NOV).

    The sentence is tokenized and POS-tagged. For every negation word that
    is directly followed by an adjective (or verb), a two-word tuple is
    recorded; when the word after that carries a tag from the same family,
    a three-word tuple is recorded as well.

    Args:
        sentence: Review text as a string.

    Returns:
        (NOA_phrases, NOV_phrases): two lists of word tuples.
    """
    negation_words = {'not', 'no', 'never', 'none', 'nobody', 'nowhere', 'nothing', 'neither', 'nor', 'hardly',
                      'scarcely', 'barely'}
    adjective_tags = ('JJ', 'JJR', 'JJS')
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

    tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

    NOA_phrases, NOV_phrases = [], []

    # The last token has no successor, so it can never start a phrase.
    for position, (word, _tag) in enumerate(tagged_words[:-1]):
        if word.lower() not in negation_words:
            continue

        next_word, next_tag = tagged_words[position + 1]
        lookahead = tagged_words[position + 2:position + 3]
        third_word, third_tag = lookahead[0] if lookahead else (None, None)

        # The two tag families are disjoint, so at most one bucket is touched.
        for tag_family, phrases in ((adjective_tags, NOA_phrases), (verb_tags, NOV_phrases)):
            if next_tag not in tag_family:
                continue
            phrases.append((word, next_word))
            if next_word and third_tag in tag_family:
                phrases.append((word, next_word, third_word))

    return NOA_phrases, NOV_phrases

# Apply negative polarity identification to each review.
# Each review is tokenized and POS-tagged once; the (NOA, NOV) pair is then
# split into the two columns. Previously the function ran twice per review,
# once for each column.
phrase_pairs = reviews_data['Review_content'].apply(lambda x: identify_negative_polarities(str(x)))
reviews_data['NOA_Phrases'] = phrase_pairs.apply(lambda pair: pair[0])
reviews_data['NOV_Phrases'] = phrase_pairs.apply(lambda pair: pair[1])

# Save the updated reviews to a new CSV file
reviews_data.to_csv('reviews_with_negative_polarities.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
import pandas as pd
import nltk

# Download NLTK data (if not already downloaded)
nltk.download('punkt')  # tokenizer data needed by nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model needed by nltk.pos_tag

# Read the combined reviews CSV file
# (starts again from 'combined.csv', so columns added by earlier cells are not present)
reviews_data = pd.read_csv('combined.csv')

# Function for negative polarity identification
def identify_negative_polarities(row):
    """Find negation + adjective (NOA) and negation + verb (NOV) phrases in a review row.

    Works on the row's 'Review_content' (converted with ``str``) and tags
    every phrase with the row's 'Product Name' as the final tuple element.
    A negation word directly followed by an adjective (or verb) yields a
    two-word phrase; if the word after that has a tag from the same family,
    a three-word phrase is added too.

    Args:
        row: Mapping/Series with 'Review_content' and 'Product Name' entries.

    Returns:
        (NOA_phrases, NOV_phrases): two lists of tuples.
    """
    product_name = row['Product Name']
    tagged_words = nltk.pos_tag(nltk.word_tokenize(str(row['Review_content'])))

    negation_words = {'not', 'no', 'never', 'none', 'nobody', 'nowhere', 'nothing', 'neither', 'nor', 'hardly',
                      'scarcely', 'barely'}
    adjective_tags = ('JJ', 'JJR', 'JJS')
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

    NOA_phrases, NOV_phrases = [], []

    # The last token has no successor, so it can never start a phrase.
    for position, (word, _tag) in enumerate(tagged_words[:-1]):
        if word.lower() not in negation_words:
            continue

        next_word, next_tag = tagged_words[position + 1]
        lookahead = tagged_words[position + 2:position + 3]
        third_word, third_tag = lookahead[0] if lookahead else (None, None)

        # The two tag families are disjoint, so at most one bucket is touched.
        for tag_family, phrases in ((adjective_tags, NOA_phrases), (verb_tags, NOV_phrases)):
            if next_tag not in tag_family:
                continue
            phrases.append((word, next_word, product_name))
            if next_word and third_tag in tag_family:
                phrases.append((word, next_word, third_word, product_name))

    return NOA_phrases, NOV_phrases

# Apply negative polarity identification to each review
# The function returns a (NOA, NOV) pair per row; result_type='expand' spreads
# that pair over two columns, which are assigned to the two new column names.
reviews_data[['NOA_Phrases', 'NOV_Phrases']] = reviews_data.apply(identify_negative_polarities, axis=1, result_type='expand')

# Save the updated reviews to a new CSV file
# (same file name as the previous cell writes, so this version replaces it)
reviews_data.to_csv('reviews_with_negative_polarities.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
import pandas as pd

# Load the reviews file produced by the negative-polarity step
# (change the path below if your dataset lives elsewhere)
dataset = pd.read_csv('reviews_with_negative_polarities.csv')

# Assuming your dataset has a 'Rating' column containing ratings in the format '4.0 out of 5 stars'

# Function to extract numeric value from the 'Rating' column
def extract_numeric_rating(rating):
    """Return the leading number of a rating such as '4.0 out of 5 stars'.

    Args:
        rating: Rating text, or a missing value (NaN/None).

    Returns:
        The first whitespace-separated token as a float, or None when the
        value is not a string, is empty, or does not start with a number.
    """
    try:
        return float(rating.split()[0])  # Extract the first part before the space and convert to float
    except (AttributeError, IndexError, ValueError):
        # AttributeError: NaN/None has no .split(); IndexError: empty string;
        # ValueError: first token is not numeric. Anything else is a real
        # error and is allowed to propagate instead of being swallowed.
        return None

# Apply the function to create a new 'Numeric_Rating' column
dataset['Numeric_Rating'] = dataset['Rating'].apply(extract_numeric_rating)
# NOTE: this overwrites the same CSV that was loaded above, now with the extra column
dataset.to_csv('reviews_with_negative_polarities.csv', index=False)
# Display the updated dataset (original rating text next to the parsed number)
print(dataset[['Rating', 'Numeric_Rating']])


                 Rating  Numeric_Rating
0    5.0 out of 5 stars             5.0
1    5.0 out of 5 stars             5.0
2    4.0 out of 5 stars             4.0
3    5.0 out of 5 stars             5.0
4    4.0 out of 5 stars             4.0
..                  ...             ...
110                 NaN             NaN
111  4.0 out of 5 stars             4.0
112  1.0 out of 5 stars             1.0
113  4.0 out of 5 stars             4.0
114  1.0 out of 5 stars             1.0

[115 rows x 2 columns]


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from nltk.tokenize import word_tokenize
import nltk
import math
nltk.download('punkt')  # tokenizer data needed by word_tokenize

# Load the reviews dataset for sentiment analysis.
# This is the file written by the earlier cells; it must already contain the
# 'Numeric_Rating' column that is read further down in this cell.
dataset = pd.read_csv('/content/reviews_with_negative_polarities.csv')

# Assuming your dataset has 'Review_content' and 'Rating' columns
# Adjust column names accordingly if they differ in your dataset

# Map a numeric star rating onto a sentiment label
def categorize_sentiment(rating):
    """Label a numeric rating as 'Positive', 'Negative' or 'Neutral'.

    The rating is truncated to an integer first: above 3 is 'Positive',
    below 3 is 'Negative', and exactly 3 is 'Neutral'. A NaN rating is
    also reported as 'Neutral'.
    """
    # Missing ratings are bucketed with the neutral class
    if math.isnan(rating):
        return 'Neutral'

    stars = int(rating)
    if stars == 3:
        return 'Neutral'
    return 'Positive' if stars > 3 else 'Negative'


# Create 'Sentiment' column based on 'Rating'
# NOTE: rows whose rating is missing are labelled 'Neutral' by
# categorize_sentiment and therefore take part in training as Neutral examples.
dataset['Sentiment'] = dataset['Numeric_Rating'].apply(categorize_sentiment)

# Reviews without body text are read back from the CSV as NaN, which
# TfidfVectorizer rejects ("np.nan is an invalid document"). Normalise the
# column to plain strings so both fitting here and later transforms are safe.
dataset['Review_content'] = dataset['Review_content'].fillna('').astype(str)

# Tokenizing and vectorizing the text data
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so test-set text influences the vocabulary and IDF weights. It is
# left unchanged here so the reported figures stay reproducible; fitting on
# the training split only would give a cleaner accuracy estimate.
tfidf = TfidfVectorizer(tokenizer=word_tokenize, stop_words='english')
X = tfidf.fit_transform(dataset['Review_content'])
y = dataset['Sentiment']  # 'Sentiment' column is now the target variable

# Split the dataset into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize SVM classifier
svm_classifier = SVC(kernel='linear')

# Train the SVM model
svm_classifier.fit(X_train, y_train)

# Evaluate the model
accuracy = svm_classifier.score(X_test, y_test)
print(f"Accuracy of the SVM model: {accuracy}")

# Persist the dataset including the new 'Sentiment' column
dataset.to_csv('/content/reviews_with_negative_polarities.csv', index=False)

# Serialize the trained model
import pickle
# Replace 'svm_model.pkl' with the desired file path and name to save the trained model.
# Only the classifier is pickled; the fitted `tfidf` vectorizer is needed as
# well to score new text and is not saved here.
with open('svm_model.pkl', 'wb') as model_file:
    pickle.dump(svm_classifier, model_file)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy of the SVM model: 0.9130434782608695


In [None]:
import warnings

# Ignore all warnings (the import and the filter call were duplicated; once is enough).
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation notices - consider narrowing it to the
# specific categories that are known to be noise.
warnings.filterwarnings("ignore")

import pandas as pd



# Define a function to perform sentiment analysis for products containing specific keywords in the product name
def find_best_product_by_keyword(keyword):
    """Print the product with the most positively predicted reviews for a keyword.

    Uses the notebook-level ``dataset``, ``tfidf`` and ``svm_classifier``
    objects. Products are selected when their 'Product Name' contains
    ``keyword`` (case-insensitive, literal match). Each selected review is
    classified, the overall sentiment counts are printed, and the product
    with the largest number of 'Positive' predictions is reported together
    with its URL.

    Args:
        keyword: Text to look for in product names.

    Returns:
        None. All results are printed.
    """
    # regex=False: product names contain characters such as '(' that would
    # otherwise be interpreted as (possibly invalid) regular expressions.
    name_matches = dataset['Product Name'].str.contains(keyword, case=False, na=False, regex=False)
    # Work on a copy so adding a column below never writes into a view of `dataset`.
    filtered_data = dataset[name_matches].copy()

    if filtered_data.empty:
        print(f"No products found containing the keyword: '{keyword}'")
        return

    # Predict sentiments for the filtered products; missing review text is
    # scored as an empty document instead of crashing the vectorizer.
    review_texts = filtered_data['Review_content'].fillna('').astype(str)
    X_filtered = tfidf.transform(review_texts)
    filtered_data['Predicted_Sentiment'] = svm_classifier.predict(X_filtered)

    # Aggregate sentiments for the filtered products
    sentiment_counts = filtered_data['Predicted_Sentiment'].value_counts()

    print(f"Sentiment analysis results for products containing the keyword '{keyword}':")
    print(sentiment_counts)

    # Count positive predictions per product. Filtering first avoids the
    # KeyError the pivoted table raised when no review was predicted Positive.
    positive_reviews = filtered_data[filtered_data['Predicted_Sentiment'] == 'Positive']
    if positive_reviews.empty:
        print(f"No positively reviewed products found containing the keyword: '{keyword}'")
        return
    positive_counts = positive_reviews.groupby('Product Name').size()

    # Identify the product with the most positive sentiments (first one wins on ties)
    best_product_name = positive_counts.idxmax()
    best_product_url = filtered_data.loc[filtered_data['Product Name'] == best_product_name, 'Product URL'].iloc[0]
    if pd.notnull(best_product_url):
        print(f"The URL for the best product '{best_product_name}' is: {best_product_url}")
    else:
        print(f"URL not found for the best product '{best_product_name}'")

    print(f"The best product among those containing '{keyword}' is: {best_product_name}")

# Pick the keyword to look for in product names, then report the best-reviewed match
searched_keyword = 'Galaxy S23'  # change this to search for a different product family
find_best_product_by_keyword(keyword=searched_keyword)




Sentiment analysis results for products containing the keyword 'Galaxy S23':
Positive    72
Negative    24
Neutral     19
Name: Predicted_Sentiment, dtype: int64
The URL for the best product 'Samsung Galaxy S23 Ultra 5G (Phantom Black, 12GB, 256GB Storage)' is: https://www.amazon.in/Samsung-Galaxy-Ultra-Phantom-Storage/dp/B0BTWQZBGP/ref=sr_1_10?keywords=Galaxy+S23&qid=1702290066&sr=8-10
The best product among those containing 'Galaxy S23' is: Samsung Galaxy S23 Ultra 5G (Phantom Black, 12GB, 256GB Storage)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Requires the trained 'svm_classifier' and the held-out 'X_test' / 'y_test' from the training cell

# Predict labels for the held-out reviews
y_pred = svm_classifier.predict(X_test)

# Overall fraction of correct predictions
acc = accuracy_score(y_test, y_pred)

# Each of precision / recall / F1 is computed under the same four averaging
# modes, in this order: micro, macro, weighted, and None (per-class array).
pre_micro, pre_macro, pre_weighted, pre_none = (
    precision_score(y_test, y_pred, average=mode)
    for mode in ('micro', 'macro', 'weighted', None)
)

rec_micro, rec_macro, rec_weighted, rec_none = (
    recall_score(y_test, y_pred, average=mode)
    for mode in ('micro', 'macro', 'weighted', None)
)

f1_micro, f1_macro, f1_weighted, f1_none = (
    f1_score(y_test, y_pred, average=mode)
    for mode in ('micro', 'macro', 'weighted', None)
)

# Confusion matrix of true labels against predicted labels
cm = confusion_matrix(y_test, y_pred)



In [None]:
# Print every metric as "<label> <value>", one per line, in a fixed order
for _label, _value in (
    ("Accuracy:", acc),
    ("Precision (Micro):", pre_micro),
    ("Precision (Macro):", pre_macro),
    ("Precision (Weighted):", pre_weighted),
    ("Precision (None):", pre_none),
    ("Recall (Micro):", rec_micro),
    ("Recall (Macro):", rec_macro),
    ("Recall (Weighted):", rec_weighted),
    ("Recall (None):", rec_none),
    ("F1 Score (Micro):", f1_micro),
    ("F1 Score (Macro):", f1_macro),
    ("F1 Score (Weighted):", f1_weighted),
    ("F1 Score (None):", f1_none),
    ("Confusion Matrix:\n", cm),
):
    print(_label, _value)


Accuracy: 0.9130434782608695
Precision (Micro): 0.9130434782608695
Precision (Macro): 0.9607843137254902
Precision (Weighted): 0.9232736572890026
Precision (None): [1.         1.         0.88235294]
Recall (Micro): 0.9130434782608695
Recall (Macro): 0.7777777777777778
Recall (Weighted): 0.9130434782608695
Recall (None): [0.83333333 0.5        1.        ]
F1 Score (Micro): 0.9130434782608695
F1 Score (Macro): 0.8377525252525252
F1 Score (Weighted): 0.906538208168643
F1 Score (None): [0.90909091 0.66666667 0.9375    ]
Confusion Matrix:
 [[ 5  0  1]
 [ 0  1  1]
 [ 0  0 15]]


### Result Analysis

#### Accuracy
The model achieved an accuracy of approximately 91.3%, signifying the overall correctness in classifying sentiments across all classes.

#### Precision
- **Micro-average Precision:** Precision computed globally from the total true-positive and false-positive counts across all classes. For single-label multiclass predictions this is equal to accuracy, which is why the micro-precision is also 91.3%.
- **Macro-average Precision:** It computes the average precision for each class without considering class imbalance. The macro-precision is approximately 96.1%, suggesting good precision across classes.
- **Weighted-average Precision:** It calculates the precision for each class, considering the number of samples in each class. The weighted precision here is around 92.3%.
- **Class-specific Precision:** It indicates precision values for each class separately. The classes have precision values of 100%, 100%, and approximately 88.2%, respectively.

#### Recall
- **Micro-average Recall:** Recall computed globally from the total true-positive and false-negative counts across all classes. The micro-recall achieved is 91.3%.
- **Macro-average Recall:** It computes the average recall for each class without considering class imbalance. The macro-recall is approximately 77.8%, indicating some variability in class-specific recall values.
- **Weighted-average Recall:** It calculates the recall for each class, considering the number of samples in each class. The weighted recall here is 91.3%.
- **Class-specific Recall:** The recall values for classes are approximately 83.3%, 50%, and 100%, respectively.

#### F1 Score
- **Micro-average F1 Score:** It represents the harmonic mean of precision and recall calculated globally across all classes. The micro-F1 score achieved is 91.3%.
- **Macro-average F1 Score:** It calculates the average F1 score for each class without considering class imbalance. The macro-F1 score is approximately 83.8%.
- **Weighted-average F1 Score:** It computes the F1 score for each class, considering the number of samples in each class. The weighted F1 score here is around 90.7%.
- **Class-specific F1 Score:** The F1 score values for each class are approximately 90.9%, 66.7%, and 93.8%, respectively.

#### Confusion Matrix
The confusion matrix displays the model's classification results. It indicates that:
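Rows are the true classes and columns are the predicted classes. Assuming scikit-learn's default sorted label order, Class 0 is Negative, Class 1 is Neutral, and Class 2 is Positive.
- Class 0: 5 of 6 samples correctly predicted; 1 sample misclassified as Class 2.
- Class 1: 1 of 2 samples correctly predicted; 1 sample misclassified as Class 2.
- Class 2: all 15 samples correctly predicted. The two errors above were both predicted as Class 2, which is why its precision is about 88.2% (15 of 17) even though its recall is 100%.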

### Summary
The model demonstrates high accuracy, especially in correctly classifying samples for Class 2, while exhibiting some misclassifications for Classes 0 and 1. Further analysis and potentially fine-tuning the model could be beneficial to address misclassifications and improve performance, especially for Classes 0 and 1.