# Amazon Sales Dataset

1. What is the average rating for each product category?

In [None]:
import pandas as pd

# Read the Amazon sales data into a DataFrame
df = pd.read_csv('amazon.csv')

# One row per category holding the mean of that category's 'rating' values.
# NOTE(review): assumes 'rating' is loaded as a numeric column — confirm.
avg_rating_by_category = df.groupby('category', as_index=False)['rating'].mean()

# Show the full per-category table before any re-ordering
print(avg_rating_by_category)

# Put the highest-rated categories at the top
avg_rating_by_category = avg_rating_by_category.sort_values('rating', ascending=False)

# Show only the first five rows of the sorted table
print(avg_rating_by_category.head(5))



2. Which products have the highest rating_count in each category?

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('amazon.csv')

# Category-level view: mean rating_count per category, largest first.
# NOTE(review): assumes 'rating_count' is loaded as a numeric column — confirm.
avg_rating_count_by_category = df.groupby('category')['rating_count'].mean().reset_index()
avg_rating_count_by_category = avg_rating_count_by_category.sort_values(by='rating_count', ascending=False)

# Print the top categories by average rating count
print(avg_rating_count_by_category.head())

# Product-level view: the averages above never name a product, so also pick,
# for every category, the single row with the largest rating_count.
# Rows missing the category or the count cannot be ranked and are dropped.
# After the descending sort, the first row seen per category is its maximum.
top_products_by_category = (
    df.dropna(subset=['category', 'rating_count'])
    .sort_values(by='rating_count', ascending=False)
    .drop_duplicates(subset='category', keep='first')
    [['category', 'product_name', 'rating_count']]
    .reset_index(drop=True)
)

# Print the most-rated product of each category
print(top_products_by_category)


3. What is the distribution of discounted prices vs. actual prices?

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('amazon.csv')

# Calculate the discount amount (absolute difference, same unit as the prices)
# NOTE(review): assumes both price columns are loaded as numeric — confirm.
df['discount_amount'] = df['actual_price'] - df['discounted_price']

# Both histograms are drawn on the same axes, so they must share bin edges;
# otherwise each series gets bins fitted to its own min/max and the bar
# heights of the two overlays cannot be compared. A common range over both
# columns (missing values are skipped by min/max) gives identical edges.
price_columns = df[['actual_price', 'discounted_price']]
price_range = (price_columns.min().min(), price_columns.max().max())

# Plot the distribution of discounted prices vs. actual prices
plt.figure(figsize=(10, 6))
plt.hist(df['actual_price'], range=price_range, alpha=0.5, label='Actual Price')
plt.hist(df['discounted_price'], range=price_range, alpha=0.5, label='Discounted Price')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.title('Distribution of Discounted Prices vs. Actual Prices')
plt.legend()
plt.show()

# Plot the distribution of discount amounts
plt.figure(figsize=(10, 6))
plt.hist(df['discount_amount'], bins=50)
plt.xlabel('Discount Amount')
plt.ylabel('Frequency')
plt.title('Distribution of Discount Amounts')
plt.show()


4. How does the average discount percentage vary across categories?

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the Amazon sales data into a DataFrame
df = pd.read_csv('amazon.csv')

# Mean discount percentage per category, largest average first.
# NOTE(review): assumes 'discount_percentage' is loaded as numeric — confirm.
avg_discount_by_category = (
    df.groupby('category')['discount_percentage']
    .mean()
    .reset_index()
    .sort_values(by='discount_percentage', ascending=False)
)

# Show the whole sorted table
print(avg_discount_by_category)

# Bar chart: one bar per category, in the sorted order computed above
plt.figure(figsize=(10, 6))
plt.bar(
    avg_discount_by_category['category'],
    avg_discount_by_category['discount_percentage'],
)
plt.title('Average Discount Percentage by Category')
plt.xlabel('Category')
plt.ylabel('Average Discount Percentage')
# Rotate the category tick labels by 90 degrees
plt.xticks(rotation=90)
plt.show()


5. What are the most popular product names?

In [None]:
import pandas as pd

# How many of the most frequent names to report in the second listing
N = 10

# Read the Amazon sales data into a DataFrame
df = pd.read_csv('amazon.csv')

# Number of rows carrying each distinct product name
product_name_counts = df.loc[:, 'product_name'].value_counts()

# Quick look at the first five entries of the count table
print(product_name_counts.head(5))

# The N names with the largest counts
top_product_names = product_name_counts.nlargest(n=N)
print(top_product_names)




6. What are the most popular product keywords?

In [None]:
from collections import Counter
from itertools import chain
import re

import pandas as pd

# Load the dataset
df = pd.read_csv('amazon.csv')

# Combine product names and descriptions into a single column.
# Missing values are replaced by '' first: concatenating with NaN would make
# the whole cell NaN (losing the other field) and str(NaN) would later be
# counted as the bogus keyword "nan".
df['product_text'] = (
    df['product_name'].fillna('').astype(str)
    + ' '
    + df['about_product'].fillna('').astype(str)
)

# Convert text to lowercase and remove special characters
# (everything except ASCII letters, digits and whitespace is dropped)
df['product_text'] = df['product_text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', str(x).lower()))

# Split text into words and flatten the per-row lists into one list.
# chain.from_iterable walks the lists once; summing lists instead copies the
# growing result on every row and returns 0 (not a list) for an empty frame.
words = list(chain.from_iterable(df['product_text'].str.split()))

# Count the occurrences of each word
word_counts = Counter(words)

# Print the ten most frequent words with their counts
print(word_counts.most_common(10))



7. What are the most popular product reviews?

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('amazon.csv')

# Popularity proxy #1: rating multiplied by rating_count.
# NOTE(review): the column is named 'helpfulness_score' but no helpfulness
# data is used — it is only rating * rating_count; both are assumed numeric.
df['helpfulness_score'] = df['rating'] * df['rating_count']

# Sort the reviews by that score in descending order
most_popular_reviews = df.sort_values(by='helpfulness_score', ascending=False)

# Print the top reviews
print(most_popular_reviews.head())

# Popularity proxy #2: length of the review text in characters.
# A missing review is read as NaN (a float), on which len() raises TypeError,
# so missing reviews are treated as empty text and get length 0.
df['review_length'] = df['review_content'].fillna('').astype(str).str.len()
most_popular_reviews = df.sort_values(by='review_length', ascending=False)
print(most_popular_reviews.head())


8. What is the correlation between discounted_price and rating?

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the Amazon sales data into a DataFrame
df = pd.read_csv('amazon.csv')

# Correlation coefficient of the two columns, reported to two decimals.
# NOTE(review): assumes 'discounted_price' and 'rating' are numeric — confirm.
correlation = df['discounted_price'].corr(df['rating'])
print(f'Correlation: {correlation:.2f}')

# Scatter plot of rating against discounted price, one point per row
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='discounted_price', y='rating')
plt.xlabel('Discounted Price')
plt.ylabel('Rating')
plt.title('Relationship between Discounted Price and Rating')
plt.show()



9. What are the top 5 categories based on the highest average rating?

In [None]:
import pandas as pd

# Read the Amazon sales data into a DataFrame
df = pd.read_csv('amazon.csv')

# Mean rating for every category, one row per category.
# NOTE(review): assumes 'rating' is loaded as a numeric column — confirm.
avg_rating_by_category = df.groupby('category', as_index=False)['rating'].mean()

# Order by mean rating, best first, and keep the first five rows
top_categories = avg_rating_by_category.sort_values('rating', ascending=False).iloc[:5]

# Show the five best-rated categories
print(top_categories)


10. Identify any potential areas for improvement or optimization based on the data analysis.

In [None]:
import pandas as pd

# Read the Amazon sales data into a DataFrame
df = pd.read_csv('amazon.csv')

# --- Categories -----------------------------------------------------------
# Mean rating for every category, one row per category.
# NOTE(review): assumes 'rating' and 'discount_percentage' are numeric — confirm.
avg_rating_by_category = df.groupby('category', as_index=False)['rating'].mean()

# Keep the categories whose mean rating is below the mean of all category means
low_rated_categories = avg_rating_by_category.loc[
    lambda table: table['rating'] < table['rating'].mean()
]
print(low_rated_categories)

# --- Discount vs. rating --------------------------------------------------
# Correlation coefficient between discount percentage and rating, two decimals
correlation = df['discount_percentage'].corr(df['rating'])
print(f'Correlation between discount percentage and rating: {correlation:.2f}')

# --- Products -------------------------------------------------------------
# Rows rated below the overall mean rating AND discounted above the overall
# mean discount percentage
low_rated_products = df.loc[
    df['rating'].lt(df['rating'].mean())
    & df['discount_percentage'].gt(df['discount_percentage'].mean())
]
print(low_rated_products)

