In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

# Read the raw CSV (path is relative to the notebook's working directory)
# and preview the first rows as the cell output.
csv_path = "../amz_uk_price_prediction_dataset.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Part 1: Analyzing Best-Seller Trends Across Product Categories
# Objective: Understand the relationship between product categories and their best-seller status.

# 1. Crosstab Analysis:

# Contingency table: one row per category, one column per isBestSeller value.
crosstab_result = pd.crosstab(index=df['category'], columns=df['isBestSeller'])
display(crosstab_result)

# Are there categories where being a best-seller is more prevalent?
# Step 1: rank the categories by the absolute number of best-sellers (the True column).
crosstab_result = (
    crosstab_result
    .reset_index()
    .sort_values(by=True, ascending=False)
)
display(crosstab_result.head(5))

# Step 2: rank by the share of best-sellers within each category.
# Note: despite its name, 'Percentage' holds a fraction (True / (False + True)), not a value scaled to 100.
products_per_category = crosstab_result[False] + crosstab_result[True]
crosstab_result['Percentage'] = crosstab_result[True] / products_per_category
crosstab_result = crosstab_result.sort_values(by='Percentage', ascending=False)
display(crosstab_result.head(5))

print("The categories with highest amount of Bestseller articles are 'Grocery' and 'Health & Personal Care'. The categories with highest share of Bestseller articles are 'Grocery', 'Smart Home Security & Lighting' and 'Health & Personal Care'.")


In [None]:
# 2. Statistical Tests:

from scipy.stats import chi2_contingency
from scipy.stats.contingency import association

# Rebuild the plain category x isBestSeller contingency table used by both tests below.
crosstab_result = pd.crosstab(df['category'], df['isBestSeller'])

# Chi-square test of independence between best-seller status and product category.
# Only the statistic and the p-value are shown; the expected frequencies are kept in expected_freq.
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
display(chi2_statistic, chi2_p_value)

# NOTE(review): a p-value shown as exactly 0 is presumably a floating-point underflow
# of an extremely small value rather than a true zero; confirm before reporting it.
print("I am surprised that the result is 0 but if this is correct, this indicates that there is a relationship between category and articles being Bestsellers.")

# Cramér's V: strength of the association between best-seller status and category.
display(association(crosstab_result, method="cramer"))

print("The result of 0.122 indicates a very weak association between best-seller status and category.")


In [None]:
# 3. Visualizations:
# Visualize the relationship between product categories and the best-seller status using a stacked bar chart.

# Keep the 20 categories with the highest best-seller share.
# 'Percentage' is a fraction (True / (False + True)) and is only used for ranking.
best_seller_share = crosstab_result[True] / (crosstab_result[False] + crosstab_result[True])
crosstab_result = (
    crosstab_result
    .assign(Percentage=best_seller_share)
    .sort_values(by='Percentage', ascending=False)
    .head(20)
)

# Stack only the two count columns. Leaving 'Percentage' in the frame would add the
# 0-1 fraction as a third stacked series on top of the product counts.
# (drop() is used instead of [[False, True]] because a list of booleans is read as a row mask.)
ax = crosstab_result.drop(columns='Percentage').plot(kind="bar", stacked=True, figsize=(12, 6))
ax.set_xlabel("Category")
ax.set_ylabel("Number of products")
ax.set_title("Best-seller vs. non-best-seller products (top 20 categories by best-seller share)")
plt.show()

In [None]:
# Part 2: Exploring Product Prices and Ratings Across Categories and Brands
# Objective: Investigate how different product categories influence product prices.

# 0. Preliminary Step: Remove outliers in product prices.
# For this purpose, we can use the IQR (Interquartile Range) method. Products priced below the first quartile minus 1.5 times the IQR or above the third quartile plus 1.5 times the IQR will be considered outliers and removed from the dataset. The next steps will be done with the dataframe without outliers.
# Hint: you can check the last Check For Understanding at the end of the lesson EDA Bivariate Analysis for a hint on how to do this.
import pandas as pd

def tukeys_test_outliers(data, k=1.5):
    """Flag outliers in ``data`` using Tukey's fences (the IQR rule).

    A value is flagged when it lies strictly below ``Q1 - k * IQR`` or strictly
    above ``Q3 + k * IQR``, where Q1/Q3 are the 25th/75th percentiles of ``data``
    and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to test (anything exposing ``.quantile`` and
        element-wise comparison works the same way).
    k : float, default 1.5
        Fence multiplier. The default reproduces the classic 1.5 * IQR rule;
        larger values flag fewer points.

    Returns
    -------
    pandas.Series of bool
        Mask aligned with ``data``; ``True`` marks an outlier.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr

    # Values exactly on a fence are NOT outliers (strict comparisons).
    return (data < lower_bound) | (data > upper_bound)

# Flag price outliers with Tukey's rule. The mask is computed from df['price'],
# so it shares df's index and can be used to filter df directly.
data_series = df['price']
outliers_mask = tukeys_test_outliers(data_series)

# Report how many rows are affected and preview them with rich display
# instead of print()-ing whole pandas objects.
print("Outliers:")
print(f"{int(outliers_mask.sum())} of {len(df)} rows flagged as price outliers")
display(data_series[outliers_mask].head())

# Filter out the outliers by using the negation of the mask
df_without_outliers = df[~outliers_mask]

# Preview the DataFrame without outliers
print(f"Shape without outliers: {df_without_outliers.shape}")
display(df_without_outliers.head())


In [None]:
# 1. Violin Plots:

# Use a violin plot to visualize the distribution of price across different product categories. Filter out the top 20 categories based on count for better visualization.
# value_counts() already returns the counts in descending order, so head(20) yields the 20 largest categories.
category_counts = df['category'].value_counts().head(20)
# NOTE(review): the ranking is computed on the unfiltered df while the rows come from
# df_without_outliers; .loc would raise a KeyError if a top category had no rows left
# after outlier removal. Confirm the ranking source is intended.
df_count = df_without_outliers.set_index('category').loc[category_counts.index].reset_index()

plt.figure(figsize=(14, 6))
sns.violinplot(data=df_count, x='category', y='price', palette="coolwarm", hue='category', legend=False)
plt.xticks(rotation=45, ha='right')
plt.xlabel("Category")
plt.ylabel("Price (GBP)")
plt.title("Price distribution for the 20 most frequent categories")
plt.show()

# Which product category tends to have the highest median price? Don't filter here by top categories.
display(df_without_outliers.groupby("category")["price"].median().sort_values(ascending=False))
print("The category 'Desktop PCs' has the highest median price of 74.00 GBP.")


In [None]:
# 2. Bar Charts:

# Create a bar chart comparing the average price of products for the top 10 product categories (based on count).
# Computed directly here so the cell does not depend on a variable left over from another cell.
# value_counts() returns counts in descending order, so head(10) yields the 10 largest categories.
category_counts2 = df['category'].value_counts().head(10)
df_count10 = df_without_outliers.set_index('category').loc[category_counts2.index].reset_index()

# Mean price per top-10 category, highest first (this is also the bar order in the chart).
average_price_df = df_count10.groupby("category")["price"].mean().sort_values(ascending=False).reset_index()

# hue + legend=False matches the other plot cells and avoids seaborn's
# "palette without hue" deprecation warning.
sns.barplot(data=average_price_df, x="category", y="price", palette="coolwarm", hue="category", legend=False)
plt.xticks(rotation=45, ha='right')
plt.xlabel("Category")
plt.ylabel("Average Price")
plt.title("Average Price per Category")
plt.show()

# Which product category commands the highest average price? Don't filter here by top categories.
average_price_df2 = df_without_outliers.groupby("category")["price"].mean().sort_values(ascending=False).reset_index()
display(average_price_df2)
print("Highest average price has the category Motherboards with 68.77 GBP, followed by Boxing Shoes with 67.42 GBP and Desktop PCs with 66.92 GBP.")


In [None]:
# 3. Box Plots:

# Side-by-side box plots of the star ratings for the top 10 categories by count
# (df_count10 is the top-10 subset built in the bar chart cell).
sns.boxplot(
    data=df_count10,
    x='category',
    y='stars',
    hue='category',
    palette="coolwarm",
    legend=False,
)
plt.xticks(rotation=45, ha='right')
plt.show()

# Median rating per category over ALL categories (no top-N filter), highest first.
median_rating = (
    df_without_outliers
    .groupby("category")["stars"]
    .median()
    .sort_values(ascending=False)
    .reset_index()
)
display(median_rating)
print("Highest median rating has category 'Computer Memory' with 4.7 stars.")


In [None]:
# Part 3: Investigating the Interplay Between Product Prices and Ratings
# Objective: Analyze how product ratings (stars) correlate with product prices.

# 1. Correlation Coefficients:

# Pearson correlation (the pandas default, spelled out here) between price and stars.
# NOTE(review): this uses the full df, whereas the neighbouring cells use
# df_without_outliers; confirm which frame is intended.
correlation = df['price'].corr(df['stars'], method='pearson')
display(correlation)

# Is there a significant correlation between product price and its rating?
print("There is a very weak negative correlation, which could mean that expensive products tend very slightly to receive not as high ratings as cheaper ones.")


In [None]:
# 2. Visualizations:

# Scatter plot of rating (x) against price (y) on the outlier-free data.
sns.scatterplot(x='stars', y='price', data=df_without_outliers)
plt.show()
print("It rather looks like there are less ratings between 1 and 2 the higher the price.")

# Correlation heatmap.
# NOTE(review): only 'stars' and 'price' are included, not every numerical column; confirm this is intended.
df_rating_price = df_without_outliers[["stars", "price"]]
correlation_matrix = df_rating_price.corr()

plt.figure(figsize=(8, 5))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True)
plt.title("Correlation Heatmap for rating and price")
plt.show()

# QQ plot of price against a normal distribution; line='s' draws the standardized reference line.
import statsmodels.api as sm
sm.qqplot(df_without_outliers['price'], line='s')
plt.show()
print(" There seems to be a heavier tail on the lower side than the normal distribution (lower prices).")
