In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from scipy.stats import chi2_contingency

%matplotlib inline

In [2]:
# Load the CSV file into a DataFrame
file_path = r"C:\Users\Admin\Downloads\dataset\amz_uk_price_prediction_dataset.csv"  # Use raw string by adding r in front of the path
# Load CSV into DataFrame
df = pd.read_csv(file_path)

In [3]:
df.shape

(2443651, 9)

In [4]:
df.columns

Index(['uid', 'asin', 'title', 'stars', 'reviews', 'price', 'isBestSeller',
       'boughtInLastMonth', 'category'],
      dtype='object')

In [5]:
df.dtypes

uid                    int64
asin                  object
title                 object
stars                float64
reviews                int64
price                float64
isBestSeller            bool
boughtInLastMonth      int64
category              object
dtype: object

In [24]:
### Part 1: Analyzing Best-Seller Trends Across Product Categories

##**Objective**: Understand the relationship between product categories and their best-seller status.

##1. **Crosstab Analysis**:
##    - Create a crosstab between the product `category` and the `isBestSeller` status.
crosstab = pd.crosstab(df['category'], df['isBestSeller'])    
##    - Are there categories where being a best-seller is more prevalent? 
crosstab['BestSeller_Proportion'] = crosstab[True] / (crosstab[True] + crosstab[False])    	
##    	*Hint: one option is to calculate the proportion of best-sellers for each category and then sort the categories based on this proportion in descending order.*
sorted_crosstab = crosstab.sort_values(by='BestSeller_Proportion', ascending=False)

print(sorted_crosstab)

##2. **Statistical Tests**:
##    - Conduct a Chi-square test to determine if the best-seller distribution is independent of the product category.
chi2, p, dof, ex = chi2_contingency(crosstab.iloc[:, :-1])

print(chi2, p)
##    - Compute CramÃ©r's V to understand the strength of association between best-seller status and category.
n = crosstab.sum().sum()
cramers_v = np.sqrt(chi2 / (n * (min(crosstab.shape) - 1)))

print(cramers_v)
##3. **Visualizations**:
##	- Visualize the relationship between product categories and the best-seller status using a stacked bar chart.
crosstab_plot = crosstab.copy()
crosstab_plot['Total'] = crosstab_plot[True] + crosstab_plot[False]

crosstab_plot_norm = crosstab_plot.div(crosstab_plot['Total'], axis=0)

isBestSeller                      False  True  BestSeller_Proportion
category                                                            
Grocery                            9008   556               0.058135
Smart Home Security & Lighting       98     6               0.057692
Health & Personal Care             9017   552               0.057686
Mobile Phone Accessories            248    11               0.042471
Power & Hand Tools                 8353   306               0.035339
...                                 ...   ...                    ...
CD, Disc & Tape Players            8798     0               0.000000
General Music-Making Accessories    259     0               0.000000
Snowboard Boots                    1449     0               0.000000
Boxes & Organisers                  911     0               0.000000
eBook Readers & Accessories         246     0               0.000000

[296 rows x 3 columns]
36540.20270061387 0.0
0.08646706664522864


In [27]:
# Plot the normalized crosstab
crosstab_plot_norm[[False, True]].plot(kind='bar', stacked=True, figsize=(12, 8), color=['skyblue', 'orange'])
plt.xlabel('Product Category')
plt.ylabel('Proportion')
plt.title('Proportion of Best-Seller Status Across Product Categories')
plt.legend(['Not Best Seller', 'Best Seller'], loc='upper right')
plt.xticks(rotation=45)
plt.show()

sorted_crosstab, chi2, p, cramers_v

ValueError: Item wrong length 2 instead of 296.

In [14]:
# Plot the normalized crosstab
crosstab_plot_norm[[False, True]].plot(kind='bar', stacked=True, figsize=(12, 8), color=['skyblue', 'orange'])
plt.xlabel('Product Category')
plt.ylabel('Proportion')
plt.title('Proportion of Best-Seller Status Across Product Categories')
plt.legend(['Not Best Seller', 'Best Seller'], loc='upper right')
plt.xticks(rotation=45)
plt.show()

ValueError: Item wrong length 2 instead of 296.