# **A. Import Libraries**

In [2]:
from selenium import webdriver
from bs4 import BeautifulSoup
from scipy import stats
import pandas as pd
import numpy as np
import requests
import time

# **B. Web Scraping**

In [5]:
driver = webdriver.Chrome()

# variable definition: one list per output column, filled in lock-step
# (exactly one value is appended to every list for each product card)
list_nama_produk = []
list_harga_produk = []
list_penjual = []
list_kota = []
list_barang_terjual = []
list_rating = []


def _text_or_none(box, tag, css_class):
    """Return the text of the first `tag` with class `css_class` inside `box`.

    Returns None when the card has no such element, so the column lists stay
    aligned. Any other error is allowed to propagate instead of being hidden
    by a bare `except`.
    """
    element = box.find(tag, {'class': css_class})
    return element.get_text() if element is not None else None


try:
    # Walk through the Tokopedia search result pages 1-10
    for page in range(1, 11):

        # calling webdriver with link (url)
        url = f'https://www.tokopedia.com/search?navsource=&page={page}&q=seblak&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='
        driver.get(url)

        # scroll vertically down the page in 5px steps
        # (presumably so lazily rendered cards are loaded - confirm against the site)
        total_height = int(driver.execute_script("return document.body.scrollHeight"))
        # `offset` is deliberately a different name from the page counter
        for offset in range(1, total_height, 5):
            driver.execute_script("window.scrollTo(0, {});".format(offset))

        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        boxes = soup.find_all('div', {'id':"" , 'class':"pcv3__container css-1izdl9e"})

        for box in boxes:
            # scraping item names
            list_nama_produk.append(_text_or_none(box, 'div', "prd_link-product-name css-3um8ox"))
            # scraping product prices
            list_harga_produk.append(_text_or_none(box, 'div', "prd_link-product-price css-h66vau"))
            # scraping shop name
            list_penjual.append(_text_or_none(box, 'span', "prd_link-shop-name css-1kdc32b flip"))
            # scraping city
            list_kota.append(_text_or_none(box, 'span', "prd_link-shop-loc css-1kdc32b flip"))
            # scraping of sold products
            list_barang_terjual.append(_text_or_none(box, 'span', "prd_label-integrity css-1sgek4h"))
            # scraping product ratings
            list_rating.append(_text_or_none(box, 'span', "prd_rating-average-text css-t70v7i"))

        # sleep for 3 seconds between pages
        time.sleep(3)
finally:
    # Always release the browser process, even if a page fails to load
    driver.quit()



In [6]:
# Assemble the scraped lists into one table, one column per list
kolom_hasil_scraping = {
    'nama produk': list_nama_produk,
    'harga': list_harga_produk,
    'penjual': list_penjual,
    'kota': list_kota,
    'terjual': list_barang_terjual,
    'rating': list_rating,
}
data = pd.DataFrame(kolom_hasil_scraping)
data.head()

In [7]:
# Persist the scraped table as CSV; the row index is left out of the file
data.to_csv(path_or_buf="data_tokopedia.csv", index=False)

# **C. Data Loading**

In [28]:
# Read the previously scraped CSV back in and preview the first rows
dataset = pd.read_csv(filepath_or_buffer="data_tokopedia.csv")
dataset.head()

Unnamed: 0,nama produk,harga,penjual,kota,terjual,rating
0,Kylafood Mix Fav Selebgram Seblak Original & B...,Rp32.500,,,40+ terjual,4.4
1,Kylafood Seblak Tulang Rawan,Rp24.500,,,8 rb+ terjual,4.9
2,Kylafood Seblak Mie Baso Sapi Play,Rp21.450,,,250+ terjual,4.9
3,Gelifood Seblak Instan Kerupuk Mawar Bumbu Ken...,Rp13.000,Lidigeli,Kab. Garut,250+ terjual,4.8
4,Seblak Instan Ceu Nthien Khas Bandung Rasana N...,Rp17.000,Central Seblak Nusantara,Tangerang Selatan,2rb+ terjual,4.9


Insight :  

Pada dataset dari produk seblak di Tokopedia terdapat kolom:
1. nama produk : nama produk yang dijual di Tokopedia
2. harga : harga dari produk seblak
3. penjual : nama toko yang menjual seblak
4. kota : kota dari toko yang menjual seblak
5. terjual : banyaknya produk terjual
6. rating : review para pelanggan terhadap produk

In [6]:
# (number of rows, number of columns) of the loaded dataset
dataset.shape

(1081, 6)

In [4]:
# Show column names, non-null counts and dtypes of the loaded dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1081 entries, 0 to 1080
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   nama produk  1081 non-null   object 
 1   harga        1081 non-null   object 
 2   penjual      1055 non-null   object 
 3   kota         1055 non-null   object 
 4   terjual      1038 non-null   object 
 5   rating       1002 non-null   float64
dtypes: float64(1), object(5)
memory usage: 50.8+ KB


**Information:**  
In the dataset of seblak products on Tokopedia there are columns:
1. nama produk : name of the product sold on Tokopedia
2. harga : the price of the seblak product
3. penjual : name of the shop that sells seblak
4. kota : the city of the shop that sells seblak
5. terjual : the number of products sold
6. rating : customer reviews of the product

**Insight:**  
The seblak product dataset from Tokopedia has 1,081 rows and 6 columns. The product name, price, seller, city and sold columns have the object data type, and the rating column has the float data type.

# **D. Handling Missing Value**

### **missing value check**

In [29]:
# Per column: does it contain at least one missing value?
dataset.isna().any()

nama produk    False
harga          False
penjual         True
kota            True
terjual         True
rating          True
dtype: bool

In [30]:
# Per column: number of missing values
dataset.isna().sum()

nama produk     0
harga           0
penjual        26
kota           26
terjual        43
rating         79
dtype: int64

Insight :  

There are null values in this dataset, specifically in the columns penjual, kota, terjual and rating.

### **cleaning data**

In [31]:
# Work on a copy so the loaded `dataset` stays untouched,
# and discard every row that has a missing value in any column
df = dataset.copy().dropna()

In [32]:
# Drop fully duplicated rows, keeping the first occurrence of each.
# The result is reassigned rather than using inplace=True: the in-place form
# returns None, so the old `drop = ...` variable was always None.
df = df.drop_duplicates(keep='first')

In [36]:
# clean value data
# Every replacement is a literal (regex=False), so '.' and '+' are never read
# as regex metacharacters, whatever the pandas default for `regex` is.
# The result is assigned back to the column instead of calling
# df[col].replace(..., inplace=True), which mutates a temporary Series and
# does not write back when pandas Copy-on-Write is enabled.
# astype(str) makes the cell safe to re-run after the columns have already
# been converted to integers.

# Price, e.g. "Rp32.500" -> "32500"
df['harga'] = (
    df['harga']
    .astype(str)
    .str.replace('.', '', regex=False)    # thousands separator
    .str.replace('Rp', '', regex=False)   # currency prefix
    .str.replace('rb', '000', regex=False)
)

# Units sold, e.g. "8 rb+ terjual" -> "8000", "250+ terjual" -> "250"
df['terjual'] = (
    df['terjual']
    .astype(str)
    .str.replace('terjual', '', regex=False)
    .str.replace('+', '', regex=False)
    .str.replace('rb', '000', regex=False)   # "rb" is replaced by three zeros
    .str.replace(' ', '', regex=False)
)


In [38]:
# Convert the cleaned numeric strings to integers, column by column
for kolom_angka in ('harga', 'terjual'):
    df[kolom_angka] = df[kolom_angka].astype(int)

# Confirm the resulting dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 649 entries, 3 to 1050
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   nama produk  649 non-null    object 
 1   harga        649 non-null    int32  
 2   penjual      649 non-null    object 
 3   kota         649 non-null    object 
 4   terjual      649 non-null    int32  
 5   rating       649 non-null    float64
dtypes: float64(1), int32(2), object(3)
memory usage: 30.4+ KB


Insight :  
In this dataset, handling missing values, handling duplicated data, handling value data and changing data types have been carried out

# **E. Business Understanding**

### **SMART**

1. Specific: Increase income by becoming a dropshipper for seblak products on Tokopedia.
2. Measurable: Get income as a dropshipper of 7%.
3. Achievable: Retrieve data using web scraping and analyze product sales.
4. Relevant: Cheap prices of goods can increase sales of goods. And if sales of goods increase, it will affect a good rating.
5. Time-bound: increased income can be seen in the next 1 month.

# **F. Statistic analysis**

###  **Mean, median, standard deviation, skewness, and kurtosis**

In [41]:
# Central tendency of the price column: mean and median (the 0.5 quantile)
mean_harga = df['harga'].mean()
median_harga = df['harga'].quantile(0.5)

# Spread and shape of the price column: standard deviation, skewness, kurtosis
std_harga = df['harga'].std()
skew_harga = df['harga'].skew()
kurtosis_harga = df['harga'].kurt()

# Report: central tendency first, a blank line, then spread and shape
print(
    f'Mean with outliers is   : {mean_harga}',
    f'Median with outliers is : {median_harga}',
    '',
    f'Standard Deviation with outliers is : {std_harga}',
    f'Skewness with outliers is           : {skew_harga}',
    f'Kurtosis with outliers is           : {kurtosis_harga}',
    sep='\n',
)


Mean with outliers is   : 23244.26194144838
Median with outliers is : 15000.0

Standard Deviation with outliers is : 25813.273793844386
Skewness with outliers is           : 3.271526104170177
Kurtosis with outliers is           : 14.212656397617856


Insight:  

The price column contains some very high values that pull the mean above the median. The distribution is asymmetrical and strongly skewed to the right (positive skewness). It is also sharply peaked with heavy tails (leptokurtic distribution). The standard deviation is larger than the mean, so prices are widely spread around the average, meaning the data varies a lot.

This shows that the price data for seblak products on Tokopedia has an asymmetrical distribution and contains extreme values, i.e. many outliers.

In [42]:
# Central tendency of the units-sold column: mean and median (the 0.5 quantile)
mean_terjual = df['terjual'].mean()
median_terjual = df['terjual'].quantile(0.5)

# Spread and shape of the units-sold column: standard deviation, skewness, kurtosis
std_terjual = df['terjual'].std()
skew_terjual = df['terjual'].skew()
kurtosis_terjual = df['terjual'].kurt()

# Report: central tendency first, a blank line, then spread and shape
print(
    f'Mean with outliers is   : {mean_terjual}',
    f'Median with outliers is : {median_terjual}',
    '',
    f'Standard Deviation with outliers is : {std_terjual}',
    f'Skewness with outliers is           : {skew_terjual}',
    f'Kurtosis with outliers is           : {kurtosis_terjual}',
    sep='\n',
)


Mean with outliers is   : 360.4191063174114
Median with outliers is : 70.0

Standard Deviation with outliers is : 1140.7160492177843
Skewness with outliers is           : 6.449114208839196
Kurtosis with outliers is           : 47.18210909106413


Insight:

The sold column contains some very high values that pull the mean far above the median. The distribution is asymmetrical and strongly skewed to the right (positive skewness). It is also sharply peaked with heavy tails (leptokurtic distribution). The standard deviation is several times the mean, so the values are widely spread around the average, meaning the data varies a lot.

This shows that the sales data for seblak products on Tokopedia has an asymmetrical distribution and contains extreme values, i.e. many outliers.

In [43]:
# Central tendency of the rating column: mean and median (the 0.5 quantile)
mean_rating = df['rating'].mean()
median_rating = df['rating'].quantile(0.5)

# Spread and shape of the rating column: standard deviation, skewness, kurtosis
std_rating = df['rating'].std()
skew_rating = df['rating'].skew()
kurtosis_rating = df['rating'].kurt()

# Report: central tendency first, a blank line, then spread and shape
print(
    f'Mean with outliers is   : {mean_rating}',
    f'Median with outliers is : {median_rating}',
    '',
    f'Standard Deviation with outliers is : {std_rating}',
    f'Skewness with outliers is           : {skew_rating}',
    f'Kurtosis with outliers is           : {kurtosis_rating}',
    sep='\n',
)


Mean with outliers is   : 4.871494607087827
Median with outliers is : 4.9

Standard Deviation with outliers is : 0.2021590645845767
Skewness with outliers is           : -5.9456674540171734
Kurtosis with outliers is           : 66.62077791437515


Insight:

In the rating column, the mean and median are quite close, because most ratings are clustered near the top of the scale. However, the skewness is strongly negative, so the distribution is heavily skewed to the left. The kurtosis is also very high (leptokurtic distribution), which indicates a sharp peak with a few extreme low values. The standard deviation is small relative to the mean, meaning the data varies little.

This shows that the rating data for seblak products on Tokopedia is concentrated around 4.9, with a small number of much lower ratings forming a long left tail, so it is not symmetrical despite the similar mean and median.

### **minimum and maximum potential income**

In [45]:
# 95% confidence interval for the mean revenue per product listing.
# Revenue of a listing = price x units sold.
# Both columns are widened to int64 first: astype(int) yields int32 on some
# platforms (see the df.info() output), and price x sold can exceed the int32
# range and wrap around silently.
pendapatan = df["harga"].astype("int64") * df["terjual"].astype("int64")
std_pendapatan = pendapatan.std()

# Normal-approximation interval around the sample mean, using the
# standard error (sample std / sqrt(N)) as the scale
N = len(df)
low, up = stats.norm.interval(0.95,loc=pendapatan.mean(),scale=std_pendapatan/np.sqrt(N))
print('Lower value:',low)
print('Upper value:',up)

Lower value: 4708428.037375594
Upper value: 8624135.64213134


In [46]:
# Apply the 7% dropshipper commission to both ends of the revenue interval
komisi_low, komisi_upper = (batas * 7/100 for batas in (low, up))

print("Minimum potential income 7%:", komisi_low)
print("Maximum potential income 7%:", komisi_upper)

Minimum potential income 7%: 329589.9626162916
Maximum potential income 7%: 603689.4949491938


Insight:

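Within 1 month (assuming the scraped "terjual" totals approximate one month of sales), based on the 95% confidence interval for the average revenue per product:
- The lower bound of revenue from selling seblak is IDR 4,708,428, which at a 7% commission gives a minimum potential income of IDR 329,589.
- The upper bound of revenue from selling seblak is IDR 8,624,135, which at a 7% commission gives a maximum potential income of IDR 603,689.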

### **Hypothesis 1**
**H0** = the price of goods in Jabodetabek and outside Jabodetabek is the same  
**H1** = prices of goods in Jabodetabek and outside Jabodetabek are different

In [20]:
# City labels counted as Jabodetabek. The list is defined once, replacing a
# 14-term `or` chain that was duplicated in two loops. The empty string the
# old chain also accepted is dropped on purpose, so a blank city is no longer
# classified as Jabodetabek.
JABODETABEK_CITIES = {
    "Jakarta Barat", "Jakarta Selatan", "Jakarta Timur", "Jakarta Pusat", "Jakarta Utara",
    "Bogor", "Kab. Bogor",
    "Depok",
    "Tangerang", "Tangerang Selatan", "Kab. Tangerang",
    "Bekasi", "Kab. Bekasi",
}

# One vectorised membership test; the two flags are exact complements
is_jabodetabek = df["kota"].isin(JABODETABEK_CITIES)

# 1/0 indicator lists, in the same row order as df
Jabodetabek = is_jabodetabek.astype(int).tolist()
Non_Jabodetabek = (~is_jabodetabek).astype(int).tolist()

# Enter the Jabodetabek and Non_Jabodetabek variables into the dataframe
df["jabodetabek"] = Jabodetabek
df["non_jabodetabek"] = Non_Jabodetabek

In [21]:
# Average product price for each region, selected through the 1/0 flag columns
harga_jabodetabek = df.loc[df['jabodetabek'] == 1, 'harga']
harga_non_jabodetabek = df.loc[df['non_jabodetabek'] == 1, 'harga']

print('Rata-rata harga barang di Jabodetabek =', harga_jabodetabek.mean())
print('Rata-rata harga barang di Non-Jabodetabek =', harga_non_jabodetabek.mean())

Rata-rata harga barang di Jabodetabek = 23097.238554216867
Rata-rata harga barang di Non-Jabodetabek = 23505.008547008547


In [22]:
# Two-sample independent t-test on price: Jabodetabek vs. non-Jabodetabek listings
sampel_jabodetabek = df.loc[df['jabodetabek'] == 1, 'harga']
sampel_non_jabodetabek = df.loc[df['non_jabodetabek'] == 1, 'harga']

t_stat, p_val = stats.ttest_ind(sampel_jabodetabek, sampel_non_jabodetabek)
print('T-Statistic:', t_stat)
print('P-value:', p_val)

T-Statistic: -0.19308963186200662
P-value: 0.8469493692653455


Insight:

The p-value obtained (0.85) is greater than the significance level of 0.05, so we fail to reject the null hypothesis (H0). There is not enough evidence to support a difference in the prices of goods in Jabodetabek and outside Jabodetabek.

This means that the average prices of goods in Jabodetabek and outside Jabodetabek are not significantly different.


### **Hypothesis 2**

Buyers are more likely to like products that are cheap?

In [25]:
# Correlation between price and units sold, measured three ways:
# Pearson (linear), Spearman and Kendall tau (rank-based)
kolom_harga, kolom_terjual = df['harga'], df['terjual']

corr_r, pval_p = stats.pearsonr(kolom_harga, kolom_terjual)
corr_rho, pval_s = stats.spearmanr(kolom_harga, kolom_terjual)
corr_tau, pval_k = stats.kendalltau(kolom_harga, kolom_terjual)

print(
    f"r-correlation: {corr_r:.2f}, p-value: {pval_p}",
    f"rho-correlation: {corr_rho:.2f}, p-value: {pval_s}",
    f"tau-correlation: {corr_tau:.2f}, p-value: {pval_k}",
    sep='\n',
)

r-correlation: -0.06, p-value: 0.13851856759815787
rho-correlation: -0.17, p-value: 9.725003313031936e-06
tau-correlation: -0.12, p-value: 8.138656729922831e-06


Insight:

The Pearson correlation shows a very weak negative linear relationship between price and liking (amount sold), and its p-value is greater than 0.05, so this linear relationship is not significant.

However, the Spearman and Kendall Tau correlations show a weak but statistically significant negative relationship (p-value less than 0.05). In this case, there is evidence that cheaper products tend to sell more, although the effect is small.

In [24]:
# Correlation between price and rating, measured three ways:
# Pearson (linear), Spearman and Kendall tau (rank-based)
kolom_harga, kolom_rating = df['harga'], df['rating']

corr_r, pval_p = stats.pearsonr(kolom_harga, kolom_rating)
corr_rho, pval_s = stats.spearmanr(kolom_harga, kolom_rating)
corr_tau, pval_k = stats.kendalltau(kolom_harga, kolom_rating)

print(
    f"r-correlation: {corr_r:.2f}, p-value: {pval_p}",
    f"rho-correlation: {corr_rho:.2f}, p-value: {pval_s}",
    f"tau-correlation: {corr_tau:.2f}, p-value: {pval_k}",
    sep='\n',
)

r-correlation: 0.07, p-value: 0.0716081215248508
rho-correlation: 0.10, p-value: 0.009431285238520649
tau-correlation: 0.08, p-value: 0.008896670459469022


Insight:

The three correlations show only a very weak positive relationship between price and liking (rating). The Pearson correlation is not significant (p-value greater than 0.05), while the Spearman and Kendall Tau correlations are significant (p-value less than 0.05). A positive correlation means that higher-priced products tend to have slightly higher ratings, not cheaper ones.

It can be concluded that the rating data does not provide evidence that people prefer cheaper products; if anything, ratings rise slightly with price.

# **G. Conclusion**

1. You can increase your income by dropshipping seblak products on Tokopedia.
2. You can sell seblak in Jabodetabek and outside Jabodetabek based on the same item price.
3. You can also sell at a low price: cheaper products tend to sell somewhat more (number of items sold), although the ratings do not show a preference for cheaper products.
4. In 1 month the potential income you will get is 7%, namely from IDR 329,589 to IDR 603,689