In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import pytz
import os
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Result-page URLs for the plain "cereal" query, pages 1 through 50.
cereal_urls = [
    "".join(["http://www.walmart.com/search/?query=cereal&page=", page, "&cat_id=0"])
    for page in map(str, range(1, 51))
]
# By observation, results for "cold cereal" stop making sense after page 33,
# so only pages 1 through 33 are requested and the rest is discarded.
cold_cereal_urls = [
    "".join(["http://www.walmart.com/search/?query=cold+cereal&page=", page, "&cat_id=0"])
    for page in map(str, range(1, 34))
]

In [3]:
# for each item, return a [brand, star, numReviews] list
# if the item's brand is not one of the tracked brands, label it "Other"
def parseSoup(soup):
    """Extract [brand, star, numReviews] from one search-result tile.

    Parameters
    ----------
    soup : object exposing ``find(name, attrs)``
        The parsed tile. ``find`` must return either ``None`` or an object
        with a ``text`` attribute.

    Returns
    -------
    list of str
        ``[brand, star, numReviews]``. ``brand`` is the first word of the
        product title that is one of the tracked brands, or ``"Other"`` if
        none matches or the title is missing. ``star`` and ``numReviews`` are
        ``"NA"`` when the tile has no rating block or the block has fewer
        than three whitespace-separated tokens.
    """
    tracked_brands = ("Cheerios", "Kashi", "Kellogg's", "Post")

    title_tag = soup.find("a", {"class": "js-product-title"})
    # A tile without a title link used to raise AttributeError; treat it as "Other".
    title_words = title_tag.text.split() if title_tag is not None else []
    brand = next((word for word in title_words if word in tracked_brands), "Other")

    rating_tag = soup.find("div", {"class": "stars stars-small tile-row"})
    rating_tokens = rating_tag.text.split() if rating_tag is not None else []
    if len(rating_tokens) < 3:
        # Missing or malformed rating block: use the same placeholder shape
        # as a real one so the indexing below stays uniform.
        rating_tokens = ["NA", "stars", "(NA)", "ratings"]

    # The third token is the review count with one wrapping character on each
    # side (presumably parentheses, as in the "(NA)" placeholder); strip them.
    return [brand, rating_tokens[0], rating_tokens[2][1:-1]]

In [4]:
# Create two lists of all items on Walmart.com using the search terms
# "cold cereal" and "cereal". Each sublist is of the format
# [brand, star, numReviews], in the order the tiles appear across the pages.
def _scrape_items(urls, timeout=30):
    """Fetch every page in ``urls`` and parse each product tile on it.

    Parameters
    ----------
    urls : iterable of str
        Search-result page URLs, in ranking order.
    timeout : float, optional
        Seconds to wait on each request before ``requests`` raises.

    Returns
    -------
    list of list
        One ``parseSoup`` result per tile, in page order then tile order.
    """
    items = []
    for url in urls:
        # Without a timeout a stalled connection would hang the notebook forever.
        response = requests.get(url, timeout=timeout)
        page_soup = BeautifulSoup(response.content, "lxml")
        for tile in page_soup.findAll("div", {"class": "tile-content-wrapper"}):
            items.append(parseSoup(tile))
    return items


# Timestamp of this scrape, truncated to whole seconds.
# NOTE: the name shadows the stdlib `time` module; it is kept because the
# cell that writes the CSV reads it.
time = datetime.now().replace(microsecond=0)
all_cold_cereal_list = _scrape_items(cold_cereal_urls)
all_cereal_list = _scrape_items(cereal_urls)
        
# Count, per brand, how many results appear overall and within the first three
# positions of each search.
def _tally_brands(items):
    """Return ``(total, top3)`` brand-count dicts for a ranked item list.

    ``items`` is a list of [brand, star, numReviews] entries; only the brand
    (position 0) is read. ``top3`` counts just the first three entries.
    """
    total = {"Kellogg's":0,"Post":0,"Kashi":0,"Cheerios":0,"Other":0}
    top3 = {"Kellogg's":0,"Post":0,"Kashi":0,"Cheerios":0,"Other":0}
    for position, entry in enumerate(items):
        brand = entry[0]
        total[brand] += 1
        if position < 3:
            top3[brand] += 1
    return total, top3


cereal_total, cereal_top3 = _tally_brands(all_cereal_list)
cold_cereal_total, cold_cereal_top3 = _tally_brands(all_cold_cereal_list)


In [5]:
# Update the dataset: append one row per (search term, scope) pair for this run.
csv_path = "Cereal_Data.csv"
# Write the header only when starting a new file. An existing but empty file
# also needs one, otherwise the data would never get column names.
add_header = (not os.path.exists(csv_path)) or os.path.getsize(csv_path) == 0

brand_columns = ["Cheerios", "Kashi", "Kellogg's", "Other", "Post"]
idx = ["(cold_cereal, top3)", "(cold_cereal, all)", "(cereal, top3)", "(cereal, all)"]
# Same order as `idx`.
counts_by_row = [cold_cereal_top3, cold_cereal_total, cereal_top3, cereal_total]

# Look each count up by brand name so a value can never land in the wrong
# column. The old code relied on sorted dict items happening to match the
# hand-written column order.
df = pd.DataFrame(
    [[counts[brand] for brand in brand_columns] for counts in counts_by_row],
    index=idx,
    columns=brand_columns,
)
df["Time"] = time

# May encounter formatting issues if the file has been modified outside this program.
# Letting pandas open the file in append mode avoids the doubled line endings
# a plain text-mode handle can produce on Windows.
df.to_csv(csv_path, mode="a", header=add_header)


In [6]:
# Assume the correlation between ranking and star is independent of the search
# word, so data for the two search terms is combined (cold cereal first).
# Rankings are 1-based positions within each search; unrated items ("NA") are skipped.
ranking_and_star = [
    [position, entry[1]]
    for result_list in (all_cold_cereal_list, all_cereal_list)
    for position, entry in enumerate(result_list, 1)
    if entry[1] != "NA"
]
# Split the pairs into two parallel lists.
ranking_for_star = [pair[0] for pair in ranking_and_star]
star = [pair[1] for pair in ranking_and_star]

In [20]:
# Can explore the ranking correlation within any ranking range, e.g. top 100, top 200...
# For now, assume the range is all data.
if not ranking_for_star:
    # min()/max() would fail with an opaque message on an empty list.
    raise ValueError("no rated items were scraped; cannot compute ranking cutoffs")
minVal = min(ranking_for_star)
maxVal = max(ranking_for_star)
intervals = 5
# Bucket width is derived from `intervals` (it was a hard-coded 5), so the
# number of buckets can be changed in one place.
distance = (maxVal - minVal) // intervals
# cutoffs[0] is the lower bound; cutoffs[j] is the inclusive upper bound of
# bucket j. The last bound is pinned to maxVal so every ranking fits a bucket.
cutoffs = [minVal + i * distance for i in range(intervals)]
cutoffs.append(maxVal)

In [69]:
# Group star ratings by coarse ranking bucket (1 .. intervals).
rough_ranking_for_star_dict = {bucket: [] for bucket in range(1, intervals + 1)}
for rank, stars in zip(ranking_for_star, star):
    # Pick the first bucket whose upper cutoff is at or above this ranking.
    target = next(
        (bucket for bucket in range(1, intervals + 1) if rank <= cutoffs[bucket]),
        None,
    )
    if target is not None:
        rough_ranking_for_star_dict[target].append(stars)
            

In [81]:
# Bubble chart: one marker per (ranking bucket, star value), sized by how many
# items in that bucket carry that star value.
def _as_star_value(raw):
    """Convert a scraped star string to a float, or None if it is not numeric."""
    try:
        return float(raw)
    except (TypeError, ValueError):
        return None


all_keys = sorted(rough_ranking_for_star_dict.keys())

# Star values from 0.0 to 5.0 in half-star steps.
possible_stars = [i * 0.5 for i in range(0, 11)]

indices = [(i, j) for i in all_keys for j in possible_stars]

# Compare numerically rather than via str(float): the old string match missed
# ratings scraped as "4" because str(4.0) is "4.0".
star_values_by_bucket = {
    key: [_as_star_value(raw) for raw in rough_ranking_for_star_dict[key]]
    for key in all_keys
}
data = [star_values_by_bucket[bucket].count(stars) for bucket, stars in indices]
# NOTE(review): `radius` is not used by the scatter call below, which sizes
# markers with `data`. Kept in case a later cell reads it; confirm and remove.
radius = [np.pi * i for i in data]

x, y = zip(*indices)

fig, ax = plt.subplots(figsize=[5, 5])
fig.suptitle("rough ranking and stars", fontsize=14, fontweight="bold")
ax.set_xlabel("ranking bucket (1 = top of results)")
ax.set_ylabel("stars")
# Colours only distinguish markers; a fixed seed makes the figure reproducible.
colors = np.random.RandomState(0).rand(len(x))
ax.scatter(x, y, s=data, c=colors, alpha=0.5);