In [1]:
import string
import pandas as pd
import re
import os
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from stemming.porter2 import stem
import networkx
import numpy as np
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.pylab as plt
from IPython.display import Image
import community as community_louvain
from collections import Counter
from itertools import chain
#pyo.init_notebook_mode()
import plotly.io as pio
pio.renderers.default = 'browser'

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/howechen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
if os.getcwd().endswith('notebooks'):
    os.chdir('..')
print('current woring directory:', os.getcwd())

current woring directory: /Users/howechen/Project/ntu_sd6103_individual_assignment/amazon_product_db


In [4]:
raw_data_path = '../ignore/data/'
data_path = os.path.join(raw_data_path, 'amazon-meta.txt')

In [5]:
df = open (data_path, 'r', encoding='utf-8', errors= 'ignore')

## 数据集描述
数据是 2006 年夏天通过抓取亚马逊网站收集的。数据集提供了 548,552 种不同产品的产品元数据和评论元数据信息。数据集中的每个产品都有以下信息：

- 标题
- 销售排名
- 类似产品列表（与当前产品共同购买的产品）
- 详细的产品分类
- 产品评论：时间、客户、评分、投票数、认为评论有用的人数

数据格式为:

- ID： 产品编号（编号 0、......、548551）
- ASIN：亚马逊标准识别码： 亚马逊标准识别码是亚马逊网站分配的用于产品识别的 10 个字符的字母数字唯一标识符。
- 标题： 产品名称/标题
- 组： 产品组可以是图书、DVD、视频或音乐
- 销售排名：亚马逊销售排名代表产品与其主要类别中的其他产品相比的销售情况。排名越低，说明产品的销售情况越好。
- 类似： 共同购买产品的 ASIN，例如购买 X 的人也会购买 Y
- 类别： 产品所属类别在产品类别层次结构中的位置（用 | 分隔，类别 ID 在 [...] 中）
- 评论： 产品评论信息，如评论总数、平均评分和单个客户评论信息，包括时间、用户 ID、评分、评论总票数、有用性总票数（表示有多少人认为该评论有用）。

## 数据预处理

In [4]:
# Initialize a nested product dictionary that will hold cleaned up amazon product data. 
amazonProducts= {}

在进入网络分析之前，需要进行一些预处理来读取文件，并将 ASIN 作为关键字，将其他数据作为与 ASIN 相关的元数据。

- ID、ASIN、标题、销售排名、评论总数和平均评分与上述相同。
- 与 ASIN 相关的所有类别都会被串联起来，然后进行文本预处理：小写、词干、去除数字/标点符号、去除停止词、只保留唯一词。
- 类似 "字段中的共同购买的 ASIN 将被过滤，只保留与之相关的元数据的 ASIN。

In [5]:
# Read the data from the Amazon file and fill the amazonProducts nested dictionary
(Id, ASIN, Title, Categories, Group, Copurchased, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff) = ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0)

counter = 0
for line in df:
    line = line.strip()
    if(line.startswith("Id")): # a product block started
        Id = line[3:].strip()
    elif(line.startswith("ASIN")):
        ASIN = line[5:].strip()
    elif(line.startswith("title")):
        Title = line[6:].strip()
        Title = ' '.join(Title.split())
    elif(line.startswith("group")):
        Group = line[6:].strip()
    elif(line.startswith("salesrank")):
        SalesRank = line[10:].strip()
    elif(line.startswith("similar")):
        ls = line.split()
        Copurchased = ' '.join([c for c in ls[2:]])
    elif(line.startswith("categories")):
        ls = line.split()
        # print(ls)
        Categories = ' '.join((df.readline()).lower() for i in range(int(ls[1].strip())))
        # print(Categories)
        Categories = re.compile('[%s]' % re.escape(string.digits+string.punctuation)).sub(' ',Categories)
        # print(Categories)
        Categories = ' '.join(set(Categories.split())-set(stopwords.words("english")))
        # print(Categories)
        Categories = ' '.join(stem(word) for word in Categories.split())
        # print(Categories)
    elif(line.startswith("reviews")):
        ls = line.split()
        # print(ls)
        TotalReviews = ls[2].strip()
        AvgRating = ls[7].strip() # a product block ended
    elif (line==""): # write out fields to amazonProducts dictionary
        try:
            MetaData = {}
            if (ASIN != ""):
                amazonProducts[ASIN] = MetaData
            MetaData['Id'] = Id
            MetaData['Title'] = Title
            MetaData['Categories'] = ' '.join(set(Categories.split()))
            MetaData['Group'] = Group
            MetaData['Copurchased'] = Copurchased
            MetaData['SalesRank'] = int(SalesRank)
            MetaData['TotalReviews'] = int(TotalReviews)
            MetaData['AvgRating'] = float(AvgRating)
            MetaData['DegreeCentrality'] = DegreeCentrality
            MetaData['ClusteringCoeff'] = ClusteringCoeff
        except NameError:
            continue
        (Id, ASIN, Title, Categories, Group, Copurchased, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff) = ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0)
        counter += 1
    # if counter == 100:
    #     break
df.close

<function TextIOWrapper.close()>

该平台的目标是提供最大量的各种产品，同时让顾客在购买后对产品进行评论。然而，该平台于 1994 年作为一家在线书店开始运营。因此，再加上需要减少网络中可能存在的节点数量，我们决定只关注图书类产品。为此，下一步将是过滤亚马逊产品字典，使其只包含 Group=Book 并将其写入 amazonBooks 字典。

In [13]:
#create book specific dictionary exclusively for books
amazonBooks = {}
for asin,metadata in amazonProducts.items():
    if (metadata['Group']=='Book'):
        amazonBooks[asin]=amazonProducts[asin]
        
#remove any copurchased items from copurchase list. If we don't have metadata associated with it
for asin, metadata in amazonBooks.items():
    amazonBooks[asin]['Copurchased']= ' '.join([cp for cp in metadata['Copurchased'].split() if cp in amazonBooks.keys()])

使用亚马逊图书字典中的共同购买数据创建共同购买图结构如下：

- 节点：是 ASIN
- 边：如果两个 ASIN 共同购买，则存在于两个节点之间
- 边缘权重：基于类别相似性
- 相似度：这是对共同购买的任意两个 ASIN 之间的度量，可以用连接节点类别之间的共同词数除以两个连接节点类别中的总词数来计算。相似度范围从 0（最不相似）到 1（最相似）。

In [15]:
#create a product copurchase graph for analysis
#the graph nodes are product ASINs, the graph edge exists if two products were copurchased, with edge weight being a measure of category similarity between ASINs
copurchaseGraph = networkx.Graph()
for asin, metadata in amazonBooks.items():
    copurchaseGraph.add_node(asin)
    for a in metadata ['Copurchased'].split():
        copurchaseGraph.add_node(a.strip())
        similarity= 0
        n1= set((amazonBooks[asin]['Categories']).split())
        n2= set((amazonBooks[a]['Categories']).split())
        n1In2 = n1 & n2 #intersection: number of words that are common between categories of connected nodes
        n1Un2 = n1 | n2 #union: total number of words in both categories of connected nodes
        if (len(n1Un2)) > 0:
            similarity = round (len(n1In2)/len(n1Un2), 2)
        copurchaseGraph.add_edge(asin, a.strip(), weight = similarity)

在亚马逊图书字典中添加每个 ASIN 节点的图相关度量：度中心性和聚类系数。

In [16]:
# Get degree centrality and clustering coefficients of each node and add it to amazonBooks metadata.
dc = networkx.degree(copurchaseGraph)
for asin in networkx.nodes(copurchaseGraph):
    metadata = amazonBooks[asin]
    metadata['DegreeCentrality'] = int(dc[asin])
    ego = networkx.ego_graph(copurchaseGraph, asin, radius = 1)
    metadata['ClusteringCoeff'] = round(networkx.average_clustering(ego), 2)
    amazonBooks[asin] = metadata

现在，将亚马逊图书数据写入 amazon-books.txt 文件，并将共购图数据写入 amazon-books-copurchase.edgelist 文件。

In [17]:
# Write amazonBooks data to file
df1 = open('/Users/howechen/Project/eCommerce_DA/amazon-books.txt', 'w', encoding = 'utf-8', errors = 'ignore')
df1.write('Id\t' + 'ASIN\t' + 'Title\t'+
         'Categories\t' + 'Group\t' + 'Copurchased\t'+
         'SalesRank\t' + 'TotalReviews\t' + 'AvgRating\t'+
         'DegreeCentrality\t' + 'ClusteringCoeff\n')
for asin, metadata in amazonBooks.items():
    df1.write(metadata['Id'] + '\t' + 
              asin + '\t' +
              metadata['Title'] + '\t' +
              metadata['Categories'] + '\t' +
              metadata['Group'] + '\t' +
              metadata['Copurchased'] +'\t' + 
              str(metadata['SalesRank']) + '\t' +
              str(metadata['TotalReviews']) + '\t' +
              str(metadata['AvgRating']) + '\t' +
              str(metadata['DegreeCentrality']) + '\t' +
              str(metadata['ClusteringCoeff']) + '\n')
df1.close()

# Write copurchaseGraph to file
df1 = open('/Users/howechen/Project/eCommerce_DA/amazon-books-copurchase.edgelist', 'wb')
networkx.write_weighted_edgelist(copurchaseGraph, df1)
df1.close()

## 绘图

In [18]:
df2= open('amazon-books-copurchase.edgelist')
copurchaseGraph = nx.read_weighted_edgelist(df2)
df2.close()

In [19]:
def degree_rank(net):
    degree_sequence = sorted((d for n, d in net.degree()), reverse=True)
    df = pd.DataFrame(degree_sequence, columns = ['Degree'])
    fig = px.scatter(df, y = 'Degree')
    fig.update_layout(xaxis_title="Rank")
    fig.update_layout({
        'plot_bgcolor' : 'rgba(0, 0, 0, 0)',
        'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    })
    return fig
degree_rank(copurchaseGraph)

In [20]:
def degree_hist(net, kind = None, group = False):
    if kind == 'in':  degrees = net.in_degree()
    elif kind == 'out':  degrees = net.out_degree()
    else: degrees = net.degree()
   
    df = pd.DataFrame(degrees, columns = ['Node', 'Degree'])
    nbins = None if group else len(degrees)
    fig = px.histogram(df, x="Degree", nbins = nbins )
    fig.update_layout(yaxis_title="# of Nodes", bargap=0.01)
    fig.update_layout({
        'plot_bgcolor' : 'rgba(0, 0, 0, 0)',
        'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    })
    return fig  

degree_hist(copurchaseGraph)

## Data Parser

In [6]:
df = open (data_path, 'r', encoding='utf-8', errors= 'ignore')

<!-- 
Id:   15
ASIN: 1559362022
  title: Wake Up and Smell the Coffee
  group: Book
  salesrank: 518927
  similar: 5  1559360968  1559361247  1559360828  1559361018  0743214552
  categories: 3
   |Books[283155]|Subjects[1000]|Literature & Fiction[17]|Drama[2159]|United States[2160]
   |Books[283155]|Subjects[1000]|Arts & Photography[1]|Performing Arts[521000]|Theater[2154]|General[2218]
   |Books[283155]|Subjects[1000]|Literature & Fiction[17]|Authors, A-Z[70021]|( B )[70023]|Bogosian, Eric[70116]
  reviews: total: 8  downloaded: 8  avg rating: 4
    2002-5-13  cutomer: A2IGOA66Y6O8TQ  rating: 5  votes:   3  helpful:   2
    2002-6-17  cutomer: A2OIN4AUH84KNE  rating: 5  votes:   2  helpful:   1
    2003-1-2  cutomer: A2HN382JNT1CIU  rating: 1  votes:   6  helpful:   1
    2003-6-7  cutomer: A2FDJ79LDU4O18  rating: 4  votes:   1  helpful:   1
    2003-6-27  cutomer: A39QMV9ZKRJXO5  rating: 4  votes:   1  helpful:   1
    2004-2-17  cutomer:  AUUVMSTQ1TXDI  rating: 1  votes:   2  helpful:   0
    2004-2-24  cutomer: A2C5K0QTLL9UAT  rating: 5  votes:   2  helpful:   2
    2004-10-13  cutomer:  A5XYF0Z3UH4HB  rating: 5  votes:   1  helpful:   1
    -->

In [7]:
# 函数1：提取category和category_id对应关系，并添加到category_l列表中
def extract_category_relations(line, category_l):
    parts = line.strip().split('|')[1:]  # 跳过第一个空字符串（因为每行以 | 开头）
    for part in parts:
        match = re.search(r'(.+?)\[(\d+)\]', part.strip())
        if match:
            category_name = re.sub(r'[^a-z]', '', match.group(1).lower())  # 仅保留字母的小写格式
            category_id = match.group(2)
            category_l.append({'category': category_name, 'category_id': category_id})

# 函数2：将分类序列按category_id提取为字符串格式
def extract_category_sequences(category_str):
    category_sequences = []
    lines = category_str.strip().split('\n')
    
    for line in lines:
        sequence = []
        parts = line.strip().split('|')[1:]  # 跳过第一个空字符串（因为每行以 | 开头）
        
        for part in parts:
            match = re.search(r'\[(\d+)\]', part.strip())
            if match:
                category_id = match.group(1)
                sequence.append(category_id)
        
        category_sequences.append("/".join(sequence))  # 用"/"连接每个category_id

    # 将所有分类序列用";"连接
    return ";".join(category_sequences)

# 函数3：解析评论信息并添加到 review_l 列表中
def extract_reviews(review_str, product_ASIN, review_l):
    # 分割每行评论
    lines = review_str.strip().split('\n')
    
    for line in lines:
        # 使用正则表达式提取评论中的字段
        match = re.search(
            r'(\d{4}-\d{1,2}-\d{1,2})\s+cutomer:\s+(\w+)\s+rating:\s+(\d+)\s+votes:\s+(\d+)\s+helpful:\s+(\d+)', 
            line.strip()
        )
        
        if match:
            date, customer_id, rating, votes, helpful = match.groups()
            # 创建评论字典并添加到 review_l 列表中
            review_l.append({
                'product_ASIN': product_ASIN,
                'date': date,
                'customer_id': customer_id,
                'rating': int(rating),
                'votes': int(votes),
                'helpful': int(helpful)
            })

In [8]:
amazonProducts = {}

category_l = []
review_l = []

(Id, ASIN, title, group, salesrank, similar_items_num, similar_items, Tags, category_sequence, total_reviews_number, avg_rating) = ("", "", "", "", 0, 0, "", "", "", 0, 0)

category_id_pattern = re.compile(r'[\d+]')
customer_id_pattern = re.compile(r'cutomer:\s(\s+)')
avg_rating_pattern = re.compile(r'avg rating:\s(\d+\.\d+)')
total_reviews_number_pattern = re.compile(r'total:\s(\d+)')

for line in df:
    line = line.strip()
    if line.startswith('Id'):
        Id = line[3:].strip()  # save to product_dict
    elif line.startswith('ASIN'):
        ASIN = line[5:].strip()  # save to product_dict
    elif line.startswith('title'):
        title = line[6:].strip()  # save to product_dict
    elif line.startswith('group'):
        group = line[6:].strip()
        group = group.lower()  # save to product_dict
    elif line.startswith('salesrank'):
        salesrank = int(line[10:].strip())  # save to product_dict
    elif line.startswith('similar'):
        similar_items_num = line[8:].strip()[0]    # similar_items are separated by space -> save to product_dict
        if similar_items_num != '0':
            similar_items = line[8:].strip()[2:]
        else:
            similar_items = ''
    elif line.startswith('categories'):
        ls = line.split()
        cate_records = ' '.join((df.readline()).lower() for i in range(int(ls[1].strip())))
        Tags = re.compile('[%s]' % re.escape(string.digits+string.punctuation)).sub(' ',cate_records)
        Tags = ' '.join(set(Tags.split())-set(stopwords.words("english")))
        Tags = ' '.join(stem(word) for word in Tags.split())
        extract_category_relations(cate_records, category_l)    # save to a temp list
        category_sequence = extract_category_sequences(cate_records)    # save to product dict
    elif line.startswith('reviews'):
        if avg_rating_pattern.search(line) and total_reviews_number_pattern.search(line):
            avg_rating = float(avg_rating_pattern.search(line).group(1))
            total_reviews_number = int(total_reviews_number_pattern.search(line).group(1))
            All_reviews = ' '.join((df.readline()).lower() for i in range(total_reviews_number))
            extract_reviews(All_reviews, ASIN, review_l)
        else:
            continue
    elif line == '':
        try:
            MetaData = {}
            if (ASIN != ""):
                amazonProducts[ASIN] = MetaData
            MetaData['Id'] = Id
            MetaData['Title'] = title
            MetaData['Group'] = group
            MetaData['SalesRank'] = salesrank
            MetaData['SimilarItemsNum'] = similar_items_num
            MetaData['SimilarItems'] = similar_items
            MetaData['CategorySequence'] = category_sequence
            MetaData['Tags'] = Tags
            MetaData['TotalReviewsNumber'] = total_reviews_number
            MetaData['AvgRating'] = avg_rating
        except NameError:
            continue
        (Id, ASIN, title, group, salesrank, similar_items_num, similar_items, Tags, category_sequence, total_reviews_number, avg_rating) = ("", "", "", "", 0, 0, "", "", "", 0, 0)

In [9]:
len(amazonProducts.keys()), len(category_l), len(review_l)

(540607, 12865906, 4081348)

In [10]:
amazonProducts_df = pd.DataFrame.from_dict(amazonProducts, orient='index')
amazonProducts_df.index.name = 'ASIN'
amazonProducts_df.reset_index(inplace=True)
amazonProducts_df.shape

(540607, 11)

In [11]:
amazonProducts_df.head()

Unnamed: 0,ASIN,Id,Title,Group,SalesRank,SimilarItemsNum,SimilarItems,CategorySequence,Tags,TotalReviewsNumber,AvgRating
0,771044445,0,,,0,0,,,,0,0.0
1,827229534,1,Patterns of Preaching: A Sermon Sampler,book,396585,5,0804215715 156101074X 0687023955 068707423...,283155/1000/22/12290/12360/12368;283155/1000/2...,spiritu subject clergi preach religion christi...,0,0.0
2,738700797,2,Candlemas: Feast of Flames,book,168596,5,0738700827 1567184960 1567182836 073870052...,283155/1000/22/12472/12484;283155/1000/22/1247...,spiritu earth subject wicca witchcraft religio...,12,4.5
3,486287785,3,World War II Allied Fighter Planes Trading Cards,book,1270652,0,,283155/1000/48/5126/5144,home subject garden hobbi craft general book,0,0.0
4,842328327,4,Life Application Bible Commentary: 1 and 2 Tim...,book,631289,5,0842328130 0830818138 0842330313 084232861...,283155/1000/22/12290/172810/12155/12159;283155...,bibl christian testament guid text bibl religi...,0,0.0


In [12]:
save_data_path = '../ignore/data/'

In [13]:
save_file_path = os.path.join(save_data_path, 'amazon_products_total.csv')
amazonProducts_df.to_csv(save_file_path, index=False)

### Process Category table

In [14]:
category_df = pd.DataFrame(category_l)
print(category_df.shape)
category_df = category_df.drop_duplicates()
print(category_df.shape)

(12865906, 2)
(49683, 2)


In [None]:
category_df.rename(columns={'category': 'Category', 'category_id': 'CategoryId'}, inplace=True)
category_df.head()

Unnamed: 0,Category,CategoryID
0,books,283155
1,subjects,1000
2,religionspirituality,22
3,christianity,12290
4,clergy,12360


In [21]:
category_df['Category'].nunique(), category_df['CategoryId'].nunique()

(25644, 49683)

In [22]:
for cate in category_df['Category'].unique():
    temp_df = category_df[category_df['Category'] == cate]
    if temp_df['CategoryId'].nunique() > 1:
        # print(cate, temp_df['category_id'].unique())
        dup_nums = len(temp_df['CategoryId'].unique())
        for i in range(dup_nums):
            category_df.loc[temp_df.index[i], 'Category'] = cate + '_' + str(i+1)

In [23]:
category_df['Category'].nunique(), category_df['CategoryId'].nunique()

(49683, 49683)

In [24]:
category_df

Unnamed: 0,Category,CategoryId
0,books_1,283155
1,subjects_1,1000
2,religionspirituality_1,22
3,christianity_1,12290
4,clergy,12360
...,...,...
12855926,varsidiane_2,450660
12856896,daytonlyman,456102
12857969,kermanken_2,431222
12860524,mathewscarole,435792


In [25]:
category_df = category_df[['CategoryId', 'Category']]

In [26]:
category_df = category_df.sort_values(by='CategoryId')

In [27]:
save_file_path = os.path.join(save_data_path, 'amazon_category.csv')
category_df.to_csv(save_file_path, index=False)

### Process Review Table

In [28]:
review_df = pd.DataFrame(review_l)
print(review_df.shape)

(4081348, 6)


In [29]:
review_df.rename(columns={'product_ASIN': 'ASIN',
                          'date':'Date',
                          'customer_id':'CustomerId',
                          'rating':'Rating',
                          'votes':'Votes',
                          'helpful':'Helpful'}, inplace=True)
review_df.head()

Unnamed: 0,ASIN,Date,CustomerId,Rating,Votes,Helpful
0,738700797,2001-12-16,a11nco6yte4btj,5,5,4
1,738700797,2002-1-7,a9cq3plrnir83,4,5,5
2,738700797,2002-1-24,a13sg9acz9o5im,5,8,8
3,738700797,2002-1-28,a1bdai6veymaza,5,4,4
4,738700797,2002-2-6,a2p6kawxj16234,4,16,16


In [30]:
review_df['Date'] = pd.to_datetime(review_df['Date'])

In [31]:
review_df = review_df[['ASIN', 'CustomerId', 'Date', 'Rating', 'Votes', 'Helpful']]

In [33]:
review_df['ASIN'].nunique(), review_df['CustomerId'].nunique()

(144037, 923593)

In [34]:
save_file_path = os.path.join(save_data_path, 'amazon_reviews.csv')
review_df.to_csv(save_file_path, index=False)

## Process Co-purchase Relations

In [35]:
amazonProducts_df['Group'].value_counts()

Group
book            388472
music           101612
video            25505
dvd              19215
                  5783
toy                  8
software             5
ce                   4
video games          1
baby product         1
sports               1
Name: count, dtype: int64

In [36]:
amazonBooks = {}
for asin,metadata in amazonProducts.items():
    if (metadata['Group']=='book'):
        amazonBooks[asin]=amazonProducts[asin]

amazonMusic = {}
for asin,metadata in amazonProducts.items():
    if (metadata['Group']=='music'):
        amazonMusic[asin]=amazonProducts[asin]

amazonVideo = {}
for asin,metadata in amazonProducts.items():
    if (metadata['Group']=='video'):
        amazonVideo[asin]=amazonProducts[asin]

for asin, metadata in amazonBooks.items():
    amazonBooks[asin].update({'Copurchased': ' '.join([cp for cp in metadata['SimilarItems'].split() if cp in amazonBooks.keys()])})
    # amazonBooks[asin]['Copurchased']= ' '.join([cp for cp in metadata['Copurchased'].split() if cp in amazonBooks.keys()])

for asin, metadata in amazonMusic.items():
    amazonMusic[asin].update({'Copurchased': ' '.join([cp for cp in metadata['SimilarItems'].split() if cp in amazonMusic.keys()])})
    # amazonMusic[asin]['Copurchased']= ' '.join([cp for cp in metadata['Copurchased'].split() if cp in amazonMusic.keys()])

for asin, metadata in amazonVideo.items():
    amazonVideo[asin].update({'Copurchased': ' '.join([cp for cp in metadata['SimilarItems'].split() if cp in amazonVideo.keys()])})
    # amazonVideo[asin]['Copurchased']= ' '.join([cp for cp in metadata['Copurchased'].split() if cp in amazonVideo.keys()])

In [37]:
#create a product copurchase graph for analysis
#the graph nodes are product ASINs, the graph edge exists if two products were copurchased, with edge weight being a measure of category similarity between ASINs
def create_copurchase_graph(targetCategory):
    copurchaseGraph = networkx.Graph()
    for asin, metadata in targetCategory.items():
        copurchaseGraph.add_node(asin)
        for a in metadata ['Copurchased'].split():
            copurchaseGraph.add_node(a.strip())
            similarity= 0
            n1= set((targetCategory[asin]['Tags']).split())
            n2= set((targetCategory[a]['Tags']).split())
            n1In2 = n1 & n2 #intersection: number of words that are common between categories of connected nodes
            n1Un2 = n1 | n2 #union: total number of words in both categories of connected nodes
            if (len(n1Un2)) > 0:
                similarity = round (len(n1In2)/len(n1Un2), 2)
            copurchaseGraph.add_edge(asin, a.strip(), weight = similarity)
            
    return copurchaseGraph

In [38]:
def add_degree_centrality_clustering_coeff(targetCategory, copurchaseGraph):
    dc = networkx.degree(copurchaseGraph)
    for asin in networkx.nodes(copurchaseGraph):
        metadata = targetCategory[asin]
        metadata.update({'DegreeCentrality': int(dc[asin])})
        ego = networkx.ego_graph(copurchaseGraph, asin, radius = 1)
        metadata.update({'ClusteringCoeff': round(networkx.average_clustering(ego), 2)})
        targetCategory[asin] = metadata

### Save Files

In [39]:
copurchaseGraphBooks = create_copurchase_graph(amazonBooks)
add_degree_centrality_clustering_coeff(amazonBooks, copurchaseGraphBooks)

In [40]:
# Write copurchaseGraph to file
save_file_path = os.path.join(save_data_path, 'amazon-books-copurchase-edgelist.csv')
edges = [(u, v, d['weight']) for u, v, d in copurchaseGraphBooks.edges(data=True)]
edges_df = pd.DataFrame(edges, columns=['source', 'target', 'weight'])
edges_df.to_csv(save_file_path, index=False)

In [41]:
copurchaseGraphMusic = create_copurchase_graph(amazonMusic)
add_degree_centrality_clustering_coeff(amazonMusic, copurchaseGraphMusic)

In [42]:
# Write copurchaseGraph to file
save_file_path = os.path.join(save_data_path, 'amazon-music-copurchase-edgelist.csv')
edges = [(u, v, d['weight']) for u, v, d in copurchaseGraphMusic.edges(data=True)]
edges_df = pd.DataFrame(edges, columns=['source', 'target', 'weight'])
edges_df.to_csv(save_file_path, index=False)

In [43]:
copurchaseGraphVideo = create_copurchase_graph(amazonVideo)
add_degree_centrality_clustering_coeff(amazonVideo, copurchaseGraphVideo)

In [44]:
# Write copurchaseGraph to file
save_file_path = os.path.join(save_data_path, 'amazon-video-copurchase-edgelist.csv')
edges = [(u, v, d['weight']) for u, v, d in copurchaseGraphVideo.edges(data=True)]
edges_df = pd.DataFrame(edges, columns=['source', 'target', 'weight'])
edges_df.to_csv(save_file_path, index=False)

In [45]:
amazonBooks_df = pd.DataFrame.from_dict(amazonBooks, orient='index')
amazonBooks_df.index.name = 'ASIN'
amazonBooks_df.reset_index(inplace=True)
amazonBooks_df.shape

(388472, 14)

In [46]:
save_file_path = os.path.join(save_data_path, 'amazon_books_with_sna.csv')
amazonBooks_df.to_csv(save_file_path, index=False)

In [47]:
amazonMusic_df = pd.DataFrame.from_dict(amazonMusic, orient='index')
amazonMusic_df.index.name = 'ASIN'
amazonMusic_df.reset_index(inplace=True)
amazonMusic_df.shape

(101612, 14)

In [48]:
save_file_path = os.path.join(save_data_path, 'amazon_music_with_sna.csv')
amazonMusic_df.to_csv(save_file_path, index=False)

In [49]:
amazonVideo_df = pd.DataFrame.from_dict(amazonVideo, orient='index')
amazonVideo_df.index.name = 'ASIN'
amazonVideo_df.reset_index(inplace=True)
amazonVideo_df.shape

(25505, 14)

In [50]:
save_file_path = os.path.join(save_data_path, 'amazon_video_with_sna.csv')
amazonVideo_df.to_csv(save_file_path, index=False)