# Reading from amazon-raw file and pre-processing

In [11]:
import string
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from stemming.porter2 import stem
import networkx


from itertools import chain

# Parse the raw Amazon metadata dump into `amazonProducts`, a dict keyed by
# ASIN. Each value is a dict with the keys Id, Title, Categories, Group,
# purchased_together, SalesRank, TotalReviews, AvgRating, DegreeCentrality
# and ClusteringCoeff. Records in the input are separated by blank lines.

amazonProducts = {}

# Built once up front; neither depends on the line being parsed.
categoryNoise = re.compile('[%s]' % re.escape(string.digits + string.punctuation))
englishStopwords = set(stopwords.words("english"))

(Id, ASIN, Title, Categories, Group, purchased_together, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff) = \
    ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0)

# `with` guarantees the file is closed even if parsing raises part-way.
with open('./amazon-raw.txt', 'r', encoding='utf-8', errors='ignore') as file:
    # The trailing "" acts as a final blank line so that the last record is
    # stored even when the file does not end with an empty line.
    for line in chain(file, [""]):
        line = line.strip()
        if line.startswith("Id"):
            Id = line[3:].strip()
        elif line.startswith("ASIN"):
            ASIN = line[5:].strip()
        elif line.startswith("title"):
            # Collapse runs of whitespace so the title is safe for TSV output.
            Title = ' '.join(line[6:].split())
        elif line.startswith("group"):
            Group = line[6:].strip()
        elif line.startswith("salesrank"):
            SalesRank = line[10:].strip()
        elif line.startswith("similar"):
            # Tokens after the label and the count are the related ASINs.
            ls = line.split()
            purchased_together = ' '.join(ls[2:])
        elif line.startswith("categories"):
            # The second token is the number of category lines that follow;
            # they are consumed here so the outer loop never sees them.
            ls = line.split()
            Categories = ' '.join(file.readline().lower() for _ in range(int(ls[1].strip())))
            # Drop digits/punctuation, then stopwords, then stem what is left.
            Categories = categoryNoise.sub(' ', Categories)
            Categories = ' '.join(set(Categories.split()) - englishStopwords)
            Categories = ' '.join(stem(word) for word in Categories.split())
        elif line.startswith("reviews"):
            # Token 2 is read as the review total and token 7 as the
            # average rating of the summary line.
            ls = line.split()
            TotalReviews = ls[2].strip()
            AvgRating = ls[7].strip()
        elif line == "":
            # A blank line ends the current record.
            if ASIN != "":
                try:
                    RawData = {
                        'Id': Id,
                        'Title': Title,
                        # Different words can share a stem, so de-duplicate
                        # again; sorting keeps the output stable across runs.
                        'Categories': ' '.join(sorted(set(Categories.split()))),
                        'Group': Group,
                        'purchased_together': purchased_together,
                        'SalesRank': int(SalesRank),
                        'TotalReviews': int(TotalReviews),
                        'AvgRating': float(AvgRating),
                        'DegreeCentrality': DegreeCentrality,
                        'ClusteringCoeff': ClusteringCoeff,
                    }
                except ValueError:
                    # Non-numeric sales rank / review fields: skip this record
                    # rather than storing a half-built entry.
                    print("Skipping malformed record for ASIN %s" % ASIN)
                else:
                    amazonProducts[ASIN] = RawData
            # Always reset, so no field leaks into the next record.
            (Id, ASIN, Title, Categories, Group, purchased_together, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff) = \
                ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0)


[nltk_data] Downloading package stopwords to /Users/Mahesh
[nltk_data]     Reddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Keeping only the products whose Group is "Book"

In [12]:

# Subset of amazonProducts whose Group is exactly 'Book'. The values are the
# same dict objects as in amazonProducts, not copies.
amazonBooks = {
    asin: product
    for asin, product in amazonProducts.items()
    if product['Group'] == 'Book'
}

# Restrict each book's co-purchase list to ASINs that are themselves books,
# so every remaining reference can be looked up in amazonBooks.
for book in amazonBooks.values():
    keptAsins = [cp for cp in book['purchased_together'].split() if cp in amazonBooks]
    book['purchased_together'] = ' '.join(keptAsins)


# Building the purchased-together graph, with edges weighted by category similarity

In [13]:
# Undirected co-purchase graph: one node per book, one edge per
# purchased-together pair. The edge weight is the Jaccard similarity
# (shared words / all words) of the two books' category words, rounded to
# two decimals, or 0 when neither book has any category words.
purchased_TogetherGraph = networkx.Graph()
for asin, book in amazonBooks.items():
    # Added explicitly so books without co-purchases still appear as nodes.
    purchased_TogetherGraph.add_node(asin)
    ownWords = set(book['Categories'].split())
    for partner in book['purchased_together'].split():
        partner = partner.strip()
        purchased_TogetherGraph.add_node(partner)
        partnerWords = set(amazonBooks[partner]['Categories'].split())
        allWords = ownWords.union(partnerWords)
        if len(allWords) > 0:
            sharedWords = ownWords.intersection(partnerWords)
            similarity = round(len(sharedWords) / len(allWords), 2)
        else:
            similarity = 0
        purchased_TogetherGraph.add_edge(asin, partner, weight=similarity)

In [18]:
# Undirected co-purchase graph whose edge weights are the Jaccard similarity
# (shared words / all words) of the two books' whitespace-split titles,
# rounded to two decimals, or 0 when both titles are empty.
purchased_TitleGraph = networkx.Graph()
for asin, RawData in amazonBooks.items():
    # Nodes go into purchased_TitleGraph itself, so that books with no
    # co-purchases are present as isolated nodes.
    purchased_TitleGraph.add_node(asin)
    x = set(RawData['Title'].split())
    for a in RawData['purchased_together'].split():
        a = a.strip()
        purchased_TitleGraph.add_node(a)
        similarity = 0
        y = set(amazonBooks[a]['Title'].split())
        xIy = x & y
        xUy = x | y
        if len(xUy) > 0:
            similarity = round(len(xIy) / len(xUy), 2)
        purchased_TitleGraph.add_edge(asin, a, weight=similarity)

In [14]:

# Annotate every book in the co-purchase graph with two structural measures:
#   DegreeCentrality - the node's raw degree (number of incident edges)
#   ClusteringCoeff  - average clustering of the node's radius-1 ego graph,
#                      rounded to two decimals
degree = networkx.degree(purchased_TogetherGraph)
for asin in networkx.nodes(purchased_TogetherGraph):
    book = amazonBooks[asin]
    book['DegreeCentrality'] = int(degree[asin])
    # The ego graph is the node plus its direct neighbours.
    neighbourhood = networkx.ego_graph(purchased_TogetherGraph, asin, radius=1)
    clustering = networkx.average_clustering(neighbourhood)
    book['ClusteringCoeff'] = round(clustering, 2)

In [19]:
# Compute degree and local clustering for every node of the title graph and
# store them on the matching amazonBooks entry. The keys written here
# ('DegreeCentrality', 'ClusteringCoeff') are overwritten if already set.
degree = networkx.degree(purchased_TitleGraph)
for asin in networkx.nodes(purchased_TitleGraph):
    # Radius 1 limits the subgraph to the node and its direct neighbours.
    egoNet = networkx.ego_graph(purchased_TitleGraph, asin, radius=1)
    entry = amazonBooks[asin]
    entry['DegreeCentrality'] = int(degree[asin])
    entry['ClusteringCoeff'] = round(networkx.average_clustering(egoNet), 2)

In [16]:

# Write the book table as tab-separated text (one header row, then one row
# per book), followed by the category-weighted co-purchase graph as a
# weighted edge list. The 'purchased_together' field is not exported.
columns = ["Id", "ASIN", "Title", "Categories", "Group",
           "SalesRank", "TotalReviews", "AvgRating",
           "DegreeCentrality", "ClusteringCoeff"]

# `with` closes each file even if a write fails part-way through.
with open('./amazon-books.txt', 'w', encoding='utf-8', errors='ignore') as file_2:
    file_2.write("\t".join(columns) + "\n")
    for asin, RawData in amazonBooks.items():
        fields = (
            RawData['Id'],
            asin,
            RawData['Title'],
            RawData['Categories'],
            RawData['Group'],
            str(RawData['SalesRank']),
            str(RawData['TotalReviews']),
            str(RawData['AvgRating']),
            str(RawData['DegreeCentrality']),
            str(RawData['ClusteringCoeff']),
        )
        file_2.write("\t".join(fields) + "\n")

# The file handle is opened in binary mode, as in the original call.
with open("purchased_Together.edgelist", 'wb') as file_2:
    networkx.write_weighted_edgelist(purchased_TogetherGraph, file_2)

In [20]:
# Write the book table as tab-separated text (one header row, then one row
# per book), followed by the title-weighted co-purchase graph as a weighted
# edge list. The 'purchased_together' field is not exported.
columns = ["Id", "ASIN", "Title", "Categories", "Group",
           "SalesRank", "TotalReviews", "AvgRating",
           "DegreeCentrality", "ClusteringCoeff"]

# `with` closes each file even if a write fails part-way through.
with open('./amazon-books.txt', 'w', encoding='utf-8', errors='ignore') as file_2:
    file_2.write("\t".join(columns) + "\n")
    for asin, RawData in amazonBooks.items():
        fields = (
            RawData['Id'],
            asin,
            RawData['Title'],
            RawData['Categories'],
            RawData['Group'],
            str(RawData['SalesRank']),
            str(RawData['TotalReviews']),
            str(RawData['AvgRating']),
            str(RawData['DegreeCentrality']),
            str(RawData['ClusteringCoeff']),
        )
        file_2.write("\t".join(fields) + "\n")

# The file handle is opened in binary mode, as in the original call.
with open("purchased_Together_Title.edgelist", 'wb') as file_2:
    networkx.write_weighted_edgelist(purchased_TitleGraph, file_2)