In [1]:
#!pip install NetworkX
import networkx as nx
import numpy as np
import sklearn
import pandas as pd

In [2]:
# Part 1: Exploratory Social Network Analysis

In [3]:
#(a)
Edges = pd.read_csv("C:/Users/myw/Desktop/Fall 2022/SI 671/homework3/amazonNetwork.csv")
G = nx.from_pandas_edgelist(Edges, source='FromNodeId',target='ToNodeId', create_using=nx.DiGraph())

In [4]:
#(b)
print("Numer of items in the network:", G.number_of_nodes())
print("Numer of co-purchases in the network:", G.number_of_edges())

Numer of items in the network: 2647
Numer of co-purchases in the network: 10841


In [5]:
#(c) The result shows that the path from one node to another node is 9.59 steps in length in average.
nx.average_shortest_path_length(G,weight=None)

9.592795477759587

In [6]:
#(d)
print("The transitivity of G: ",nx.transitivity(G))
#For graph G, 43.39 percent of all possible directed edges between any two of its nodes exist.
print("The average clustering coefficient of G: ",nx.average_clustering(G))
#In  average, for a node, 40.86 percent of all possible directed edges between any two of its neighbors exist.

The transitivity of G:  0.4339169154480595
The average clustering coefficient of G:  0.4086089178720651


In [7]:
#(e) The ten nodes with highest pagerank scores are: number 8,481,33,18,23,30,346,99,93,21
pageranks = nx.pagerank(G,alpha=0.5)
pageranks = {k:v for k, v in sorted(pageranks.items(), key=lambda t: t[1], reverse=True)}
highest_pagerank_nodes = list(pageranks.keys())[0:10]
count = 1
for node in highest_pagerank_nodes:
    print("{}. {}".format(count, node))
    count += 1

1. 8
2. 481
3. 33
4. 18
5. 23
6. 30
7. 346
8. 99
9. 93
10. 21


In [8]:
#Part 2: Predicting Review-Rating using Features derived from network properties 

In [9]:
reviewTrain = pd.read_csv("C:/Users/myw/Desktop/Fall 2022/SI 671/homework3/reviewTrain.csv")

In [10]:
#Extracting local clustering coefficients from the network with networkX.clustering() method
#Local clustering coefficient quantifies how close its neighbours are to being a clique. If a node has a higher
#local clustering coefficient, this node and its neighbors may be prefered by a specific group of people because
#they are often co-purchased, so it may have a higher review score.
clustering_coefficients = pd.DataFrame.from_dict(nx.clustering(G),orient='index')
clustering_coefficients["id"] = list(clustering_coefficients.index)
clustering_coefficients.columns = ["Clustering Coefficients","id"]
reviewTrain = reviewTrain.merge(clustering_coefficients,how="left")

In [11]:
#Extracting page rank of each ID from the network with networkX.pagerank() method
#A higher page rank score means an item was purchased more times, that is to say, it is more popular, 
#and thus it is likely to receive a higher review score.
pagerank_items = pd.DataFrame(nx.pagerank(G,alpha=0.5).items())
pagerank_items.columns = ["id","PageRanks"]
reviewTrain = reviewTrain.merge(pagerank_items,how="left")

In [12]:
# Extracting degree centrality
#A higher degree centrality means an item is more often being co-purchased with other items, i.e. it's more
#popular, so it is likely to have a higher review score.
degree_centrality = pd.DataFrame.from_dict(nx.degree_centrality(G),orient='index')
degree_centrality["id"] = list(degree_centrality.index)
degree_centrality.columns = ["Degree Centrality","id"]
reviewTrain = reviewTrain.merge(degree_centrality,how="left")

In [13]:
# Extracting betweenness centrality
#A node with higher betweenness centrality can tell us more information about co-purchasing.
#Thus I think it can be an influential feature.
betweenness_centrality = pd.DataFrame.from_dict(nx.betweenness_centrality(G),orient='index')
betweenness_centrality["id"] = list(betweenness_centrality.index)
betweenness_centrality.columns = ["Betweenness_Centrality","id"]
reviewTrain = reviewTrain.merge(betweenness_centrality,how="left")

In [14]:
#Drop those ids which are not in the network, and also drop those with 0.0 review because 0.0 is identical to na.
reviewTrain = reviewTrain[reviewTrain.PageRanks.isna()==False]
reviewTrain = reviewTrain[reviewTrain.review!=0]

In [15]:
#I think different groups of items may have a different model on review scores,
#because different groups form different clusters in the network, and the four features above are more similar
#within cluster. Thus I transformed 'group' into dummy variables, and added them as feature.
dummy_df = pd.get_dummies(reviewTrain.group)
reviewTrain = reviewTrain.join(dummy_df)
reviewTrain

Unnamed: 0,id,title,group,review,Clustering Coefficients,PageRanks,Degree Centrality,Betweenness_Centrality,Book,DVD,Music,Toy,Video
0,3,World War II Allied Fighter Planes Trading Cards,Book,5.0,0.450000,0.000197,0.001890,0.000000e+00,1,0,0,0,0
2,7,Batik,Music,4.5,0.109562,0.001263,0.008692,1.876848e-02,0,0,1,0,0
3,10,The Edward Said Reader,Book,4.0,0.285714,0.000424,0.003779,3.049242e-03,1,0,0,0,0
4,11,Resetting the Clock : Five Anti-Aging Hormone...,Book,5.0,0.120344,0.000906,0.010204,8.756193e-03,1,0,0,0,0
5,12,Fantastic Food with Splenda : 160 Great Recip...,Book,4.5,0.424658,0.000506,0.004913,2.147989e-03,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1647,2632,The Essential Golden Dawn: An Introduction to...,Book,4.5,0.722222,0.000265,0.001890,0.000000e+00,1,0,0,0,0
1649,2638,Comprehensive Curriculum of Basic Skills: Gra...,Book,4.5,0.392857,0.000257,0.002268,1.488339e-03,1,0,0,0,0
1650,2641,Christian Ethics,Book,4.0,0.888889,0.000236,0.001890,0.000000e+00,1,0,0,0,0
1651,2642,"Social, Emotional, and Personality Developmen...",Book,5.0,0.333333,0.000236,0.001134,2.524420e-04,1,0,0,0,0


In [16]:
#Prepare feature data, response data, and 10 folds for cross-validation.
from sklearn.model_selection import KFold
x = reviewTrain[["Clustering Coefficients","PageRanks","Degree Centrality","Betweenness_Centrality",
                          " Book"," DVD"," Music"," Toy"," Video"]]
y = reviewTrain.review
kf = KFold(n_splits=10,random_state=671,shuffle=True)

In [17]:
#First I tried Linear Regression, and the average MAE with cross-validation is 0.563.
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression
Linreg = LinearRegression()
#Fit the model, and calculating average MAE with k folds.
MAE = []
for train_index, test_index in kf.split(x):
    Linreg.fit(x.iloc[train_index].values,y.iloc[train_index].values)
    MAE.append(mae(Linreg.predict(x.iloc[test_index].values),y.iloc[test_index].values))
np.average(MAE)

0.5630753476955748

In [18]:
#Then I tried SVR (also tried several sets of parameters), 
#and the smallest average MAE with cross-validation is 0.536, smaller than that of linear regression.
from sklearn.svm import SVR
SVRreg = SVR(kernel="sigmoid", C=1, gamma=0.19, epsilon=0.01,cache_size=1000)
MAE = []  
for train_index, test_index in kf.split(x):
    SVRreg.fit(x.iloc[train_index].values,y.iloc[train_index].values)
    MAE.append(mae(SVRreg.predict(x.iloc[test_index].values),y.iloc[test_index].values))
np.average(MAE)

0.5359499254527175

In [19]:
#Then I tried Random Forest Regressor, and max depth being 4 gives the smallest average MAE, which is 0.563
#almost the same as linear regression, but did not beat SVR.
from sklearn.ensemble import RandomForestRegressor
RFreg = RandomForestRegressor(max_depth=4, random_state=0)
MAE = []  
for train_index, test_index in kf.split(x):
    RFreg.fit(x.iloc[train_index].values,y.iloc[train_index].values)
    MAE.append(mae(RFreg.predict(x.iloc[test_index].values),y.iloc[test_index].values))
np.average(MAE)

0.5629074728851621

In [20]:
#The last regression model I tried was Multi-Layer Perceptron. The model converged after 112 iterations, and
#the average MAE is 0.566, the worst among four regression models.
from sklearn.neural_network import MLPRegressor
MLPreg = MLPRegressor(random_state=1, max_iter=112)
MAE = []
for train_index, test_index in kf.split(x):
    MLPreg.fit(x.iloc[train_index].values,y.iloc[train_index].values)
    MAE.append(mae(MLPreg.predict(x.iloc[test_index].values),y.iloc[test_index].values))
np.average(MAE)

0.5659029662409704

In [21]:
#Then I also tried several classification models, attempting to treat this problem as a classfication problem.
#Double the response data so that it can be discrete.
y_int = pd.Series((2*y).astype(int))

In [22]:
#First I tried Logistic Regression. The average MAE with cross-validation is 0.656,
#which is much higher than all regression models above.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
MAE = []
for train_index, test_index in kf.split(x):
    lr.fit(x.iloc[train_index].values,y_int.iloc[train_index].values)
    MAE.append(mae(lr.predict(x.iloc[test_index].values)/2,y.iloc[test_index].values))
np.average(MAE)

0.6563354838709679

In [23]:
#Then I tried SVC, and the average MAE is 0.666, even worse.
from sklearn.svm import SVC
SVCClf = SVC(kernel = 'rbf',gamma = 'scale')
MAE = []
for train_index, test_index in kf.split(x):
    SVCClf.fit(x.iloc[train_index].values,y_int.iloc[train_index].values)
    MAE.append(mae(SVCClf.predict(x.iloc[test_index].values)/2,y.iloc[test_index].values))
np.average(MAE)

0.6655516129032257

In [24]:
#Then I tried MLP regressor, and it still gived out average MAE much bigger than regression models.
from sklearn.neural_network import MLPClassifier
MLPClf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
MAE = []
for train_index, test_index in kf.split(x):
    MLPClf.fit(x.iloc[train_index].values,y_int.iloc[train_index].values)
    MAE.append(mae(MLPClf.predict(x.iloc[test_index].values)/2,y.iloc[test_index].values))
np.average(MAE)

0.6820645161290323

In [25]:
# The last classification model I tried is Random Forest Classifier, and the average MAE is 0.661,
#beaten by any regression model.
from sklearn.ensemble import RandomForestClassifier
RFClf = RandomForestClassifier(max_depth=10, random_state=0)
MAE = []
for train_index, test_index in kf.split(x):
    RFClf.fit(x.iloc[train_index].values,y_int.iloc[train_index].values)
    MAE.append(mae(RFClf.predict(x.iloc[test_index].values)/2,y.iloc[test_index].values))
np.average(MAE)

0.6610677419354839

In [26]:
#Thus, if our criterion of model performance is based on mean absolute error,
#it's better to treat this problem as a regression problem.
#Since SVR showed the best performance, I will pick it to predict on reviewTest dataset.

In [34]:
#Extracting selected features above on reviewTest dataset
reviewTest = pd.read_csv("C:/Users/myw/Desktop/Fall 2022/SI 671/homework3/reviewTest.csv")
reviewTest = reviewTest.merge(clustering_coefficients,how="left")
reviewTest = reviewTest.merge(pagerank_items,how="left")
reviewTest = reviewTest.merge(degree_centrality,how="left")
reviewTest = reviewTest.merge(betweenness_centrality,how="left")
dummy_df = pd.get_dummies(reviewTest.group)
reviewTest = reviewTest.join(dummy_df)
#Add a column to keep a constant dimension
reviewTest[" Toy"] = pd.Series(np.zeros(len(reviewTest)).astype(int))

In [35]:
#Preparing predicting data
X = reviewTest[["Clustering Coefficients","PageRanks","Degree Centrality","Betweenness_Centrality",
                          " Book"," DVD"," Music"," Toy"," Video"]]
#Fill na with mean values
X = X.fillna(X.mean())
reviewTest.review = SVRreg.predict(X)
reviewTest.review



0      4.509256
1      4.509638
2      4.470729
3      4.684855
4      4.494984
         ...   
995    4.550003
996    4.509486
997    4.496170
998    4.506116
999    4.509141
Name: review, Length: 1000, dtype: float64