## GMO network matrix building code for the Chinese paper


- For Anfan & Lyndon's paper


- By Lyndon, on Feb. 27, 2020

In [1]:
import warnings
import collections
import pandas as pd
import networkx as nx
from ast import literal_eval
from itertools import combinations

In [2]:
warnings.filterwarnings('ignore')
pd.set_option('max_rows', 500)
pd.set_option('max_columns', 500)

---
- I changed the original dataset a little bit.
---

In [3]:
data_file = './data_CN/CNpaper.xlsx'
data = pd.read_excel(data_file, header=0)
# data.head()

In [4]:
data.columns

Index(['NO.', 'Unnamed: 1', '_id_x', 'nick_name', 'gender', 'province', 'age2',
       'birthday', 'tweet-level', 'tweets_num', 'follows-level', 'follows_num',
       'fans-level', 'fans_num', 'attitude', 'content', 'Issue1', 'Issue2',
       'Issue3', 'Issue4', 'Issue5', 'Issue6', 'Issue7', 'Issue8', 'Issue9',
       'Issue10', 'Issue11', 'Issue12', 'Issue13', 'Issue14', 'Issue15',
       'Issue16', 'vip_level', 'authentication', 'person_url', '_id_y',
       'user_id', 'created_at'],
      dtype='object')

In [5]:
data.dtypes

NO.                 int64
Unnamed: 1          int64
_id_x               int64
nick_name          object
gender              int64
province            int64
age2                int64
birthday           object
tweet-level       float64
tweets_num          int64
follows-level     float64
follows_num         int64
fans-level        float64
fans_num            int64
attitude            int64
content            object
Issue1             object
Issue2             object
Issue3             object
Issue4             object
Issue5             object
Issue6              int64
Issue7             object
Issue8             object
Issue9             object
Issue10            object
Issue11             int64
Issue12            object
Issue13            object
Issue14             int64
Issue15            object
Issue16            object
vip_level          object
authentication     object
person_url         object
_id_y              object
user_id             int64
created_at         object
dtype: objec

### 1. Extract the necessary columns

In [6]:
issue_idx = list()
for idx in range(1, 17):
    issue_idx.append('Issue' + str(idx))
issue_idx.insert(0, 'attitude')
data_1 = data[issue_idx]
data_1.head()

Unnamed: 0,attitude,Issue1,Issue2,Issue3,Issue4,Issue5,Issue6,Issue7,Issue8,Issue9,Issue10,Issue11,Issue12,Issue13,Issue14,Issue15,Issue16
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


### 2. Drop the post(s) with only one issue

In [7]:
data_2 = pd.DataFrame()
for column in data_1.columns:
    data_2[column] = pd.to_numeric(data_1[column], errors='coerce', downcast='signed').fillna(0)
data_2.dtypes

attitude       int8
Issue1      float64
Issue2      float64
Issue3      float64
Issue4      float64
Issue5      float64
Issue6         int8
Issue7      float64
Issue8      float64
Issue9      float64
Issue10     float64
Issue11        int8
Issue12     float64
Issue13     float64
Issue14        int8
Issue15     float64
Issue16     float64
dtype: object

In [8]:
data_2

Unnamed: 0,attitude,Issue1,Issue2,Issue3,Issue4,Issue5,Issue6,Issue7,Issue8,Issue9,Issue10,Issue11,Issue12,Issue13,Issue14,Issue15,Issue16
0,1,1.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
1,2,0.0,0.0,1.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2,1,1.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
3,1,0.0,1.0,0.0,0.0,1.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
4,2,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7503,1,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,1.0,0,0.0,0.0
7504,2,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,1,0.0,0.0
7505,2,0.0,0.0,0.0,0.0,1.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
7506,1,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,1.0,0.0


In [9]:
data_2.describe()

Unnamed: 0,attitude,Issue1,Issue2,Issue3,Issue4,Issue5,Issue6,Issue7,Issue8,Issue9,Issue10,Issue11,Issue12,Issue13,Issue14,Issue15,Issue16
count,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0
mean,1.404635,0.149574,0.351891,0.073788,0.070725,0.089371,0.060069,0.041955,0.032898,0.046883,0.04089,0.061135,0.070059,0.039558,0.0939,0.170218,0.01878
std,0.617743,0.356677,0.477592,0.261443,0.256381,0.285298,0.237631,0.2005,0.178382,0.211403,0.198048,0.239593,0.255263,0.194931,0.291709,0.380779,0.135756
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0


---
- From the above descriptive result, we can see there is something wrong with the `Issue15` column
---

In [10]:
# fix the `Issue15` column
data_2.replace({'Issue15': {2: 1}}, inplace=True)
data_2.describe()

Unnamed: 0,attitude,Issue1,Issue2,Issue3,Issue4,Issue5,Issue6,Issue7,Issue8,Issue9,Issue10,Issue11,Issue12,Issue13,Issue14,Issue15,Issue16
count,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0,7508.0
mean,1.404635,0.149574,0.351891,0.073788,0.070725,0.089371,0.060069,0.041955,0.032898,0.046883,0.04089,0.061135,0.070059,0.039558,0.0939,0.168354,0.01878
std,0.617743,0.356677,0.477592,0.261443,0.256381,0.285298,0.237631,0.2005,0.178382,0.211403,0.198048,0.239593,0.255263,0.194931,0.291709,0.374205,0.135756
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
data_2['Issue_sum'] = data_2.apply(lambda x: 
                                   x['Issue1'] + x['Issue2'] + x['Issue3'] + x['Issue4'] + 
                                   x['Issue5'] + x['Issue6'] + x['Issue7'] + x['Issue8'] + 
                                   x['Issue9'] + x['Issue10'] + x['Issue11'] + x['Issue12'] + 
                                   x['Issue13'] + x['Issue14'] + x['Issue15'] + x['Issue16'], 
                                   axis=1)
data_2.head()

Unnamed: 0,attitude,Issue1,Issue2,Issue3,Issue4,Issue5,Issue6,Issue7,Issue8,Issue9,Issue10,Issue11,Issue12,Issue13,Issue14,Issue15,Issue16,Issue_sum
0,1,1.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,2.0
1,2,0.0,0.0,1.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,1.0
2,1,1.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,2.0
3,1,0.0,1.0,0.0,0.0,1.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,3.0
4,2,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,1.0


---
- Do the filter
---

In [12]:
data_3 = data_2[data_2['Issue_sum'] > 1.]
data_3.shape

(3066, 18)

### 3. Calculate the combination

In [13]:
data_3 = data_3.reset_index(drop=True)
data_3

Unnamed: 0,attitude,Issue1,Issue2,Issue3,Issue4,Issue5,Issue6,Issue7,Issue8,Issue9,Issue10,Issue11,Issue12,Issue13,Issue14,Issue15,Issue16,Issue_sum
0,1,1.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,2.0
1,1,1.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,2.0
2,1,0.0,1.0,0.0,0.0,1.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,3.0
3,1,0.0,1.0,0.0,0.0,0.0,0,0.0,0.0,1.0,1.0,1,0.0,0.0,0,0.0,0.0,4.0
4,1,1.0,1.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,1,1.0,0.0,0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3061,2,1.0,0.0,0.0,0.0,1.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,2.0
3062,2,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,1.0,0,0.0,0.0,2.0
3063,0,0.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,1,0.0,0.0,2.0
3064,2,1.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,2.0


In [14]:
all_edges = list()
for idx1 in range(0, data_3.shape[0]):
    edges = list()
    for idx2 in range(1, 17):
        if int(data_3.loc[idx1, 'Issue' + str(idx2)]) != 0:
            edges.append('Issue' + str(idx2))
    all_edges.append(edges)
    data_3.loc[idx1, 'edges'] = str(edges)
    data_3.loc[idx1, 'edgesNum'] = len(edges)

data_3 = data_3[['attitude', 'Issue_sum', 'edges', 'edgesNum']]
data_3.head()

Unnamed: 0,attitude,Issue_sum,edges,edgesNum
0,1,2.0,"['Issue1', 'Issue2']",2.0
1,1,2.0,"['Issue1', 'Issue4']",2.0
2,1,3.0,"['Issue2', 'Issue5', 'Issue6']",3.0
3,1,4.0,"['Issue2', 'Issue9', 'Issue10', 'Issue11']",4.0
4,1,5.0,"['Issue1', 'Issue2', 'Issue4', 'Issue11', 'Iss...",5.0


---
- Check the accuracy
---

In [15]:
data_3['check'] = data_3['Issue_sum'] == data_3['edgesNum']
data_3['check'].value_counts()

True    3066
Name: check, dtype: int64

In [16]:
for idx in range(0, data_3.shape[0]):
    temp = list()
    for item in combinations(literal_eval(data_3.loc[idx, 'edges']), 2):
        temp.append(item)
    data_3.loc[idx, 'combination'] = str(temp)
data_3.head()

Unnamed: 0,attitude,Issue_sum,edges,edgesNum,check,combination
0,1,2.0,"['Issue1', 'Issue2']",2.0,True,"[('Issue1', 'Issue2')]"
1,1,2.0,"['Issue1', 'Issue4']",2.0,True,"[('Issue1', 'Issue4')]"
2,1,3.0,"['Issue2', 'Issue5', 'Issue6']",3.0,True,"[('Issue2', 'Issue5'), ('Issue2', 'Issue6'), (..."
3,1,4.0,"['Issue2', 'Issue9', 'Issue10', 'Issue11']",4.0,True,"[('Issue2', 'Issue9'), ('Issue2', 'Issue10'), ..."
4,1,5.0,"['Issue1', 'Issue2', 'Issue4', 'Issue11', 'Iss...",5.0,True,"[('Issue1', 'Issue2'), ('Issue1', 'Issue4'), (..."


### 4. Build matrices based on the `attitude` column

In [18]:
edges_dict = collections.defaultdict(int)
number = 0
for idx in range(0, data_3.shape[0]):
    # set the attitude category here  👇
    if data_3.loc[idx, 'attitude'] == 0:
        number += 1
        for item in literal_eval(data_3.loc[idx, 'combination']):
            edges_dict[item] += 1
print('The number of rows with the corresponding attitude is: %s'%(number))

The number of rows with the corresponding attitude is: 305


In [19]:
G = nx.MultiGraph()
for item in sorted(dict(edges_dict).items(), key=lambda x: x[1], reverse=True):
    G.add_edge(item[0][0], item[0][1], weight=item[1])
print(nx.to_numpy_matrix(G, nodelist = ['Issue' + str(idx) for idx in range(1, 17)]))

[[ 0. 40.  1.  3. 18. 10.  5.  5.  5.  5.  9.  5. 11. 10. 39.  4.]
 [40.  0.  3.  3. 12. 13.  4. 11. 17.  8. 15.  7. 11. 11. 40.  2.]
 [ 1.  3.  0.  0.  1.  0.  3.  3.  2.  0.  0.  1.  0.  2.  2.  0.]
 [ 3.  3.  0.  0. 14.  9. 17.  6.  3.  2.  2.  4.  3. 14.  9.  1.]
 [18. 12.  1. 14.  0.  8. 15.  9.  6.  6.  5.  6.  7. 21. 24.  3.]
 [10. 13.  0.  9.  8.  0. 15. 10.  3.  1.  1.  5.  8. 10. 12.  5.]
 [ 5.  4.  3. 17. 15. 15.  0.  9.  2.  2.  1.  2.  4. 11.  8.  1.]
 [ 5. 11.  3.  6.  9. 10.  9.  0.  0.  1.  1.  4.  5.  4.  9.  0.]
 [ 5. 17.  2.  3.  6.  3.  2.  0.  0.  3.  4.  3.  7.  3. 13.  2.]
 [ 5.  8.  0.  2.  6.  1.  2.  1.  3.  0.  2.  0.  3.  6.  5.  1.]
 [ 9. 15.  0.  2.  5.  1.  1.  1.  4.  2.  0.  6.  1.  8.  7.  2.]
 [ 5.  7.  1.  4.  6.  5.  2.  4.  3.  0.  6.  0.  2.  4. 17.  2.]
 [11. 11.  0.  3.  7.  8.  4.  5.  7.  3.  1.  2.  0.  3. 13.  1.]
 [10. 11.  2. 14. 21. 10. 11.  4.  3.  6.  8.  4.  3.  0.  9.  2.]
 [39. 40.  2.  9. 24. 12.  8.  9. 13.  5.  7. 17. 13.  9.  0. 

### 5. Export the matrix

In [20]:
matrix_df = pd.DataFrame(nx.to_numpy_matrix(G, nodelist = ['Issue' + str(idx) for idx in range(1, 17)]), 
                         columns = ['Issue' + str(idx) for idx in range(1, 17)], 
                         index = ['Issue' + str(idx) for idx in range(1, 17)])
matrix_df

Unnamed: 0,Issue1,Issue2,Issue3,Issue4,Issue5,Issue6,Issue7,Issue8,Issue9,Issue10,Issue11,Issue12,Issue13,Issue14,Issue15,Issue16
Issue1,0.0,40.0,1.0,3.0,18.0,10.0,5.0,5.0,5.0,5.0,9.0,5.0,11.0,10.0,39.0,4.0
Issue2,40.0,0.0,3.0,3.0,12.0,13.0,4.0,11.0,17.0,8.0,15.0,7.0,11.0,11.0,40.0,2.0
Issue3,1.0,3.0,0.0,0.0,1.0,0.0,3.0,3.0,2.0,0.0,0.0,1.0,0.0,2.0,2.0,0.0
Issue4,3.0,3.0,0.0,0.0,14.0,9.0,17.0,6.0,3.0,2.0,2.0,4.0,3.0,14.0,9.0,1.0
Issue5,18.0,12.0,1.0,14.0,0.0,8.0,15.0,9.0,6.0,6.0,5.0,6.0,7.0,21.0,24.0,3.0
Issue6,10.0,13.0,0.0,9.0,8.0,0.0,15.0,10.0,3.0,1.0,1.0,5.0,8.0,10.0,12.0,5.0
Issue7,5.0,4.0,3.0,17.0,15.0,15.0,0.0,9.0,2.0,2.0,1.0,2.0,4.0,11.0,8.0,1.0
Issue8,5.0,11.0,3.0,6.0,9.0,10.0,9.0,0.0,0.0,1.0,1.0,4.0,5.0,4.0,9.0,0.0
Issue9,5.0,17.0,2.0,3.0,6.0,3.0,2.0,0.0,0.0,3.0,4.0,3.0,7.0,3.0,13.0,2.0
Issue10,5.0,8.0,0.0,2.0,6.0,1.0,2.0,1.0,3.0,0.0,2.0,0.0,3.0,6.0,5.0,1.0


In [21]:
# change the name each time
matrix_df.to_csv('./data_CN/attitude_0_matrix.csv', header=True, index=True, sep=',')