In [1]:
import pandas as pd
import networkx as nx

### Users

In [2]:
users = pd.read_csv("Users.csv", engine='python', encoding='utf-8', on_bad_lines='skip', index_col=False)

users.head()

Unnamed: 0,Id,UserName,DisplayName,RegisterDate,PerformanceTier
0,1,kaggleteam,Kaggle Team,03/24/2011,5
1,368,antgoldbloom,Anthony Goldbloom,01/20/2010,2
2,381,iguyon,Isabelle,01/29/2010,2
3,383,davidstephan,David Stephan,02/01/2010,0
4,384,gabewarren,Gabe Warren,02/02/2010,0


### User Followers

In [3]:
userFollowers = pd.read_csv("UserFollowers.csv", engine='python', encoding='utf-8', on_bad_lines='skip', index_col=False)

userFollowers.head()

Unnamed: 0,Id,UserId,FollowingUserId,CreationDate
0,64,368,993,05/23/2017
1,122289,368,67483,08/07/2018
2,323230,368,1663522,09/18/2019
3,412502,368,1302389,03/08/2020
4,447841,368,391404,04/27/2020


### User organizations

In [4]:
userOrganizations = pd.read_csv("UserOrganizations.csv", engine='python', index_col=False)

userOrganizations.head()

Unnamed: 0,Id,UserId,OrganizationId,JoinDate
0,1,13209,2,09/13/2016
1,3,993,4,09/23/2016
2,4,699407,4,09/23/2016
3,5,368,4,09/23/2016
4,6,2505,4,09/23/2016


In [5]:
userOrganizations = pd.read_csv("UserOrganizations.csv", engine='python', index_col=False)
userOrganizations.groupby(["UserId", "OrganizationId"])['JoinDate'].transform(max)

0       09/13/2016
1       09/23/2016
2       09/23/2016
3       09/23/2016
4       09/23/2016
           ...    
1671    02/19/2021
1672    07/13/2021
1673    11/23/2021
1674    01/15/2022
1675    11/08/2022
Name: JoinDate, Length: 1676, dtype: object

In [6]:
userOrganizations = userOrganizations[["UserId", "OrganizationId", "JoinDate"]]

idx = userOrganizations.groupby(["UserId"])['JoinDate'].transform(max) == userOrganizations['JoinDate']
userOrganizations = userOrganizations[idx]
userOrganizations.sort_values(["UserId"]).head(50)

Unnamed: 0,UserId,OrganizationId,JoinDate
153,1,322,12/14/2016
16,368,8,09/27/2016
1303,421,642,03/20/2019
1,993,4,09/23/2016
4,2505,4,09/23/2016
551,2987,1144,11/09/2017
7,3258,4,09/23/2016
36,3876,2,10/17/2016
574,4324,1160,11/15/2017
575,4788,1160,11/15/2017


### Organizations

In [7]:
organizations = pd.read_csv("Organizations.csv", engine='python', index_col=False)

organizations.head()

Unnamed: 0,Id,Name,Slug,CreationDate,Description
0,2,Facebook,facebook,07/25/2016,Facebook was built to help people connect and ...
1,3,Figure Eight,crowdflower,08/20/2016,[Figure Eight](https://www.figure-eight.com/) ...
2,4,Kaggle,kaggle,09/23/2016,Kaggle is a community of data scientists and d...
3,5,Last-Place Ltd.,lastplaceltd,09/27/2016,
4,6,CWILOC,cwiloc,09/27/2016,[Climatological Database for the World's Ocean...


## Network

In [8]:
network = userFollowers[["UserId", "FollowingUserId"]]

network = pd.merge(network, users[["Id", "UserName"]], left_on="FollowingUserId", right_on="Id")
network = network.drop(["Id"], axis=1)
network = network.rename(columns={"UserName": "FollowingUserName"})

network = pd.merge(network, users, left_on="UserId", right_on="Id")
network = network[["Id", "UserName",	"FollowingUserId", "FollowingUserName"]]

network

Unnamed: 0,Id,UserName,FollowingUserId,FollowingUserName
0,368,antgoldbloom,993,benhamner
1,368,antgoldbloom,67483,kmader
2,368,antgoldbloom,391404,parulpandey
3,1950,sskiing,993,benhamner
4,1950,sskiing,73703,psilogram
...,...,...,...,...
19440,429807,brunochaves,291946,adriel
19441,430035,rijubhattacharyya,360797,deepuishere
19442,430155,andreiabonfante,209046,allango
19443,431089,lerosen,408505,rferreras


In [9]:
network.to_csv('Network.csv')

### User Information

In [10]:
userInfo = pd.merge(userOrganizations, organizations[["Id", "Name"]], left_on="OrganizationId", right_on="Id")
userInfo = userInfo.drop(["Id"], axis=1)

userInfo = pd.merge(users, userInfo, left_on="Id", right_on="UserId")
userInfo = userInfo.drop(["UserId"], axis=1)

userInfo.head()

Unnamed: 0,Id,UserName,DisplayName,RegisterDate,PerformanceTier,OrganizationId,JoinDate,Name
0,1,kaggleteam,Kaggle Team,03/24/2011,5,322,12/14/2016,Defence Science & Technology Laboratory
1,368,antgoldbloom,Anthony Goldbloom,01/20/2010,2,8,09/27/2016,Hacker News
2,421,dremovd,Dmitry Dryomov,02/28/2010,3,642,03/20/2019,OpenDataScience [ods.ai]
3,993,benhamner,Ben Hamner,05/31/2010,3,4,09/23/2016,Kaggle
4,2505,jeffmoser,Jeff Moser,08/21/2010,5,4,09/23/2016,Kaggle


### Network info

In [11]:
networkInfo = network[["Id",	"FollowingUserId"]]
networkInfo = networkInfo.rename(columns={"Id": "source", "FollowingUserId": "target"})

graph = nx.from_pandas_edgelist(networkInfo)

#### Degree

In [12]:
userDegrees = pd.DataFrame.from_dict(dict(graph.degree), orient='index', columns=["Degree"]).reset_index()
userDegrees = userDegrees.rename(columns={"index": "UserId"})

userDegrees

Unnamed: 0,UserId,Degree
0,368,75
1,993,149
2,67483,86
3,391404,123
4,1950,29
...,...,...
6941,425570,1
6942,317511,1
6943,395871,1
6944,429569,1


In [13]:
userInfo = pd.merge(userInfo, userDegrees, left_on="Id", right_on="UserId")
userInfo = userInfo.drop(["UserId"], axis=1)

userInfo.head()

Unnamed: 0,Id,UserName,DisplayName,RegisterDate,PerformanceTier,OrganizationId,JoinDate,Name,Degree
0,1,kaggleteam,Kaggle Team,03/24/2011,5,322,12/14/2016,Defence Science & Technology Laboratory,12
1,368,antgoldbloom,Anthony Goldbloom,01/20/2010,2,8,09/27/2016,Hacker News,75
2,421,dremovd,Dmitry Dryomov,02/28/2010,3,642,03/20/2019,OpenDataScience [ods.ai],8
3,993,benhamner,Ben Hamner,05/31/2010,3,4,09/23/2016,Kaggle,149
4,2505,jeffmoser,Jeff Moser,08/21/2010,5,4,09/23/2016,Kaggle,13


#### Closeness Centrality

In [14]:
userClosenessCen = pd.DataFrame.from_dict(dict(nx.closeness_centrality(graph)), orient='index', columns=["ClosCen"]).reset_index()
userClosenessCen = userClosenessCen.rename(columns={"index": "UserId"})

userClosenessCen

Unnamed: 0,UserId,ClosCen
0,368,0.330305
1,993,0.343211
2,67483,0.321516
3,391404,0.328887
4,1950,0.328469
...,...,...
6941,425570,0.000144
6942,317511,0.000144
6943,395871,0.000192
6944,429569,0.000144


In [15]:
userInfo = pd.merge(userInfo, userClosenessCen, left_on="Id", right_on="UserId")
userInfo = userInfo.drop(["UserId"], axis=1)

userInfo.head()

Unnamed: 0,Id,UserName,DisplayName,RegisterDate,PerformanceTier,OrganizationId,JoinDate,Name,Degree,ClosCen
0,1,kaggleteam,Kaggle Team,03/24/2011,5,322,12/14/2016,Defence Science & Technology Laboratory,12,0.27376
1,368,antgoldbloom,Anthony Goldbloom,01/20/2010,2,8,09/27/2016,Hacker News,75,0.330305
2,421,dremovd,Dmitry Dryomov,02/28/2010,3,642,03/20/2019,OpenDataScience [ods.ai],8,0.285685
3,993,benhamner,Ben Hamner,05/31/2010,3,4,09/23/2016,Kaggle,149,0.343211
4,2505,jeffmoser,Jeff Moser,08/21/2010,5,4,09/23/2016,Kaggle,13,0.275522


#### Betweeness Centrality

In [16]:
userBetweennessCen = pd.DataFrame.from_dict(dict(nx.betweenness_centrality(graph)), orient='index', columns=["BetwCen"]).reset_index()
userBetweennessCen = userBetweennessCen.rename(columns={"index": "UserId"})

userBetweennessCen

Unnamed: 0,UserId,BetwCen
0,368,0.004223
1,993,0.011314
2,67483,0.009276
3,391404,0.010439
4,1950,0.000486
...,...,...
6941,425570,0.000000
6942,317511,0.000000
6943,395871,0.000000
6944,429569,0.000000


In [17]:
userInfo = pd.merge(userInfo, userBetweennessCen, left_on="Id", right_on="UserId")
userInfo = userInfo.drop(["UserId"], axis=1)

userInfo.head()

Unnamed: 0,Id,UserName,DisplayName,RegisterDate,PerformanceTier,OrganizationId,JoinDate,Name,Degree,ClosCen,BetwCen
0,1,kaggleteam,Kaggle Team,03/24/2011,5,322,12/14/2016,Defence Science & Technology Laboratory,12,0.27376,0.001032
1,368,antgoldbloom,Anthony Goldbloom,01/20/2010,2,8,09/27/2016,Hacker News,75,0.330305,0.004223
2,421,dremovd,Dmitry Dryomov,02/28/2010,3,642,03/20/2019,OpenDataScience [ods.ai],8,0.285685,4.4e-05
3,993,benhamner,Ben Hamner,05/31/2010,3,4,09/23/2016,Kaggle,149,0.343211,0.011314
4,2505,jeffmoser,Jeff Moser,08/21/2010,5,4,09/23/2016,Kaggle,13,0.275522,0.000271


In [18]:
userInfo.to_csv('UserInfo.csv')