In [1]:
import pandas as pd
import networkx as nx

### Users

In [2]:
# Load the user table. Rows the parser cannot handle are skipped instead of
# raising, and index_col=False keeps the first CSV column as a regular column.
users = pd.read_csv(
    "Users.csv",
    engine='python',
    encoding='utf-8',
    on_bad_lines='skip',
    index_col=False,
)

users.head()

Unnamed: 0,Id,UserName,DisplayName,RegisterDate,PerformanceTier
0,1,kaggleteam,Kaggle Team,03/24/2011,5
1,368,antgoldbloom,Anthony Goldbloom,01/20/2010,2
2,381,iguyon,Isabelle,01/29/2010,2
3,383,davidstephan,David Stephan,02/01/2010,0
4,384,gabewarren,Gabe Warren,02/02/2010,0


### User Followers

In [3]:
# Load the follow relations (UserId / FollowingUserId pairs with a creation
# date). Rows the parser cannot handle are skipped instead of raising.
userFollowers = pd.read_csv(
    "UserFollowers.csv",
    engine='python',
    encoding='utf-8',
    on_bad_lines='skip',
    index_col=False,
)

userFollowers.head()

Unnamed: 0,Id,UserId,FollowingUserId,CreationDate
0,64,368,993,05/23/2017
1,122289,368,67483,08/07/2018
2,323230,368,1663522,09/18/2019
3,412502,368,1302389,03/08/2020
4,447841,368,391404,04/27/2020


### User Organizations

In [4]:
# Load the user-to-organization memberships; index_col=False keeps the first
# CSV column as a regular column instead of promoting it to the index.
userOrganizations = pd.read_csv(
    "UserOrganizations.csv",
    engine='python',
    index_col=False,
)

userOrganizations.head()

Unnamed: 0,Id,UserId,OrganizationId,JoinDate
0,1,13209,2,09/13/2016
1,3,993,4,09/23/2016
2,4,699407,4,09/23/2016
3,5,368,4,09/23/2016
4,6,2505,4,09/23/2016


In [5]:
# Keep only the columns needed to attach an organization to each user.
userOrganizations = userOrganizations[["UserId", "OrganizationId", "JoinDate"]]

# JoinDate is read from the CSV as "MM/DD/YYYY" text, so taking the max of the
# raw strings would order memberships month-first (e.g. "12/14/2016" would beat
# "03/20/2019"). Parse to real dates for the comparison only; the stored
# JoinDate column keeps its original text form.
joinDates = pd.to_datetime(userOrganizations["JoinDate"], format="%m/%d/%Y")
latestJoinDate = joinDates.groupby(userOrganizations["UserId"]).transform("max")

# Keep each user's most recent membership. Memberships tied on the latest date
# are all kept, so a user can still appear more than once.
userOrganizations = userOrganizations[joinDates == latestJoinDate]
userOrganizations.sort_values(["UserId"]).head()

Unnamed: 0,UserId,OrganizationId,JoinDate
153,1,322,12/14/2016
16,368,8,09/27/2016
1303,421,642,03/20/2019
1,993,4,09/23/2016
4,2505,4,09/23/2016


### Organizations

In [6]:
# Load the organization table; index_col=False keeps the first CSV column as a
# regular column instead of promoting it to the index.
organizations = pd.read_csv(
    "Organizations.csv",
    engine='python',
    index_col=False,
)

organizations.head()

Unnamed: 0,Id,Name,Slug,CreationDate,Description
0,2,Facebook,facebook,07/25/2016,Facebook was built to help people connect and ...
1,3,Figure Eight,crowdflower,08/20/2016,[Figure Eight](https://www.figure-eight.com/) ...
2,4,Kaggle,kaggle,09/23/2016,Kaggle is a community of data scientists and d...
3,5,Last-Place Ltd.,lastplaceltd,09/27/2016,
4,6,CWILOC,cwiloc,09/27/2016,[Climatological Database for the World's Ocean...


## Network

In [7]:
# Start from the raw follow relations and keep only the edge columns.
followEdges = userFollowers[["UserId", "FollowingUserId", "CreationDate"]]

# User-name lookup, already renamed for the FollowingUserId side of the edge.
followingNames = users[["Id", "UserName"]].rename(columns={"UserName": "FollowingUserName"})

network = (
    followEdges
    # Attach the name of the user referenced by FollowingUserId.
    .merge(followingNames, left_on="FollowingUserId", right_on="Id")
    .drop(columns=["Id"])
    # Attach the Id and name of the user referenced by UserId.
    .merge(users, left_on="UserId", right_on="Id")
    .loc[:, ["Id", "UserName", "FollowingUserId", "FollowingUserName", "CreationDate"]]
)

# Both merges are inner joins, so edges whose endpoints are missing from
# `users` are dropped. CreationDate is parsed from "MM/DD/YYYY" text.
network = network.assign(
    CreationDate=pd.to_datetime(network["CreationDate"], format='%m/%d/%Y')
)
network

Unnamed: 0,Id,UserName,FollowingUserId,FollowingUserName,CreationDate
0,368,antgoldbloom,993,benhamner,2017-05-23
1,368,antgoldbloom,67483,kmader,2018-08-07
2,368,antgoldbloom,1663522,lavanyashukla01,2019-09-18
3,368,antgoldbloom,1302389,imdevskp,2020-03-08
4,368,antgoldbloom,391404,parulpandey,2020-04-27
...,...,...,...,...,...
1077644,11269228,mlxg777,9835069,asdadasdasdas,2023-01-16
1077645,11736128,manish33221,10552707,muralidharbhusal,2023-01-16
1077646,2125482,shirishkz,7958178,petchznt,2023-01-16
1077647,2760098,gourang019,11618629,arjunsehajpal,2023-01-16


In [8]:
# Write the network table to Network.csv, omitting the DataFrame index.
network.to_csv(path_or_buf='Network.csv', index=False)

### User Information

In [9]:
# Resolve each membership's OrganizationId to the organization name.
organizationNames = organizations[["Id", "Name"]]
membershipsWithName = (
    userOrganizations
    .merge(organizationNames, left_on="OrganizationId", right_on="Id")
    .drop(columns=["Id"])
)

# Attach the membership to the user record. This is an inner join, so users
# without a membership row are not part of userInfo.
userInfo = (
    users
    .merge(membershipsWithName, left_on="Id", right_on="UserId")
    .drop(columns=["UserId"])
)

userInfo.head()

Unnamed: 0,Id,UserName,DisplayName,RegisterDate,PerformanceTier,OrganizationId,JoinDate,Name
0,1,kaggleteam,Kaggle Team,03/24/2011,5,322,12/14/2016,Defence Science & Technology Laboratory
1,368,antgoldbloom,Anthony Goldbloom,01/20/2010,2,8,09/27/2016,Hacker News
2,421,dremovd,Dmitry Dryomov,02/28/2010,3,642,03/20/2019,OpenDataScience [ods.ai]
3,993,benhamner,Ben Hamner,05/31/2010,3,4,09/23/2016,Kaggle
4,2505,jeffmoser,Jeff Moser,08/21/2010,5,4,09/23/2016,Kaggle


### Network info over time

In [10]:
def createGraphByYear(globalNetwork, year):
  """Build a directed graph from the edges created before January 1 of `year`.

  Parameters:
    globalNetwork: DataFrame with "Id", "FollowingUserId" and "CreationDate"
      columns; "CreationDate" is compared against the string "<year>-01-01".
    year: Cut-off year; only rows with CreationDate strictly before the first
      day of this year are used.

  Returns:
    An nx.DiGraph with one edge Id -> FollowingUserId per selected row.
  """
  cutoff = f"{year}-01-01"
  createdBeforeCutoff = globalNetwork['CreationDate'] < cutoff

  # Only the two endpoint columns are needed to build the edge list.
  edges = globalNetwork.loc[createdBeforeCutoff, ["Id", "FollowingUserId"]]

  return nx.from_pandas_edgelist(
    edges,
    source='Id',
    target='FollowingUserId',
    create_using=nx.DiGraph(),
  )

In [11]:
def addFeatureByUserId(generalTable, newTable, how="inner"):
  """Join the columns of `newTable` onto `generalTable` by user id.

  Parameters:
    generalTable: DataFrame with an "Id" column identifying the user.
    newTable: DataFrame with a "UserId" column plus the feature column(s) to add.
    how: Join type passed to DataFrame.merge. The default "inner" keeps the
      original behavior, which drops every row of `generalTable` whose Id is
      missing from `newTable`. Pass "left" to keep all rows of `generalTable`
      and get NaN for users without a feature value.

  Returns:
    A new DataFrame with the columns of `generalTable` followed by the feature
    columns of `newTable`; the redundant "UserId" key column is removed.
  """
  return (
    generalTable
    .merge(newTable, how=how, left_on="Id", right_on="UserId")
    .drop(columns=["UserId"])
  )

#### Degree

In [12]:
def addDegree(generalInfo, year, graph):
  """Add a "Degree<year>" column holding each user's degree in `graph`.

  Parameters:
    generalInfo: DataFrame with an "Id" column; passed to addFeatureByUserId.
    year: Value appended to "Degree" to form the new column name.
    graph: Graph whose `degree` view maps node id to degree.

  Returns:
    The result of addFeatureByUserId(generalInfo, <degree table>).
  """
  columnName = "Degree" + str(year)
  degreeByUser = dict(graph.degree)

  # The node ids become the index; name it "UserId" and turn it into a column.
  newFeature = (
    pd.DataFrame.from_dict(degreeByUser, orient='index', columns=[columnName])
    .rename_axis("UserId")
    .reset_index()
  )

  return addFeatureByUserId(generalInfo, newFeature)

##### Indegree

In [13]:
def addInDegree(generalInfo, year, graph):
  """Add an "InDegree<year>" column from nx.in_degree_centrality(graph).

  Note that the values are the centrality scores returned by networkx, not raw
  in-degree counts (unlike the "Degree<year>" column built by addDegree).

  Parameters:
    generalInfo: DataFrame with an "Id" column; passed to addFeatureByUserId.
    year: Value appended to "InDegree" to form the new column name.
    graph: Directed graph to compute the centrality on.

  Returns:
    The result of addFeatureByUserId(generalInfo, <centrality table>).
  """
  columnName = "InDegree" + str(year)
  centralityByUser = dict(nx.in_degree_centrality(graph))

  # The node ids become the index; name it "UserId" and turn it into a column.
  newFeature = (
    pd.DataFrame.from_dict(centralityByUser, orient='index', columns=[columnName])
    .rename_axis("UserId")
    .reset_index()
  )

  return addFeatureByUserId(generalInfo, newFeature)

##### Outdegree

In [14]:
def addOutDegree(generalInfo, year, graph):
  """Add an "OutDegree<year>" column from nx.out_degree_centrality(graph).

  Note that the values are the centrality scores returned by networkx, not raw
  out-degree counts (unlike the "Degree<year>" column built by addDegree).

  Parameters:
    generalInfo: DataFrame with an "Id" column; passed to addFeatureByUserId.
    year: Value appended to "OutDegree" to form the new column name.
    graph: Directed graph to compute the centrality on.

  Returns:
    The result of addFeatureByUserId(generalInfo, <centrality table>).
  """
  columnName = "OutDegree" + str(year)
  centralityByUser = dict(nx.out_degree_centrality(graph))

  # The node ids become the index; name it "UserId" and turn it into a column.
  newFeature = (
    pd.DataFrame.from_dict(centralityByUser, orient='index', columns=[columnName])
    .rename_axis("UserId")
    .reset_index()
  )

  return addFeatureByUserId(generalInfo, newFeature)

#### Closeness Centrality

In [15]:
def addClosenessCen(generalInfo, year, graph):
  """Add a "ClosCen<year>" column from nx.closeness_centrality(graph).

  Parameters:
    generalInfo: DataFrame with an "Id" column; passed to addFeatureByUserId.
    year: Value appended to "ClosCen" to form the new column name.
    graph: Graph to compute the centrality on.

  Returns:
    The result of addFeatureByUserId(generalInfo, <centrality table>).
  """
  columnName = "ClosCen" + str(year)
  centralityByUser = dict(nx.closeness_centrality(graph))

  # The node ids become the index; name it "UserId" and turn it into a column.
  newFeature = (
    pd.DataFrame.from_dict(centralityByUser, orient='index', columns=[columnName])
    .rename_axis("UserId")
    .reset_index()
  )

  return addFeatureByUserId(generalInfo, newFeature)

#### Betweenness Centrality

In [16]:
def addBetweennessCen(generalInfo, year, graph):
  """Add a "BetwCen<year>" column from nx.betweenness_centrality(graph).

  Parameters:
    generalInfo: DataFrame with an "Id" column; passed to addFeatureByUserId.
    year: Value appended to "BetwCen" to form the new column name.
    graph: Graph to compute the centrality on.

  Returns:
    The result of addFeatureByUserId(generalInfo, <centrality table>).
  """
  columnName = "BetwCen" + str(year)
  centralityByUser = dict(nx.betweenness_centrality(graph))

  # The node ids become the index; name it "UserId" and turn it into a column.
  newFeature = (
    pd.DataFrame.from_dict(centralityByUser, orient='index', columns=[columnName])
    .rename_axis("UserId")
    .reset_index()
  )

  return addFeatureByUserId(generalInfo, newFeature)

### Creating and saving the global table

In [17]:
# Years for which a network snapshot is built (both ends inclusive).
firstYear, lastYear = 2018, 2022
years = range(firstYear, lastYear + 1)

In [18]:
# Add one set of degree features per year to userInfo.
for year in years:
  print(year)
  graphByYear = createGraphByYear(network, year)

  # Drop this year's feature columns if they already exist, so re-running the
  # cell replaces them instead of producing "_x"/"_y" suffixed duplicates from
  # the merge. On a first run nothing is dropped.
  yearColumns = [prefix + str(year) for prefix in ("Degree", "InDegree", "OutDegree")]
  userInfo = userInfo.drop(columns=yearColumns, errors="ignore")

  userInfo = addDegree(userInfo, year, graphByYear)
  userInfo = addInDegree(userInfo, year, graphByYear)
  userInfo = addOutDegree(userInfo, year, graphByYear)
  # Closeness and betweenness centrality are currently disabled.
  # NOTE(review): presumably for runtime reasons on a graph this size - confirm.
  # userInfo = addClosenessCen(userInfo, year, graphByYear)
  # userInfo = addBetweennessCen(userInfo, year, graphByYear)

2018
2019
2020
2021
2022


In [19]:
# Preview the first rows of the table with the per-year features attached.
userInfo.head(n=5)

Unnamed: 0,Id,UserName,DisplayName,RegisterDate,PerformanceTier,OrganizationId,JoinDate,Name,Degree2018,InDegree2018,...,OutDegree2019,Degree2020,InDegree2020,OutDegree2020,Degree2021,InDegree2021,OutDegree2021,Degree2022,InDegree2022,OutDegree2022
0,1,kaggleteam,Kaggle Team,03/24/2011,5,322,12/14/2016,Defence Science & Technology Laboratory,14,0.00066,...,0.0,154,0.001179,0.0,185,0.000854,0.0,206,0.000647,0.0
1,368,antgoldbloom,Anthony Goldbloom,01/20/2010,2,8,09/27/2016,Hacker News,97,0.004526,...,2.9e-05,438,0.00333,2.3e-05,693,0.003176,2.3e-05,890,0.002775,1.9e-05
2,421,dremovd,Dmitry Dryomov,02/28/2010,3,642,03/20/2019,OpenDataScience [ods.ai],3,0.000141,...,0.0,30,0.00023,0.0,39,0.00018,0.0,46,0.000138,6e-06
3,993,benhamner,Ben Hamner,05/31/2010,3,4,09/23/2016,Kaggle,404,0.018151,...,0.000289,993,0.007449,0.000153,1256,0.005696,0.000102,1423,0.004395,7.2e-05
4,2505,jeffmoser,Jeff Moser,08/21/2010,5,4,09/23/2016,Kaggle,22,0.000943,...,2.9e-05,43,0.000314,1.5e-05,64,0.000286,9e-06,83,0.000254,6e-06


In [20]:
# Write the user table to UserInfo.csv, omitting the DataFrame index.
userInfo.to_csv(path_or_buf='UserInfo.csv', index=False)