In [1]:
#!pip install igraph

In [2]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datetime import datetime
import time
import igraph as ig

In [3]:
# I import the files
g1 = pd.read_csv('sx-stackoverflow-a2q.txt', delimiter=' ', header=None)
g2 = pd.read_csv('sx-stackoverflow-c2a.txt', delimiter=' ', header=None)
g3 = pd.read_csv('sx-stackoverflow-c2q.txt', delimiter=' ', header=None)

In [4]:
# I create the pandas dataframes
a2q = pd.DataFrame(g1)
c2a = pd.DataFrame(g2)
c2q = pd.DataFrame(g3)

In [5]:
# I give each column a name
a2q.columns = ['Source', 'Target', 'Time']
c2a.columns = ['Source', 'Target', 'Time']
c2q.columns = ['Source', 'Target', 'Time']

In [6]:
a2q.head()
# They obviously are different between each other, but they have the same structure.

Unnamed: 0,Source,Target,Time
0,9,8,1217567877
1,1,1,1217573801
2,13,1,1217606247
3,17,1,1217617639
4,48,2,1217618182


In [7]:
# First of all I take all the rows in which the Source is different from the Target, 
# in order to drop every element that can be connected with itself.
new_c2q = c2q[c2q["Source"]!=c2q["Target"]]
new_a2q = a2q[a2q["Source"]!=a2q["Target"]]
new_c2a = c2a[c2a["Source"]!=c2a["Target"]]

In [8]:
new_c2q.head()

Unnamed: 0,Source,Target,Time
1,242,184,1220733503
2,4213,4946,1220768149
4,2658,1874,1220771891
5,4035,1874,1220773037
6,2257,4489,1220802041


In [9]:
# Initial number of rows
print('new_c2q row count is:', len(new_c2q.index))
print('new_a2q row count is:', len(new_a2q.index))
print('new_c2a row count is:', len(new_c2a.index))

new_c2q row count is: 13664641
new_a2q row count is: 16703594
new_c2a row count is: 17535031


In [10]:
# I trasform into the a2q dataframe 'answers to questions', the seconds into dates
ans = new_a2q.copy()
ans.loc[:, 'Time'] = pd.to_datetime(ans['Time'], unit='s')
ans

Unnamed: 0,Source,Target,Time
0,9,8,2008-08-01 05:17:57
2,13,1,2008-08-01 15:57:27
3,17,1,2008-08-01 19:07:19
4,48,2,2008-08-01 19:16:22
5,17,1,2008-08-01 19:17:19
...,...,...,...
17823520,2773607,1048138,2016-03-06 12:16:21
17823521,6018278,1982354,2016-03-06 12:16:29
17823522,3187183,1404306,2016-03-06 12:17:30
17823523,6022341,1667278,2016-03-06 12:17:40


In [11]:
# I take just the year information
ans['Year'] = pd.DatetimeIndex(ans['Time']).year

In [12]:
# To be safe, I create a copy of my dataframe
answers = ans.copy()
# I arbitrary decide to take just the data between 2013 and 2016
ipt = (answers['Year'] >= 2013) & (answers['Year'] <= 2016)
answers = answers.loc[ipt]
# I delete the 'Time' column because I don't need it anymore
del answers['Time'] 
# This is what I obtain:
answers

Unnamed: 0,Source,Target,Year
7563262,941240,584508,2013
7563263,16007,1176091,2013
7563265,1867379,1896848,2013
7563266,1935971,1109988,2013
7563267,1245254,1939771,2013
...,...,...,...
17823520,2773607,1048138,2016
17823521,6018278,1982354,2016
17823522,3187183,1404306,2016
17823523,6022341,1667278,2016


In [13]:
# Same steps for the 'comment to questions' dataframe
commq = new_c2q.copy()
commq.loc[:, 'Time'] = pd.to_datetime(commq['Time'], unit='s')
commq

Unnamed: 0,Source,Target,Time
1,242,184,2008-09-06 20:38:23
2,4213,4946,2008-09-07 06:15:49
4,2658,1874,2008-09-07 07:18:11
5,4035,1874,2008-09-07 07:37:17
6,2257,4489,2008-09-07 15:40:41
...,...,...,...
20268145,4555367,5308117,2016-03-06 14:05:33
20268146,1177890,4917254,2016-03-06 14:06:56
20268147,3345375,4022289,2016-03-06 14:08:48
20268148,3507137,1801524,2016-03-06 14:09:51


In [14]:
commq['Year'] = pd.DatetimeIndex(commq['Time']).year

In [15]:
c_questions = commq.copy()
ipt1 = (c_questions['Year'] >= 2013) & (c_questions['Year'] <= 2016)
c_questions = c_questions.loc[ipt1]
del c_questions['Time']
c_questions

Unnamed: 0,Source,Target,Year
5486966,1570534,1133188,2013
5486967,861716,904946,2013
5486968,147192,837451,2013
5486969,16007,1176091,2013
5486971,638216,538034,2013
...,...,...,...
20268145,4555367,5308117,2016
20268146,1177890,4917254,2016
20268147,3345375,4022289,2016
20268148,3507137,1801524,2016


In [16]:
# Same steps for the 'comment to answers' dataframe
comma = new_c2a.copy()
comma.loc[:, 'Time'] = pd.to_datetime(comma['Time'], unit='s')
comma

Unnamed: 0,Source,Target,Time
0,1,91,2008-09-06 15:07:10
1,3,91,2008-09-06 15:09:52
2,380,350,2008-09-06 15:42:16
3,4642,2257,2008-09-06 20:51:47
4,4642,1324220,2008-09-06 21:15:46
...,...,...,...
25405369,144088,347727,2016-03-06 14:08:58
25405370,5878860,1330341,2016-03-06 14:09:12
25405371,144088,98207,2016-03-06 14:09:26
25405372,4049257,3816212,2016-03-06 14:09:31


In [17]:
comma['Year'] = pd.DatetimeIndex(comma['Time']).year

In [18]:
c_answers = comma.copy()
ipt2 = (c_answers['Year'] >= 2013) & (c_answers['Year'] <= 2016)
c_answers = c_answers.loc[ipt2]
del c_answers['Time']
c_answers

Unnamed: 0,Source,Target,Year
10001544,1897887,1202025,2013
10001545,1906265,116747,2013
10001546,1936577,1396822,2013
10001547,1497428,1868384,2013
10001548,1897887,1202025,2013
...,...,...,...
25405369,144088,347727,2016
25405370,5878860,1330341,2016
25405371,144088,98207,2016
25405372,4049257,3816212,2016


Now we have three different dataframes:
- 'answers' which has 3 columns and 9520434 rows (it represents answers to questions);
- 'c_questions' which has 3 columns and 9885202 rows (it represents comments to questions);
- 'c_answers' which has 3 columns and 10494445 rows (it represents comments to answers).

## 3 graphs together

# 4. Algorithmic question

In [19]:
# Here I create a Dictionary in order to define the nodes and their neighbours.

n_kids = { "A" : ["B", "C"],
          "B" : ["D", "E", "A"],
          "C" : ["A"],
          "D" : ["B", "E"],
          "E" : ["B", "D"]
}

# Here I create a Function in order to define the edges between the nodes.

def quarrel(n_kids):
    
    # Initialize empty list of edges
    quarrel = [] 
    
    # Initialize empty list for the first dormitory
    s1 = []
    
    # Initialize empty list for the second dormitory
    s2 = []
    
    for node in n_kids:
        for neighbour in n_kids[node]:
            if (neighbour, node) not in quarrel:
                quarrel.append((node, neighbour))
            if (neighbour, node) in quarrel:
                s1.append(node)
                s2.append(neighbour)
                
                # Even if I can take more than once a kid, I just want to know where he goes
                s1 = set(s1) 
                s2 = set(s2)
                
                # Just for taking a look at the output
                print(s1,s2)
                
                return 1
            else:
                break

print(quarrel(n_kids))

{'D'} {'B'}
1
