# Twitter Konversationen zu einem Thema als Netzwerk untersuchen

- Aus Twitter-Daten kann man besonders gut Netzwerke basteln.
- Dabei können wir frei definieren, wann eigentlich ein Nutzer mit einem anderen verbunden ist. Die gebräuchlichsten Definitionen sind:
    1. Nutzer A retweetet Nutzer B (RT plotti was für ein super tweet)
    2. Nutzer A erwähnt Nutzer B (Ich geh da so die Straße lang und seh @plotti)
    3. Nutzer A schreibt Nutzer B (@plotti was geht heute)
    4. (Nutzer A folgt Nutzer B (Leider für die Struktur einer Konversation nicht sooo hilfreich. Außerdem muss man über Twarc recht viele User sammeln, um diese Information zu erhalten, es geht aber.))

# Daten Sammeln über Twarc

- https://github.com/DocNow/twarc
- Twarc: A command line tool (and Python library) for archiving Twitter JSON
- Sehr praktisch um Tweets zu einem Stichwort zu sammeln. 
- Man muss eine Twitter-App beantragen :(
- ```pip install twarc```
- ```twarc configure```
![Twarc](twarc.png)

## Daten Sammeln 

```twarc search zürich > zürich.json```

In [None]:
import sys
import json
import re
import numpy as np
from datetime import datetime
import pandas as pd  
import networkx as nx

# Path of the tweet file read by the cells below; they parse it line by line,
# expecting one JSON-encoded tweet per line.
tweetfile = 'zürich.json'

# 1. Kanten erzeugen durch Retweets
- Personen retweeten sich und deswegen erzeugen wir eine Kante zwischen ihnen.

In [None]:
# 1. Export edges from retweets
#
# Reads `tweetfile` line by line (one JSON document per line) and builds:
#   userdata -- one row for every appearance of a retweeting or retweeted user
#   edges    -- one row per retweet, pointing from the retweeter (Source) to
#               the author of the retweeted tweet (Target)
#
# Rows are gathered in plain lists and converted to DataFrames once at the
# end: appending to a DataFrame row by row copies the whole frame each time,
# and DataFrame.append no longer exists in pandas >= 2.0.

user_columns = ['Id', 'Label', 'user_created_at', 'profile_image', 'followers_count', 'friends_count']
edge_columns = ['Source', 'Target', 'Time', 'Strength']
twitter_time_format = '%a %b %d %H:%M:%S +0000 %Y'

user_rows = []
edge_rows = []

# The context manager closes the file even if a line raises.
with open(tweetfile, 'r') as fh:
    for line in fh:
        try:
            tweet = json.loads(line)
        except ValueError:
            # Not valid JSON (e.g. a blank or truncated line): skip it.
            continue
        if 'retweeted_status' not in tweet:
            continue

        retweeter = tweet['user']
        original_author = tweet['retweeted_status']['user']

        # Record the profile data of both ends of the retweet.
        for user in (retweeter, original_author):
            user_rows.append([user['id_str'],
                              user['screen_name'],
                              user['created_at'],
                              user['profile_image_url_https'],
                              user['followers_count'],
                              user['friends_count']])

        # Every retweet is a single interaction, hence Strength 1.
        edge_rows.append([retweeter['id_str'],
                          original_author['id_str'],
                          str(datetime.strptime(tweet['created_at'], twitter_time_format)),
                          1])

userdata = pd.DataFrame(user_rows, columns=user_columns)
edges = pd.DataFrame(edge_rows, columns=edge_columns)

In [None]:
# Show the first rows of the collected user table as a quick sanity check.
userdata.head()

In [None]:
# Show the first rows of the collected edge table as a quick sanity check.
edges.head()

# 2. Kanten erzeugen durch Mentions
- Personen erwähnen sich und deshalb erzeugen wir eine Kante zwischen den Personen. 

In [None]:
# 2. Export edges from mentions
#
# Reads `tweetfile` line by line (one JSON document per line) and builds:
#   userdata -- profile rows for tweeting users, plus a placeholder row
#               (id and screen name only) for mentioned users that have no
#               row yet
#   edges    -- one row per mention, pointing from the tweeting user (Source)
#               to the mentioned user (Target)
#
# The edge timestamp is stored in 'Time' and every mention counts as
# Strength 1, the same schema the retweet cell uses.

user_columns = ['Id', 'Label', 'user_created_at', 'profile_image', 'followers_count', 'friends_count']
edge_columns = ['Source', 'Target', 'Time', 'Strength']
twitter_time_format = '%a %b %d %H:%M:%S +0000 %Y'

user_rows = []
edge_rows = []
known_ids = set()  # ids that already have at least one row in user_rows

# The context manager closes the file even if a line raises.
with open(tweetfile, 'r') as fh:
    for line in fh:
        try:
            tweet = json.loads(line)
        except ValueError:
            # Not valid JSON (e.g. a blank or truncated line): skip it.
            continue

        # Lines that carry no mention entities contribute nothing.
        mentions = tweet.get('entities', {}).get('user_mentions', [])
        if not mentions:
            continue

        author = tweet['user']
        created = str(datetime.strptime(tweet['created_at'], twitter_time_format))

        # One profile row per tweet is enough; duplicates are dropped later anyway.
        user_rows.append([author['id_str'],
                          author['screen_name'],
                          author['created_at'],
                          author['profile_image_url_https'],
                          author['followers_count'],
                          author['friends_count']])
        known_ids.add(author['id_str'])

        for mention in mentions:
            # Exact id comparison: a substring match would treat "123" as
            # already present once "1234" has been seen.
            if mention['id_str'] not in known_ids:
                # The mention entity only provides id and screen name, so the
                # remaining profile fields stay empty.
                user_rows.append([mention['id_str'],
                                  mention['screen_name'],
                                  np.nan,
                                  np.nan,
                                  np.nan,
                                  np.nan])
                known_ids.add(mention['id_str'])

            edge_rows.append([author['id_str'], mention['id_str'], created, 1])

userdata = pd.DataFrame(user_rows, columns=user_columns)
edges = pd.DataFrame(edge_rows, columns=edge_columns)

# 3. Kanten erzeugen durch gemeinsame Kommunikation
- Personen diskutieren miteinander und deshalb erzeugen wir eine Kante zwischen ihnen. 

In [None]:

# 3. Export edges from replies
#
# Reads `tweetfile` line by line (one JSON document per line) and builds:
#   userdata -- profile rows for replying users, plus a placeholder row
#               (id and screen name only) for replied-to users that have no
#               row yet
#   edges    -- one row per reply, pointing from the replying user (Source)
#               to the user being replied to (Target)
#
# The edge timestamp is stored in 'Time' and every reply counts as
# Strength 1, the same schema the retweet cell uses.

user_columns = ['Id', 'Label', 'user_created_at', 'profile_image', 'followers_count', 'friends_count']
edge_columns = ['Source', 'Target', 'Time', 'Strength']
twitter_time_format = '%a %b %d %H:%M:%S +0000 %Y'

user_rows = []
edge_rows = []
known_ids = set()  # ids that already have at least one row in user_rows

# The context manager closes the file even if a line raises.
with open(tweetfile, 'r') as fh:
    for line in fh:
        try:
            tweet = json.loads(line)
        except ValueError:
            # Not valid JSON (e.g. a blank or truncated line): skip it.
            continue

        # Tweets that are not replies (field missing or null) are ignored.
        reply_to_id = tweet.get('in_reply_to_user_id_str')
        if reply_to_id is None:
            continue

        author = tweet['user']
        user_rows.append([author['id_str'],
                          author['screen_name'],
                          author['created_at'],
                          author['profile_image_url_https'],
                          author['followers_count'],
                          author['friends_count']])
        known_ids.add(author['id_str'])

        # Exact id comparison: a substring match would treat "123" as already
        # present once "1234" has been seen.
        if reply_to_id not in known_ids:
            # Only id and screen name of the replied-to user are available on
            # the tweet itself, so the remaining profile fields stay empty.
            user_rows.append([reply_to_id,
                              tweet['in_reply_to_screen_name'],
                              np.nan,
                              np.nan,
                              np.nan,
                              np.nan])
            known_ids.add(reply_to_id)

        edge_rows.append([author['id_str'],
                          reply_to_id,
                          str(datetime.strptime(tweet['created_at'], twitter_time_format)),
                          1])

userdata = pd.DataFrame(user_rows, columns=user_columns)
edges = pd.DataFrame(edge_rows, columns=edge_columns)

# Nur jene Kanten behalten, die eine gewisse Stärke haben.

In [None]:
# Minimum tie strength an edge must reach to be kept: the number of rows in
# `edges` that share the same (Source, Target) pair.
# With a level of 1 every pair that interacted at least once is kept; with a
# level of 5 only pairs with at least 5 interactions remain, i.e. only the
# strongest ties are shown.
strengthLevel = 3

# Count the interactions per directed pair, then apply the threshold.
pair_counts = edges.groupby(['Source', 'Target'])['Strength'].count().reset_index()
edges2 = pair_counts[pair_counts['Strength'] >= strengthLevel]

In [None]:
# Number of (Source, Target) pairs that passed the strength threshold.
len(edges2)

# Daten als Gephi-Netzwerk exportieren

In [None]:
def robust_decode(bs):
    '''Return ``bs`` as ASCII-only text, dropping every non-ASCII character.

    Both byte strings and text strings are accepted, so the function does not
    depend on the interpreter's default encoding. Non-ASCII bytes or
    characters are removed silently; nothing is transliterated.

    Args:
        bs: Byte string or text string to clean.

    Returns:
        A text string holding only the ASCII characters of ``bs``, in their
        original order.
    '''
    if isinstance(bs, bytes):
        # Raw bytes: keep only the bytes that are valid ASCII.
        text = bs.decode('ascii', 'ignore')
    else:
        text = bs
    # Round-trip through ASCII to strip any remaining non-ASCII characters.
    return text.encode('ascii', 'ignore').decode('ascii')

In [None]:
import sys

# Python 2 only: switch the implicit str/unicode conversion to UTF-8.
# On Python 3 `reload` is not a builtin and the hack is unnecessary, so it is
# skipped there.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf8')

# Keep one row per user id; sorting first makes the row with the highest
# follower count the one that survives.
userdata = userdata.sort_values(['Id', 'followers_count'], ascending=[True, False])
userdata = userdata.drop_duplicates(['Id'], keep='first')

# Every id that is an endpoint of an edge that passed the strength filter.
# pd.concat replaces Series.append, which no longer exists in pandas >= 2.0.
ids = pd.concat([edges2['Source'], edges2['Target']]).to_frame()
ids.columns = ['Id']
ids = ids.drop_duplicates()

nodes = pd.merge(ids, userdata, on='Id', how='left')
# Only Id and Label are written to the graph, so only those must be present;
# users known by id and screen name alone keep their label this way.
nodes = nodes.dropna(subset=['Id', 'Label'])
nodes["Label"] = nodes["Label"].astype(str)
nodes["Id"] = nodes["Id"].astype(str)


G = nx.DiGraph(name="zürich")

for node_id, label in zip(nodes["Id"], nodes["Label"]):
    G.add_node(robust_decode(node_id), label=robust_decode(label))

# Endpoints without a node row are still created implicitly by add_edge.
for source, target, strength in zip(edges2["Source"], edges2["Target"], edges2["Strength"]):
    G.add_edge(robust_decode(source), robust_decode(target), weight=int(strength))

nx.write_gexf(G, "Zürich.gexf")

# Alternativ als CSV für Kumu.io speichern

In [None]:
# Export the nodes referenced by the filtered edges, with their user
# attributes, plus the edges themselves as CSV files for Kumu.

# Keep one row per user id; sorting first makes the row with the highest
# follower count the one that survives.
userdata = userdata.sort_values(['Id', 'followers_count'], ascending=[True, False])
userdata = userdata.drop_duplicates(['Id'], keep='first')

# Every id that is an endpoint of an edge that passed the strength filter.
# pd.concat replaces Series.append, which no longer exists in pandas >= 2.0.
ids = pd.concat([edges2['Source'], edges2['Target']]).to_frame()
ids.columns = ['Id']
ids = ids.drop_duplicates()

nodes = pd.merge(ids, userdata, on='Id', how='left')

# Change column names for the Kumu import (run this when using Kumu).
nodes.columns = ['Id', 'Label', 'Date', 'Image', 'followers_count', 'friends_count']
# Rename on a copy: overwriting edges2.columns in place would make this cell
# and the Gephi export fail with KeyError('Source') when run again.
kumu_edges = edges2.rename(columns={'Source': 'From', 'Target': 'To'})

# Export nodes and edges to csv files
nodes.to_csv('nodes.csv', encoding='utf-8', index=False)
kumu_edges.to_csv('edges.csv', encoding='utf-8', index=False)