In [1]:
import pandas as pd
import numpy as np
from glob import glob
import re, os, datetime
import json
from datetime import date
# Let displayed frames show up to 240 characters per cell before truncating.
pd.set_option('max_colwidth',240)

## READ the file

In [11]:
# Read every ID column with dtype `object` instead of letting read_csv infer
# a numeric type for it.
dtypes = {'original_tweet_id': 'object', 'response_to_status_id': 'object', 'tweet_id': object, 'user_id': 'object'}

In [12]:
# Load the tweet sample; ID columns follow `dtypes`, `date_time` is parsed
# into datetimes.
df = pd.read_csv(
    "../data_samples/tweets/2017-01-15-klimaat_tweets.csv",
    dtype=dtypes,
    parse_dates=["date_time"],
)

In [13]:
# Overview of the raw frame: columns, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4012 entries, 0 to 4011
Data columns (total 19 columns):
Unnamed: 0             4012 non-null int64
index                  4012 non-null int64
tweet_id               4012 non-null object
original_tweet_id      2305 non-null object
original_tweet_user    2305 non-null object
date_time              4012 non-null datetime64[ns]
user_id                4012 non-null object
username               4012 non-null object
followers              4012 non-null int64
friends                4012 non-null int64
user_description       3544 non-null object
statuses               4012 non-null int64
text                   4012 non-null object
nr_likes               4012 non-null int64
users_likes            3994 non-null object
nr_RT                  4012 non-null int64
entities               4012 non-null object
nr_responses           227 non-null float64
users_responses        227 non-null object
dtypes: datetime64[ns](1), float64(1), int64(7), object(10)

# PREPARE for analysis

In [14]:
# Drop the "Unnamed: 0" column (presumably an index saved with the CSV) and
# turn empty-string cells into NaN.
# drop() already returns a new frame, so no explicit copy() is needed, and a
# literal replace states the intent directly instead of relying on how pandas
# treats an empty regex pattern.
data = df.drop("Unnamed: 0", axis=1).replace("", np.nan)

## deduplicate

In [15]:
# When a tweet_id appears more than once, keep only its last occurrence.
dataset = data.drop_duplicates(subset=["tweet_id"], keep="last")

In [16]:
# Row count after de-duplication.
len(dataset)

4012

In [17]:
# Same overview as for the raw frame, now on the de-duplicated data.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4012 entries, 0 to 4011
Data columns (total 18 columns):
index                  4012 non-null int64
tweet_id               4012 non-null object
original_tweet_id      2305 non-null object
original_tweet_user    2305 non-null object
date_time              4012 non-null datetime64[ns]
user_id                4012 non-null object
username               4012 non-null object
followers              4012 non-null int64
friends                4012 non-null int64
user_description       3544 non-null object
statuses               4012 non-null int64
text                   4012 non-null object
nr_likes               4012 non-null int64
users_likes            3994 non-null object
nr_RT                  4012 non-null int64
entities               4012 non-null object
nr_responses           227 non-null float64
users_responses        227 non-null object
dtypes: datetime64[ns](1), float64(1), int64(6), object(10)
memory usage: 595.5+ KB


In [18]:
# Count the tweets whose text does not contain "RT".
# NOTE(review): this is a case-sensitive substring test, so a tweet with "RT"
# anywhere in its text is excluded, not only retweets — confirm that is the
# intended definition.
len(dataset[~dataset.text.str.contains("RT", na=False)])

1684

## extract mentions

Extraction of *mentions* was added late, so the screen names are recovered here with a quick-and-dirty regular expression over the `entities` column.

In [19]:
def extract_mentions(x):
    """Collect the screen names found in an ``entities`` string.

    Every ``screen_name': '<name>`` occurrence in ``x`` contributes its
    ``<name>`` part (letters, digits and underscores).

    Parameters
    ----------
    x : str
        Text to scan.

    Returns
    -------
    str or float
        The names joined with ';' (an empty string when none are found),
        or ``np.nan`` when ``x`` cannot be searched, e.g. a NaN cell.
    """
    marker = "screen_name': '"
    try:
        hits = re.findall("screen_name': '[A-Za-z0-9_]*", x)
    except (AttributeError, TypeError):
        # Non-string input such as NaN cannot be searched.
        return np.nan

    # Each hit starts with the marker; keep only the name that follows it.
    return ";".join(hit[len(marker):] for hit in hits)

In [20]:
# Add the column with assign() rather than item assignment: `dataset` is the
# result of drop_duplicates(), and writing into it in place can trigger
# pandas' SettingWithCopyWarning. Rebinding also keeps the cell re-runnable.
dataset = dataset.assign(mentions=dataset["entities"].apply(extract_mentions))

# CREATE a dataset for Gephi

Gephi needs network data in a specific format, which required a lot of cutting and gluing together. See [Gephi reference for csv format](https://gephi.org/users/supported-graph-formats/csv-format/)

In [21]:
def splitit(df, column, colname):
    """Split a ';'-separated string column into one value per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to split.
    column : str
        Name of the column whose cells are ';'-joined strings (or NaN).
    colname : str
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        One entry per split-off value, indexed by the label of the row in
        ``df`` it came from (labels repeat when a cell held several values).
        Missing cells contribute no entries; empty or whitespace-only values
        are returned as NaN.
    """
    # One column per position in the ';'-separated list.
    pieces = df[column].str.split(';', expand=True)

    # Long format: drop the padding of shorter lists and the position level.
    s = pieces.stack().dropna().reset_index(level=1, drop=True)
    s.name = colname

    # Only blank values become NaN. The former pattern r'[?]?' can match the
    # empty string, so it matched every value and turned all of them into NaN.
    return s.replace(r'^\s*$', np.nan, regex=True)

### Likes & replies

In [22]:
# Columns needed for the like and reply edges.
d = dataset[["username", "users_likes", "users_responses"]]
# The tweet's author is the edge target.
n = d[["username"]].rename(columns={"username": "target"})

In [23]:
# Split the ';'-joined user lists into one entry per user; these users are
# the edge sources.
likes = splitit(d, "users_likes", "source")
replies = splitit(d, "users_responses", "source")

In [24]:
# Rows without an original_tweet_user are not retweets; only those are used
# for the mention edges.
d2 = dataset.loc[
    dataset["original_tweet_user"].isnull(),
    ["username", "mentions"],
]
# The tweet's author is the edge source.
m = d2[["username"]].rename(columns={"username": "source"})

### Mentions

In [25]:
# One entry per mentioned screen name; these are the edge targets.
mentions = splitit(d2, "mentions", "target")

### Re-tweets

In [26]:
# Retweet edges: the retweeting user is the source, the original author the
# target. Empty cells become NaN so that the merge step can drop them; a
# literal replace is used instead of an empty regex pattern.
rt = (
    dataset[["username", "original_tweet_user"]]
    .rename(columns={"username": "source", "original_tweet_user": "target"})
    .replace("", np.nan)
)

### MERGE

In [27]:
# Stack the four edge lists (likes, replies, mentions, retweets) and drop
# every edge that lacks a source or a target.
# pd.concat replaces the chained DataFrame.append calls (append is deprecated
# and no longer exists in pandas 2.x); selecting the columns explicitly keeps
# the exported column order the same regardless of the pandas version.
network = pd.concat(
    [n.join(likes), n.join(replies), m.join(mentions), rt]
).dropna(axis=0)[["source", "target"]]

## REMOVE tweets that have nothing to do with climate

We hand-picked user names that would otherwise have a prominent place in the analysis even though their tweets were not on topic.

In [28]:
# Hand-picked user names to exclude from the network.
eruit = ["terzaketv","stevencraneTV","rivliv","modernezooi","manjureijmer"]

In [29]:
# Keep only the edges whose target is not one of the excluded user names.
network = network.loc[~network["target"].isin(eruit)]

# EXPORT

edges table

In [32]:
# Write the edge list without the index column.
# NOTE(review): this writes under "../data_sample/", while the input CSV is
# read from "../data_samples/" — confirm which directory name is intended.
network.to_csv("../data_sample/gephi/edges.csv", index=False)

nodes table

In [39]:
# Node table: one row per user name with the number of tweets in `dataset`,
# written with the headers "Id" and "tweets".
# NOTE(review): the output directory is "../data_sample/", whereas the input
# is read from "../data_samples/" — confirm which name is intended.
(
    dataset["username"]
    .value_counts()
    .to_frame()
    .reset_index()
    .to_csv(
        "../data_sample/gephi/nodes.csv",
        header=["Id","tweets"],
        index=False,
    )
)

Use these two sheets in Gephi `Data Laboratory` > `Import Spreadsheet`.<br>Pay attention to the `As table` dropdown. Import first the `edges table` and then the `nodes table` (`tweets` are an integer!).<br>In the `Overview` tab, use the `Force Atlas 2` layout and watch the bubble unfold