In [8]:
# Mount Google Drive
from google.colab import drive
drive.mount('/gdrive')

# Import pandas
import pandas as pd
# Used to interact with directories
import os

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


First, we need to import the data set with the harvested tweets.

In [9]:
# Takes all .csv files in a directory and write it into a single Pandas DataFrame
directory_path= "/gdrive/MyDrive/TwitterData_GroupB/Data/Tweets/"

# Create empty DataFrame to fill
tweets = pd.DataFrame()

# iterate over specified directory
for filename in os.listdir(directory_path):
    # set file path
    file_path = os.path.join(directory_path,filename)
    if os.path.isfile(file_path):
      # check if file is a .csv file
      if file_path.endswith('.csv'):
        print(f"Loading: {file_path}.")
        # Read csv into a Pandas DataFrame
        tweets_enkel = pd.read_csv(file_path, encoding='utf-8', engine='python') # specify encoding and engine to not get DtypeWarning
        # Concatenate tweets from single file to Tweets DataFrame
        tweets= pd.concat([tweets, tweets_enkel])



tweets.info()

Loading: /gdrive/MyDrive/TwitterData_GroupB/Data/Tweets/dutchTweets1.csv.
Loading: /gdrive/MyDrive/TwitterData_GroupB/Data/Tweets/dutchTweets2.csv.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1239546 entries, 0 to 610057
Data columns (total 83 columns):
 #   Column                                 Non-Null Count    Dtype  
---  ------                                 --------------    -----  
 0   id                                     1239546 non-null  int64  
 1   conversation_id                        1239546 non-null  int64  
 2   referenced_tweets.replied_to.id        742050 non-null   float64
 3   referenced_tweets.retweeted.id         300169 non-null   float64
 4   referenced_tweets.quoted.id            37537 non-null    float64
 5   author_id                              1239546 non-null  int64  
 6   in_reply_to_user_id                    750370 non-null   float64
 7   in_reply_to_username                   709298 non-null   object 
 8   retweeted_user_id                  

Then we check the first 5 rows to get a sense of how the data looks

In [10]:
tweets.head()

Unnamed: 0,id,conversation_id,referenced_tweets.replied_to.id,referenced_tweets.retweeted.id,referenced_tweets.quoted.id,author_id,in_reply_to_user_id,in_reply_to_username,retweeted_user_id,retweeted_username,...,geo.geo.bbox,geo.geo.type,geo.id,geo.name,geo.place_id,geo.place_type,matching_rules,__twarc.retrieved_at,__twarc.url,__twarc.version
0,1641773164387528704,1641773164387528704,,1.641363e+18,,393582704,,,1.460051e+18,MelissaKampers,...,,,,,,,,2023-03-31T12:03:32+00:00,https://api.twitter.com/2/tweets/search/recent...,2.13.0
1,1641773163523522560,1641773163523522560,,,,1103414720420290562,,,,,...,,,,,,,,2023-03-31T12:03:32+00:00,https://api.twitter.com/2/tweets/search/recent...,2.13.0
2,1641773160310755328,1641773160310755328,,1.641519e+18,,470999569,,,3103261.0,volkskrant,...,,,,,,,,2023-03-31T12:03:32+00:00,https://api.twitter.com/2/tweets/search/recent...,2.13.0
3,1641773156573618176,1641449801366781953,1.64145e+18,,,95203647,2776187000.0,VillaVega5,,,...,,,,,,,,2023-03-31T12:03:32+00:00,https://api.twitter.com/2/tweets/search/recent...,2.13.0
4,1641773152286961664,1641701863153360896,1.641702e+18,,,1705279506,121137100.0,hcssnl,,,...,,,,,,,,2023-03-31T12:03:32+00:00,https://api.twitter.com/2/tweets/search/recent...,2.13.0


Next, since we are only interested in tweets which are geotaged, we filter out all tweets that do not have a geotag. Again, we check the dataset to see how many geotagged tweets we harvested, and how many unique conversations this subset has.

In [11]:
# Dropping All Tweets without geolocation
geoTweets = tweets.dropna(subset=['geo.geo.bbox'])

# Check Dataset Info
print(geoTweets.shape)
print(geoTweets['conversation_id'].nunique())

(20100, 83)
15630


Now that we have assessed the raw dataset, it is time to build our data pipeline. First, I will program a function that will unpack a list of datasets, filter out ungeotagged tweets and other info we do not need and return a dataframe with all the info we need.

In [12]:
# Function to Unpack Geotagged Tweets into 1 Dataframe
def unpackTweets(datasetList, attributeList):
    # Create Empty Dataframe
    resultDF = pd.DataFrame()
    # Create List of Attributes to Keep
    attributes = attributeList
    # Loop Through Dataset List
    for dataset in datasetList:
        # Create Dataframe from Dataset
        df = pd.read_csv(dataset)
        # Drop Tweets without Geolocation
        df = df.dropna(subset=['geo.geo.bbox'])
        # Drop Tweets without Reply Info
        df = df.dropna(subset=['in_reply_to_user_id'])
        # Keep Only Specified Attributes
        df = df.loc[:, attributes]
        # Change Type of Column
        df['in_reply_to_user_id'] = df['in_reply_to_user_id'].astype('int64')
        # Concatenate Dataframes
        resultDF = pd.concat([resultDF, df], axis=0)
    return resultDF

Next, we are going to use this function to filter our original dataset into a dataset we can use for our analysis. The columns we are interested in are: 'author_id', 'text', 'conversation_id', 'in_reply_to_user_id', 'geo.geo.bbox'.

In [13]:
# Make List of Attributes to Keep
attributesToFilter = ['author_id', 'text', 'conversation_id', 'in_reply_to_user_id', 'geo.geo.bbox']
# Make List of Datasets to Unpack
#datasetsToUnpack = ['dutchTweets1.csv']
datasetsToUnpack = [file_path]
# Create Project Dataset
tweetData = unpackTweets(datasetsToUnpack, attributesToFilter)
# Check Dataset Info
print(tweetData.shape)
tweetData.info()

  df = pd.read_csv(dataset)


(7558, 5)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7558 entries, 59 to 610030
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   author_id            7558 non-null   int64 
 1   text                 7558 non-null   object
 2   conversation_id      7558 non-null   int64 
 3   in_reply_to_user_id  7558 non-null   int64 
 4   geo.geo.bbox         7558 non-null   object
dtypes: int64(3), object(2)
memory usage: 354.3+ KB


Next, we want to remove rows that are have a unique 'conversation_id' value, so that we have a dataset with (small) conversations instead of single tweets.

In [14]:
def filterOutUniqueConversations(df, column):
    mask = df[column].duplicated(keep=False)

    return df[mask]

Also, we need to check whether we have enough unique authors

In [15]:
usefulTweets = filterOutUniqueConversations(tweetData, 'conversation_id')
print(usefulTweets.shape)
print(usefulTweets['author_id'].nunique())
print(usefulTweets['in_reply_to_user_id'].nunique())
usefulTweets.head()


(3045, 5)
842
1665


Unnamed: 0,author_id,text,conversation_id,in_reply_to_user_id,geo.geo.bbox
59,219069852,@ONevermind2023 Ik ken ze altijd een softblock...,1644673059264122881,1608857537226358784,"[5.0857107, 52.3846363, 5.6059368, 52.6916409]"
1088,1032894392,"@Ikhounietvanvis @AkbasPinar1980 En dan “Ah, M...",1644726752470618114,1248177989831049216,"[2.9588621, 50.9160108, 3.0884673, 50.9993029]"
1121,324253579,@ingevanwolferen @delibrije In dat geval is mi...,1644687598978252800,228155039,"[4.9702867, 52.0262429, 5.1952468, 52.142137]"
1525,1442171543463682049,@GrunnegsWief Zeker! Ik had ‘m niet beter kunn...,1644440668834525223,1189264143624343552,"[6.0033058, 52.4404911, 6.2105671, 52.5879383]"
2177,1032894392,@Ikhounietvanvis @AkbasPinar1980 Je wilt niet ...,1644726752470618114,1248177989831049216,"[2.9588621, 50.9160108, 3.0884673, 50.9993029]"


Next, we are only interested in the tweets for which we have both the geolocation of the author and the person he/she replies to. Thus, we need to filter the dataframe in a way that we only keep rows for which both the author_id and in_reply_to_user_id are present in the dataframe, although they do not need to be in the same row.

In [16]:
# Function to Filter Out Unlocated Users
def filterOutUnlocatedUsers(df1, df2):
    # Loop Through in_reply_to_user_id Column
    for id in df1['in_reply_to_user_id']:
        # Check if ID is in author_id Column
        if id not in df2['author_id'].values:
            # Drop Row if not
            df1 = df1[df1['in_reply_to_user_id'] != id]
    return df1

# Function to Filter Out Users Who Reply to Themselves
def filterOutSelfReplies(df):
    # Loop Through Rows in Dataframe
    for index, row in df.iterrows():
        # Get in_reply_to_user_id and author_id for Row
        in_reply_to_user_id = row['in_reply_to_user_id']
        author_id = row['author_id']
        # Check if Self-Reply
        if in_reply_to_user_id == author_id:
            # Drop Row if Self-Reply
            df.drop(index, inplace=True)
    return df

Testing the dataset again

In [17]:
usefulTweets = filterOutUnlocatedUsers(usefulTweets, tweetData)
print(usefulTweets.shape)
print(usefulTweets['author_id'].nunique())
print('')
usefulTweets = filterOutSelfReplies(usefulTweets)
print(usefulTweets.shape)
print(usefulTweets['author_id'].nunique())

usefulTweets.head()

(426, 5)
245

(281, 5)
174


Unnamed: 0,author_id,text,conversation_id,in_reply_to_user_id,geo.geo.bbox
2258,940899377543110656,"@wardmarkey @Ikhounietvanvis Kom es langs he, ...",1644726752470618114,1032894392,"[5.2121062, 50.8677279, 5.4145591, 50.9815265]"
2364,1032894392,@AkbasPinar1980 @Ikhounietvanvis Daar ben ik a...,1644726752470618114,940899377543110656,"[2.9588621, 50.9160108, 3.0884673, 50.9993029]"
6601,495914854,@MrsSandvrouw Ik ben het helemaal met je eens!!!,1644682823343677441,707643871379988480,"[6.7559955, 52.1611799, 6.981174, 52.2855112]"
8149,14264386,@erwblo 🤷🏼‍♂️\n\nHou ik me totaal niet mee bezig,1644701882793828352,713333,"[5.7298822, 53.0457272, 5.9496796, 53.2433434]"
8202,713333,@remkusdevries Ik weet niet of het aantal klom...,1644701882793828352,14264386,"[4.7288999, 52.2782266, 5.0792072, 52.4312289]"


Next, we need to add a column for the location of the recipee of the tweet, so that each tweet now has both user IDs of the engagement and the location for each user.

In [18]:
def addReceiverLocation(df1, df2):
    # Rename geo.geo.bbox Column
    df1.rename(columns={'geo.geo.bbox': 'sender_location'}, inplace=True)
    # Create Empty List to Store Receiver Locations
    receiverLocations = []
    # Loop Through Rows in Dataframe
    for index1, row1 in df1.iterrows():
        # Get Receiver ID
        receiverID = row1['in_reply_to_user_id']
        # Get Receiver Location
        recieverLocation = []
        for index2, row2 in df2.iterrows():
            if receiverID == row2['author_id']:
                recieverLocation = row2['geo.geo.bbox']
                break
        # Append Receiver Location to List
        receiverLocations.append(recieverLocation)
    # Add Receiver Locations to Dataframe
    df1['receiver_location'] = receiverLocations
    return df1

Test the dataset again

In [19]:
usefultweets = addReceiverLocation(usefulTweets, tweetData)
# Assess Dataset
print(usefulTweets.shape)
print(usefulTweets['author_id'].nunique())

usefulTweets.head()

(281, 6)
174


Unnamed: 0,author_id,text,conversation_id,in_reply_to_user_id,sender_location,receiver_location
2258,940899377543110656,"@wardmarkey @Ikhounietvanvis Kom es langs he, ...",1644726752470618114,1032894392,"[5.2121062, 50.8677279, 5.4145591, 50.9815265]","[2.9588621, 50.9160108, 3.0884673, 50.9993029]"
2364,1032894392,@AkbasPinar1980 @Ikhounietvanvis Daar ben ik a...,1644726752470618114,940899377543110656,"[2.9588621, 50.9160108, 3.0884673, 50.9993029]","[5.2121062, 50.8677279, 5.4145591, 50.9815265]"
6601,495914854,@MrsSandvrouw Ik ben het helemaal met je eens!!!,1644682823343677441,707643871379988480,"[6.7559955, 52.1611799, 6.981174, 52.2855112]","[6.8263158, 52.6327357, 7.0926136, 52.8729716]"
8149,14264386,@erwblo 🤷🏼‍♂️\n\nHou ik me totaal niet mee bezig,1644701882793828352,713333,"[5.7298822, 53.0457272, 5.9496796, 53.2433434]","[4.7288999, 52.2782266, 5.0792072, 52.4312289]"
8202,713333,@remkusdevries Ik weet niet of het aantal klom...,1644701882793828352,14264386,"[4.7288999, 52.2782266, 5.0792072, 52.4312289]","[5.7298822, 53.0457272, 5.9496796, 53.2433434]"


Now we have developed the first pipeline for the geospatial analysis, so let's create one function that takes in a raw dataset and returns a dataframe that is ready for our geospatial analysis

In [20]:
def dataPipeline(rawData, AttributesToFilter):
    # Unpack Tweets
    rawDf = unpackTweets(rawData, AttributesToFilter)
    # Filter Out Unique Conversations
    df = filterOutUniqueConversations(rawDf, 'conversation_id')
    # Filter Out Unlocated Users
    df = filterOutUnlocatedUsers(df, rawDf)
    # Filter Out Self-Replies
    df = filterOutSelfReplies(df)
    # Add Receiver Location
    df = addReceiverLocation(df, rawDf)
    return df
    


Let's test the data pipeline function

In [21]:
inputData = dataPipeline(datasetsToUnpack, attributesToFilter)

# Assess Dataset
print(inputData.shape)
print(inputData['author_id'].nunique())

inputData.head()

  df = pd.read_csv(dataset)


(281, 6)
174


Unnamed: 0,author_id,text,conversation_id,in_reply_to_user_id,sender_location,receiver_location
2258,940899377543110656,"@wardmarkey @Ikhounietvanvis Kom es langs he, ...",1644726752470618114,1032894392,"[5.2121062, 50.8677279, 5.4145591, 50.9815265]","[2.9588621, 50.9160108, 3.0884673, 50.9993029]"
2364,1032894392,@AkbasPinar1980 @Ikhounietvanvis Daar ben ik a...,1644726752470618114,940899377543110656,"[2.9588621, 50.9160108, 3.0884673, 50.9993029]","[5.2121062, 50.8677279, 5.4145591, 50.9815265]"
6601,495914854,@MrsSandvrouw Ik ben het helemaal met je eens!!!,1644682823343677441,707643871379988480,"[6.7559955, 52.1611799, 6.981174, 52.2855112]","[6.8263158, 52.6327357, 7.0926136, 52.8729716]"
8149,14264386,@erwblo 🤷🏼‍♂️\n\nHou ik me totaal niet mee bezig,1644701882793828352,713333,"[5.7298822, 53.0457272, 5.9496796, 53.2433434]","[4.7288999, 52.2782266, 5.0792072, 52.4312289]"
8202,713333,@remkusdevries Ik weet niet of het aantal klom...,1644701882793828352,14264386,"[4.7288999, 52.2782266, 5.0792072, 52.4312289]","[5.7298822, 53.0457272, 5.9496796, 53.2433434]"


Now we only need to add a function that makes the tweets ready for NLP.