In [None]:
import tweepy
import pandas as pd
import time
import re
import requests
import langtags
from datetime import date
import github
from github import Github
import tqdm
from pygitapi import HubAPI

# PyGithub client; a later cell uses it (`git.get_user`) to look up the account
# type of each sponsored login.
# NOTE(review): "github_token" looks like a placeholder - supply a real token
# (ideally read from the environment rather than hardcoded) before running.
# NOTE: the name `git` is rebound to a HubAPI client in the primary-language cell.
git = Github("github_token")

In [None]:
# Credentials for the Twitter API client used by the search and timeline calls
# in the cells below.
# NOTE(review): the values look like placeholders - replace them with real
# credentials before running.
twitter_credentials = {
    "bearer_token": "bearer_token",
    "access_token": "access_token",
    "access_token_secret": "access_token_secret",
}
client = tweepy.Client(**twitter_credentials)

### Query for tweets containing the URL "github.com/sponsors/"

In [None]:
# Search for original tweets (no retweets, no replies) that link to a GitHub
# Sponsors page. Every item yielded by the paginator is a whole response page
# (the extraction cell reads `.data` and `.includes` from it), not one tweet.
search_pages = tweepy.Paginator(
    client.search_all_tweets,
    query='url:"github.com/sponsors/"  -is:retweet -is:reply',
    tweet_fields=[
        'id', 'text', 'author_id', 'created_at', 'entities',
        'referenced_tweets', 'lang', 'public_metrics'
    ],
    user_fields=['username'],
    expansions='author_id',
    start_time='2019-05-01T00:00:00Z',
    end_time='2022-04-30T00:00:00Z',
    max_results=50,
)

tweet_data = []
for response_page in search_pages:
    # Pause between pages - presumably to stay under the API rate limit.
    time.sleep(1.5)
    tweet_data.append(response_page)

### Extract the data from the obtained tweets

In [None]:
# Build one record per tweet: tweet metadata, the author's most recent tweet
# time, every link in the tweet, and the GitHub Sponsors login(s) it points to.

SPONSORS_PREFIX = 'https://github.com/sponsors/'
# Seconds to wait when resolving a shortened link, so one dead host cannot
# hang the whole run.
REQUEST_TIMEOUT = 10

headers = {
    'user-agent':
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
}


def _sponsor_login(url):
    """Return the path segment that follows the sponsors prefix in ``url``.

    Anything after the segment (sub-path, query string, fragment) is dropped.
    The caller must have checked that ``SPONSORS_PREFIX`` occurs in ``url``.
    """
    tail = url.split(SPONSORS_PREFIX, 1)[1]
    return tail.split('/')[0].split('?')[0].split('#')[0]


data = []
user_dict = {}
# Cache of author_id -> creation time of the first tweet returned for that
# author's timeline, so each author costs a single timeline request.
last_update_by_author = {}

# Loop through each response object
for response in tweet_data:
    # Take all of the users, and put them into a dictionary of dictionaries
    # with the info we want to keep
    for user in response.includes.get('users', []):
        user_dict[user.id] = {'username': user.username}

    # `response.data` is None for a page without tweets.
    for tweet in response.data or []:
        # For each tweet, find the author's information
        author_info = user_dict[tweet.author_id]

        if tweet.author_id not in last_update_by_author:
            timeline = client.get_users_tweets(id=tweet.author_id,
                                               tweet_fields=['created_at'])
            # An empty timeline is recorded as None instead of raising.
            last_update_by_author[tweet.author_id] = (
                timeline.data[0].created_at if timeline.data else None)

        sponsor_user = []
        urls = []
        # Get links for each tweet
        for entity in (tweet.entities or {}).get('urls', []):
            link = entity['expanded_url']
            urls.append(link)

            if SPONSORS_PREFIX in link:
                sponsor_user.append(_sponsor_login(link))
                continue

            # The link may be shortened: follow redirects to find its target.
            try:
                resolved = requests.head(link,
                                         allow_redirects=True,
                                         headers=headers,
                                         timeout=REQUEST_TIMEOUT)
            except requests.exceptions.RequestException as error:
                print("Error: ", error)
                # No response to inspect for this link; move on to the next.
                continue

            final_url = str(resolved.url)
            if SPONSORS_PREFIX in final_url:
                sponsor_user.append(_sponsor_login(final_url))

        data.append({
            'Tweet_Url': str(tweet.id),
            'Language': tweet.lang,
            'Time': tweet.created_at,
            'User_Last_Update': last_update_by_author[tweet.author_id],
            'Tweet_User_ID': tweet.author_id,
            'Tweet_Username': author_info['username'],
            'Tweet_Text': tweet.text,
            'Sponsor_Username': sponsor_user,
            '# Retweet': tweet.public_metrics['retweet_count'],
            '# Replys': tweet.public_metrics['reply_count'],
            '# Likes': tweet.public_metrics['like_count'],
            'Links': urls
        })

### Convert to pandas format

In [None]:
# One row per tweet record collected above; the dict keys become the columns.
df = pd.DataFrame(data=data)

### Convert the tweet IDs to tweet URLs

In [None]:
# Turn the tweet IDs stored in `Tweet_Url` into full status URLs.
# Values that already carry the prefix are left untouched, so re-running this
# cell does not prepend the prefix a second time.
TWEET_URL_PREFIX = 'https://twitter.com/twitter/status/'
tweet_ids = df['Tweet_Url'].astype(str)
df['Tweet_Url'] = tweet_ids.where(
    tweet_ids.str.startswith(TWEET_URL_PREFIX),
    TWEET_URL_PREFIX + tweet_ids)

### Decode the language

In [None]:
# Replace each language tag in the `Language` column with the description
# that `langtags` reports for it.
lang = [langtags.Tag(a).language.description for a in df['Language']]
df['Language'] = lang

### Get the account type of the GitHub user

In [None]:
def _lookup_account_type(login):
    """Return the `type` of the GitHub account `login`.

    When the lookup raises `github.GithubException`, the exception object
    itself is returned in place of the type.
    """
    try:
        return git.get_user(login).type
    except github.GithubException as error:
        return error


# For every tweet, look up all of its sponsored logins and store the
# stringified list of results (one entry per login) in `account_type`.
the_sponsor_type = [
    str([_lookup_account_type(login) for login in sponsors])
    for sponsors in df['Sponsor_Username']
]
df['account_type'] = the_sponsor_type

### Remove the bracket and the quote from the data

In [None]:
def remove_the_bracket(data):
    """Strip brackets and single quotes from the string form of each item.

    Each element of ``data`` is converted with ``str()`` and every occurrence
    of ``(``, ``)``, ``[``, ``]``, ``{``, ``}`` and ``'`` is removed.

    Args:
        data: Any iterable (e.g. a list or a pandas Series).

    Returns:
        A list of cleaned strings, one per element of ``data``.
    """
    # A single raw-string character class: avoids the invalid "\]" escape the
    # pattern had when it was assembled from a non-raw string literal.
    unwanted = re.compile(r"[()\[\]{}']")
    return [unwanted.sub("", str(item)) for item in data]

### Remove the bracket and the quote from df['Sponsor_Username']

In [None]:
# Replace each list of sponsored logins with its cleaned string form.
df['Sponsor_Username'] = remove_the_bracket(df['Sponsor_Username'])

### Remove the bracket and the quote from df['account_type']

In [None]:
# Clean the stringified account-type lists in the same way.
df['account_type'] = remove_the_bracket(df['account_type'])

### Change the letters to lowercase to make it easier to call the userType for query below

In [None]:
# Lower-case the two account-type names; the primary-language cell inserts
# them into its query text as the field name.
df['account_type'] = (
    df['account_type']
    .str.replace('User', 'user')
    .str.replace('Organization', 'organization')
)

### Get the primary programming language of each sponsored account

In [None]:
# Re-import so that `time` refers to the module in this cell even if an
# earlier cell rebound the name.
import time

git = HubAPI('github_token')

# Account types that can be used as the root field of the query below.
QUERYABLE_ACCOUNT_TYPES = ('user', 'organization')

# GraphQL query listing the primary language of up to 100 repositories.
# `after` is either empty (first page) or an `, after: "<cursor>"` argument.
REPO_LANGUAGE_QUERY = """
    query{
        %(account_type)s(login: "%(login)s") {
            repositories(first: 100%(after)s) {
              totalCount
              edges {
                node {
                  primaryLanguage {
                    name
                  }
                }
              }
              pageInfo {
                endCursor
                hasNextPage
                startCursor
              }
            }
          }
    }"""


def _fetch_repo_languages(api, account_type, login):
    """Return the primary-language name of every repository of ``login``.

    Pages through the account's repositories 100 at a time. A repository
    without a primary language contributes the string 'None'. An empty list is
    returned when the API reports no such account or no repositories.
    """
    languages = []
    after = ''
    while True:
        result = api.custom_query(REPO_LANGUAGE_QUERY % {
            'account_type': account_type,
            'login': login,
            'after': after,
        })
        owner = result[account_type]
        if owner is None:
            break
        repositories = owner['repositories']
        for edge in repositories['edges']:
            primary = edge['node']['primaryLanguage']
            languages.append('None' if primary is None else primary['name'])
        page_info = repositories['pageInfo']
        if not page_info['hasNextPage']:
            break
        after = ', after: "%s"' % page_info['endCursor']
    return languages


# Exactly one entry is appended per row so the list lines up with `df`.
lang_use = []
for typeUser, user in zip(df.account_type, df.Sponsor_Username):
    # Rows without exactly one resolvable account (missing login, several
    # sponsors, failed lookup, unsupported type) cannot be queried.
    if (typeUser not in QUERYABLE_ACCOUNT_TYPES
            or not isinstance(user, str) or not user):
        lang_use.append('None')
        continue

    repo_languages = _fetch_repo_languages(git, typeUser, user)
    if repo_languages:
        # Most frequent value; 'None' entries take part in the count, and
        # `mode()` returns ties sorted, so the first one is taken.
        lang_use.append(pd.Series(repo_languages).mode()[0])
    else:
        lang_use.append('None')
    # Pause between accounts - presumably to stay under the API rate limit.
    time.sleep(2)
df['Primary programming language'] = lang_use

### Save as csv file

In [None]:
# Write the final table to disk without the DataFrame index.
output_path = '../data/Contribution_activities/All_tweets.csv'
df.to_csv(output_path, index=False)