In [None]:
import os
import requests
import json
import csv
from collections import Counter
from itertools import combinations
import pandas as pd
import re
import unicodedata
import dotenv

In [None]:
dotenv.load_dotenv()

In [None]:
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    raise RuntimeError("GITHUB_TOKEN is not set; create a .env with GITHUB_TOKEN=<token>")

In [None]:
github_url = "https://api.github.com/graphql"

In [None]:
session = requests.Session()
session.headers.update(
    {
        "Authorization": f"Bearer {GITHUB_TOKEN}",
        "Content-Type": "application/json"
    }
)

In [4]:
# Simple test query to verify API connection
query = """
{
  viewer {
    login
    name
    repositories(first: 5) {
      nodes {
        name
        url
        description
      }
    }
  }
}
"""

# Send the query
response = session.post(github_url, json={"query": query})
print("Status Code:", response.status_code)
print("Response:", response.json())

Status Code: 200
Response: {'data': {'viewer': {'login': 'Grim2Hot', 'name': 'Alexander Sweet', 'repositories': {'nodes': [{'name': 'NI_Tests', 'url': 'https://github.com/Grim2Hot/NI_Tests', 'description': 'Testing Ground for Implementing Features for NI'}, {'name': 'pytrade_bot', 'url': 'https://github.com/Grim2Hot/pytrade_bot', 'description': 'An exploration into algo-trading using Statistics and (hopefully) Machine Learning in Python.'}, {'name': 'uni_work', 'url': 'https://github.com/KeilanEvans/uni_work', 'description': None}, {'name': 'ML-For-Beginners', 'url': 'https://github.com/Grim2Hot/ML-For-Beginners', 'description': '12 weeks, 26 lessons, 52 quizzes, classic Machine Learning for all'}, {'name': 'ai-agents-for-beginners', 'url': 'https://github.com/Grim2Hot/ai-agents-for-beginners', 'description': '10 Lessons to Get Started Building AI Agents'}]}}}}


In [5]:
print(json.dumps(response.json(), indent=2))

{
  "data": {
    "viewer": {
      "login": "Grim2Hot",
      "name": "Alexander Sweet",
      "repositories": {
        "nodes": [
          {
            "name": "NI_Tests",
            "url": "https://github.com/Grim2Hot/NI_Tests",
            "description": "Testing Ground for Implementing Features for NI"
          },
          {
            "name": "pytrade_bot",
            "url": "https://github.com/Grim2Hot/pytrade_bot",
            "description": "An exploration into algo-trading using Statistics and (hopefully) Machine Learning in Python."
          },
          {
            "name": "uni_work",
            "url": "https://github.com/KeilanEvans/uni_work",
            "description": null
          },
          {
            "name": "ML-For-Beginners",
            "url": "https://github.com/Grim2Hot/ML-For-Beginners",
            "description": "12 weeks, 26 lessons, 52 quizzes, classic Machine Learning for all"
          },
          {
            "name": "ai-agents-for-beg

In [30]:
# Create a request for issues
issue_query = """
{
  repository(owner: "huggingface", name: "transformers") {
    issues(last: 100, states: OPEN) {
      nodes {
        title
        url
        createdAt
        author { 
            ... on User {
                login
                url
                location
            }
        }
      }
    }
  }
}
"""

response = session.post(github_url, json={"query": issue_query})
print("Status Code:", response.status_code)

Status Code: 200


In [31]:
print(json.dumps(response.json(), indent=2) )

{
  "data": {
    "repository": {
      "issues": {
        "nodes": [
          {
            "title": "All vitpose model were brokentransformers/models/vitpose_",
            "url": "https://github.com/huggingface/transformers/issues/42222",
            "createdAt": "2025-11-15T14:56:04Z",
            "author": {
              "login": "lucasjinreal",
              "url": "https://github.com/lucasjinreal",
              "location": "Sanfancisco"
            }
          },
          {
            "title": "`parse_response` should drop EOS",
            "url": "https://github.com/huggingface/transformers/issues/42249",
            "createdAt": "2025-11-17T18:14:56Z",
            "author": {
              "login": "qgallouedec",
              "url": "https://github.com/qgallouedec",
              "location": "Canada"
            }
          },
          {
            "title": "How to fine-tune SAM 3D models?",
            "url": "https://github.com/huggingface/transformers/issues/42344"

In [20]:
locations = []
for i in range(len(response.json()['data']['repository']['issues']['nodes'])):
    issue = response.json()['data']['repository']['issues']['nodes'][i]
    locations.append(issue['author']['location'])

In [21]:
locations

['Sanfancisco',
 'Canada',
 None,
 None,
 'Toulouse - France',
 None,
 'Netherlands',
 'Bhopal,India',
 None,
 None,
 'Berlin, Germany',
 None,
 None,
 None,
 None,
 'San Francisco',
 'Tel Aviv, Israel',
 ' Washington, DC Metro ',
 'china',
 'Paris',
 None,
 None,
 None,
 'Iraq',
 None,
 'Canada',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'Pittsburgh',
 None,
 'New York, U.S.',
 'Singapore, Singapore',
 'London',
 'France',
 None,
 None,
 'Hangzhou China',
 None,
 None,
 None,
 None,
 None,
 'Shanghai, China',
 'Paris (France)',
 'Hangzhou China',
 None,
 None,
 'Seattle, WA',
 None,
 None,
 None,
 None,
 None,
 'Daejeon',
 None,
 None,
 'Shanghai, China',
 None,
 'London, UK',
 None,
 'United States',
 'United States',
 'Gurgaon',
 None,
 None,
 'India',
 None,
 'France',
 'Canada',
 'Rome, Italy',
 'Lille',
 None,
 None,
 'Amsterdam',
 None,
 'Paris, France',
 'Paris, France',
 'Paris, France',
 None,
 None,
 None,
 None,
 'Madison, WI',
 None,
 None,
 'TianJin，China',
 None,

In [22]:
cleaned_locations = [location for location in locations if location is not None]
cleaned_locations

['Sanfancisco',
 'Canada',
 'Toulouse - France',
 'Netherlands',
 'Bhopal,India',
 'Berlin, Germany',
 'San Francisco',
 'Tel Aviv, Israel',
 ' Washington, DC Metro ',
 'china',
 'Paris',
 'Iraq',
 'Canada',
 'Pittsburgh',
 'New York, U.S.',
 'Singapore, Singapore',
 'London',
 'France',
 'Hangzhou China',
 'Shanghai, China',
 'Paris (France)',
 'Hangzhou China',
 'Seattle, WA',
 'Daejeon',
 'Shanghai, China',
 'London, UK',
 'United States',
 'United States',
 'Gurgaon',
 'India',
 'France',
 'Canada',
 'Rome, Italy',
 'Lille',
 'Amsterdam',
 'Paris, France',
 'Paris, France',
 'Paris, France',
 'Madison, WI',
 'TianJin，China',
 'Beijing, China',
 'Taipei, Taiwan',
 'United States']

In [23]:
len(cleaned_locations)

43

In [24]:
loc_set = set(cleaned_locations)
len(loc_set)

34

In [32]:
rate_query = """
{
    rateLimit {
      limit
      cost
      remaining
      resetAt
      used
    }
}
"""

rate_response = session.post(github_url, json={"query": rate_query})
print("Rate Limit Status Code:", rate_response.status_code)
print("Rate Limit Response:", rate_response.json())

Rate Limit Status Code: 200
Rate Limit Response: {'data': {'rateLimit': {'limit': 5000, 'cost': 1, 'remaining': 4906, 'resetAt': '2025-12-30T16:20:09Z', 'used': 94}}}


In [None]:
issue_query = """
{
    repository(owner: "huggingface", name: "transformers") {
    issues(last: 100, states: OPEN) {
      nodes {
        title
        url
        createdAt
        labels(first: 5) {
            nodes {
                name
            }
        }
        author { 
            ... on User {
                login
                url
                location
            }
        }
      }
    }
  }
}
"""

In [6]:
# open issues_data_10k.json file
with open('/home/chef/src/DAT6003/data/raw/issues_data_10k.json', 'r') as f:
    issues_data = json.load(f)

In [8]:
comments = []
no_comments = 0
authors = []

for issue in issues_data:
    if "comments" in issue:
        if len(issue["comments"]["nodes"]) == 0:
            no_comments += 1
        else:
            for comment in issue["comments"]["nodes"]:
                comments.append(comment['bodyText'])
                author = comment.get('author')
                if author and 'login' in author:
                    authors.append(author['login'])

authors_counted = Counter(authors)

print(no_comments)
print(len(comments))

439
43035


In [9]:
authors_counted.most_common(10)

[('LysandreJik', 3406),
 ('patrickvonplaten', 2311),
 ('sgugger', 1634),
 ('stas00', 1403),
 ('thomwolf', 977),
 ('patil-suraj', 885),
 ('julien-c', 770),
 ('NielsRogge', 744),
 ('sshleifer', 674),
 ('BramVanroy', 512)]

In [13]:
# Pull out nodes from the dataset
# Assumption that we can't infer direction of conversation due to lack of reply structure
# All replies are contained within an issue thread.
nodes = []

for issue in issues_data:
    if issue['author'] == {} or issue['author'] is None:
        continue
    else:
        nodes.append(issue['author']['login'])

    if "comments" in issue and len(issue["comments"]["nodes"]) > 0:
        for comment in issue["comments"]["nodes"]:
            if comment['author'] == {}:
                continue
            author = comment.get('author')
            if author and 'login' in author:
                nodes.append(author['login'])

nodes = set(nodes)
len(nodes)

9384

In [15]:
# Extrapolate edges from the dataset
# The assumption is that any users commenting on the same issue are connected to each other.
# Issue authors are considered 'conversation starters' and are therefore also involved in the connections.
edge_weights = Counter()

for issue in issues_data:
    participants = set()

    # get the author of the issue
    if issue['author'] == {} or issue['author'] is None:
        continue
    issue_author = issue['author'].get('login')
    participants.add(issue_author)

    # get the commenters on the issue
    if "comments" in issue and len(issue["comments"]["nodes"]) > 0:
        for comment in issue["comments"]["nodes"]:
            author = comment.get('author')
            if author and 'login' in author:
                participants.add(author['login'])

    if len(participants) < 2:
        continue

    for u, v in combinations(sorted(participants), 2):
        edge_weights[(u, v)] += 1

edges = [(u, v, w) for (u, v), w in edge_weights.items()]

len(edges), edges[:5]

(34216,
 [('ZhaoyueCheng', 'abeljim', 1),
  ('ZhaoyueCheng', 'ethanjperez', 1),
  ('ZhaoyueCheng', 'thomwolf', 1),
  ('abeljim', 'ethanjperez', 1),
  ('abeljim', 'thomwolf', 2)])

In [17]:
nodes_df = pd.DataFrame(list(nodes), columns=['id'])
nodes_df.to_csv('/home/chef/src/DAT6003/data/network/nodes.csv', index=False)

In [18]:
edges_df = pd.DataFrame(edges, columns=['Source', 'Target', 'Weight'])
edges_df["Type"] = "Undirected"
edges_df.to_csv('/home/chef/src/DAT6003/data/network/edges.csv', index=False)

In [19]:
# Get the locations of authors to see what needs cleaning
author_locations = []

for issue in issues_data:
    if issue['author'] == {} or issue['author'] is None:
        continue
    else:
        author_locations.append(issue['author']['location'])

    if "comments" in issue and len(issue["comments"]["nodes"]) > 0:
        for comment in issue["comments"]["nodes"]:
            if comment['author'] == {}:
                continue
            author = comment.get('author')
            if author and 'location' in author:
                author_locations.append(author['location'])

author_locations = set(author_locations)
print(len(author_locations))

1675


In [21]:
# write out author locations to a csv for cleaning
with open('/home/chef/src/DAT6003/data/raw/author_locations.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['location'])
    for location in author_locations:
        writer.writerow([location])

In [23]:
# Normalise Unicode, remove emojis, but keep non-latin script
df = pd.read_csv('/home/chef/src/DAT6003/data/raw/author_locations.csv')

df["location_raw"] = df["location"].astype(str)



In [25]:
EMOJI_RE = re.compile(
    "["                       
    "\U0001F300-\U0001FAFF"   
    "\U00002600-\U000027BF"   
    "\U0001F1E0-\U0001F1FF"   
    "]+",
    flags=re.UNICODE
)

SEP_RE = re.compile(r"[|;/•·]+")
MULTISPACE_RE = re.compile(r"\s+")
TRAILING_PUNCT_RE = re.compile(r"^[,\.\-\s]+|[,\.\-\s]+$")

def normalise_text(s: str) -> str:
    if s == "":
        return ""
    
    if s is None:
        return ""
    s = str(s)

    # Unicode normalisation
    s = unicodedata.normalize("NFKC", s)

    # Remove emoji / pictographs
    s = EMOJI_RE.sub(" ", s)

    # Replace separators with commas
    s = SEP_RE.sub(",", s)

    # Replace newlines/tabs with spaces
    s = s.replace("\n", " ").replace("\r", " ").replace("\t", " ")

    # Strip quotes
    s = s.replace('"', "").replace("'", "")

    # Collapse whitespace
    s = MULTISPACE_RE.sub(" ", s).strip()

    # Trim leading/trailing punctuation/spaces
    s = TRAILING_PUNCT_RE.sub("", s).strip()

    return s

df['location_norm'] = df['location_raw'].fillna("").apply(normalise_text).str.lower()

In [26]:
df.head()

Unnamed: 0,location,location_raw,location_norm
0,China.Shanghai,China.Shanghai,china.shanghai
1,"Pangyo, Korea","Pangyo, Korea","pangyo, korea"
2,Tel Aviv,Tel Aviv,tel aviv
3,WH,WH,wh
4,"Live Oak, FL","Live Oak, FL","live oak, fl"


In [27]:
len(df)

1675

In [29]:
MISSING_SET = {
    "", "none", "null", "nan", "n/a", "na", "unknown", "unspecified", "undefined", "-"
}

# Patterns that usually indicate not-a-location (add as you find them)
NONGEO_PATTERNS = [
    r"\bremote\b",
    r"\bearth\b",
    r"\bmars\b",
    r"\bmilky way\b",
    r"\bthe cloud\b",
    r"\binternet\b",
    r"\bgithub\b",
    r"\bonline\b",
    r"\bhome\b",
    r"\bbasement\b",
    r"\bworldwide\b",
    r"\beverywhere\b",
    r"\banywhere\b",
    r"\bunder your bed\b",
]

NONGEO_RE = re.compile("|".join(NONGEO_PATTERNS), flags=re.IGNORECASE)

def classify_basic(s: str) -> str:
    if s is None:
        return "missing"
    s = str(s).strip().lower()
    if s in MISSING_SET:
        return "missing"
    if NONGEO_RE.search(s):
        return "non-geographic"
    return "candidate"

df["location_status"] = df["location_norm"].map(classify_basic)


In [46]:
import pycountry
import geonamescache

In [40]:
gc = geonamescache.GeonamesCache()
print(gc.get_countries())

{'AD': {'geonameid': 3041565, 'name': 'Andorra', 'iso': 'AD', 'iso3': 'AND', 'isonumeric': 20, 'fips': 'AN', 'continentcode': 'EU', 'capital': 'Andorra la Vella', 'areakm2': 468, 'population': 77006, 'tld': '.ad', 'currencycode': 'EUR', 'currencyname': 'Euro', 'phone': '376', 'postalcoderegex': '^(?:AD)*(\\d{3})$', 'languages': 'ca', 'neighbours': 'ES,FR'}, 'AE': {'geonameid': 290557, 'name': 'United Arab Emirates', 'iso': 'AE', 'iso3': 'ARE', 'isonumeric': 784, 'fips': 'AE', 'continentcode': 'AS', 'capital': 'Abu Dhabi', 'areakm2': 82880, 'population': 9630959, 'tld': '.ae', 'currencycode': 'AED', 'currencyname': 'Dirham', 'phone': '971', 'postalcoderegex': '^\\d{5}-\\d{5}$', 'languages': 'ar-AE,fa,en,hi,ur', 'neighbours': 'SA,OM'}, 'AF': {'geonameid': 1149361, 'name': 'Afghanistan', 'iso': 'AF', 'iso3': 'AFG', 'isonumeric': 4, 'fips': 'AF', 'continentcode': 'AS', 'capital': 'Kabul', 'areakm2': 647500, 'population': 37172386, 'tld': '.af', 'currencycode': 'AFN', 'currencyname': 'Afgha

In [51]:
tmp = gc.get_cities()
tmp

{'3040051': {'geonameid': 3040051,
  'name': 'les Escaldes',
  'latitude': 42.50729,
  'longitude': 1.53414,
  'countrycode': 'AD',
  'population': 15853,
  'timezone': 'Europe/Andorra',
  'admin1code': '08',
  'alternatenames': ["Ehskal'des-Ehndzhordani",
   'Escaldes',
   'Escaldes-Engordany',
   'Les Escaldes',
   'esukarudesu=engorudani jiao qu',
   'lai sai si ka er de-en ge er da',
   'Эскальдес-Энджордани',
   'エスカルデス＝エンゴルダニ教区',
   '萊塞斯卡爾德-恩戈爾達',
   '萊塞斯卡爾德－恩戈爾達']},
 '3041563': {'geonameid': 3041563,
  'name': 'Andorra la Vella',
  'latitude': 42.50779,
  'longitude': 1.52109,
  'countrycode': 'AD',
  'population': 20430,
  'timezone': 'Europe/Andorra',
  'admin1code': '07',
  'alternatenames': ['ALV',
   'Ando-la-Vyey',
   'Andora',
   'Andora la Vela',
   'Andora la Velja',
   "Andora lja Vehl'ja",
   'Andoro Malnova',
   'Andorra',
   'Andorra Tuan',
   'Andorra a Vella',
   'Andorra la Biella',
   'Andorra la Vella',
   'Andorra la Vielha',
   'Andorra-a-Velha',
   "Andorra-

In [30]:
try:
    COUNTRY_LOOKUP = {}
    for c in pycountry.countries:
        COUNTRY_LOOKUP[c.name.lower()] = c.alpha_2
        if hasattr(c, "official_name"):
            COUNTRY_LOOKUP[c.official_name.lower()] = c.alpha_2
    # Common aliases
    COUNTRY_LOOKUP.update({
        "uk": "GB", "u.k.": "GB", "united kingdom": "GB", "great britain": "GB", "britain": "GB",
        "usa": "US", "u.s.a.": "US", "u.s.": "US", "united states": "US", "united states of america": "US",
        "russia": "RU", "south korea": "KR", "korea": "KR", "north korea": "KP",
        "iran": "IR", "viet nam": "VN",
        "taiwan": "TW",  # pycountry has Taiwan as "Taiwan, Province of China" in some datasets
    })
except Exception:
    COUNTRY_LOOKUP = {
        "united kingdom": "GB", "uk": "GB", "britain": "GB",
        "united states": "US", "usa": "US",
        "canada": "CA", "france": "FR", "germany": "DE",
        "china": "CN", "japan": "JP", "india": "IN",
        "australia": "AU", "spain": "ES", "italy": "IT",
    }


In [31]:
US_STATE_TO_US = {
    "al","ak","az","ar","ca","co","ct","de","fl","ga","hi","ia","id","il","in","ks","ky","la","ma","md",
    "me","mi","mn","mo","ms","mt","nc","nd","ne","nh","nj","nm","nv","ny","oh","ok","or","pa","ri","sc",
    "sd","tn","tx","ut","va","vt","wa","wi","wv","wy","dc"
}

CA_PROV_TO_CA = {"on","qc","bc","ab","mb","sk","ns","nb","nl","pe","nt","nu","yt"}

US_STATE_NAMES = {
    "alabama",
    "alaska",
    "arizona",
    "arkansas",
    "california",
    "colorado",
    "connecticut",
    "delaware",
    "florida",
    "georgia",
    "hawaii",
    "idaho",
    "illinois",
    "indiana",
    "iowa",
    "kansas",
    "kentucky",
    "louisiana",
    "maine",
    "maryland",
    "massachusetts",
    "michigan",
    "minnesota",
    "mississippi",
    "missouri",
    "montana",
    "nebraska",
    "nevada",
    "new hampshire",
    "new jersey",
    "new mexico",
    "new york",
    "north carolina",
    "north dakota",
    "ohio",
    "oklahoma",
    "oregon",
    "pennsylvania",
    "rhode island",
    "south carolina",
    "south dakota",
    "tennessee",
    "texas",
    "utah",
    "vermont",
    "virginia",
    "washington",
    "west virginia",
    "wisconsin",
    "wyoming",
    "district of columbia"
}

CA_PROV_NAMES = {    
    "alberta",
    "british columbia",
    "manitoba",
    "new brunswick",
    "newfoundland and labrador",
    "nova scotia",
    "ontario",
    "prince edward island",
    "quebec",
    "saskatchewan",
    "northwest territories",
    "nunavut",
    "yukon"
}

In [None]:
def city_to_country_safe(city_name, city_index):
    """
    city_index: dict[str, set[str]]
        e.g. {"berlin": {"DE"}, "paris": {"FR", "US"}}
    """
    city = city_name.lower().strip()

    for country_id in city_index:
        if city in country_id['alternatenames']:
            name = country_id['name']

    if not countries:
        return None            # not found
    if len(countries) == 1:
        return next(iter(countries))
    return None                # ambiguous → drop


In [55]:
g = gc.get_countries_by_names()
g

{'Andorra': {'geonameid': 3041565,
  'name': 'Andorra',
  'iso': 'AD',
  'iso3': 'AND',
  'isonumeric': 20,
  'fips': 'AN',
  'continentcode': 'EU',
  'capital': 'Andorra la Vella',
  'areakm2': 468,
  'population': 77006,
  'tld': '.ad',
  'currencycode': 'EUR',
  'currencyname': 'Euro',
  'phone': '376',
  'postalcoderegex': '^(?:AD)*(\\d{3})$',
  'languages': 'ca',
  'neighbours': 'ES,FR'},
 'United Arab Emirates': {'geonameid': 290557,
  'name': 'United Arab Emirates',
  'iso': 'AE',
  'iso3': 'ARE',
  'isonumeric': 784,
  'fips': 'AE',
  'continentcode': 'AS',
  'capital': 'Abu Dhabi',
  'areakm2': 82880,
  'population': 9630959,
  'tld': '.ae',
  'currencycode': 'AED',
  'currencyname': 'Dirham',
  'phone': '971',
  'postalcoderegex': '^\\d{5}-\\d{5}$',
  'languages': 'ar-AE,fa,en,hi,ur',
  'neighbours': 'SA,OM'},
 'Afghanistan': {'geonameid': 1149361,
  'name': 'Afghanistan',
  'iso': 'AF',
  'iso3': 'AFG',
  'isonumeric': 4,
  'fips': 'AF',
  'continentcode': 'AS',
  'capital':

In [32]:
def infer_country_from_tokens(tokens):
    for token in tokens:
        token = token.strip().lower()
        if token in COUNTRY_LOOKUP:
            return COUNTRY_LOOKUP[token]
        if token in US_STATE_TO_US or token in US_STATE_NAMES:
            return "US"
        if token in CA_PROV_TO_CA or token in CA_PROV_NAMES:
            return "CA"
    return None

In [33]:
COMMA_SPLIT_RE = re.compile(r"\s*,\s*")

def extract_country_code(loc: str):
    if not loc:
        return None

    s = loc.lower().strip()

    # quick wins: check full string against lookup
    if s in COUNTRY_LOOKUP:
        return COUNTRY_LOOKUP[s]

    # check comma-separated parts
    parts = [p.strip() for p in COMMA_SPLIT_RE.split(s) if p.strip()]
    if not parts:
        return None

    # try last part as country
    last = parts[-1]
    if last in COUNTRY_LOOKUP:
        return COUNTRY_LOOKUP[last]

    # try any part as country (handles "toulouse - france" after your separator cleanup)
    for p in parts:
        if p in COUNTRY_LOOKUP:
            return COUNTRY_LOOKUP[p]

    # US state / CA province inference from last token
    # Example: "san francisco, ca" => US
    tokens = []
    for part in parts:
        tokens.extend(part.split())

    return infer_country_from_tokens(tokens)

df["country_code"] = None
mask = df["location_status"].eq("candidate")
df.loc[mask, "country_code"] = df.loc[mask, "location_norm"].map(extract_country_code)


In [34]:
# If we got a country_code, it's valid
df.loc[mask & df["country_code"].notna(), "location_status"] = "valid"

# If candidate but still no country_code, ambiguous (not necessarily junk)
df.loc[mask & df["country_code"].isna(), "location_status"] = "ambiguous"

# Optional: map code back to country name (if pycountry available)
def code_to_name(code):
    try:
        import pycountry
        c = pycountry.countries.get(alpha_2=code)
        return c.name if c else None
    except Exception:
        return None

df["country"] = df["country_code"].map(code_to_name)


In [35]:
df.head()

Unnamed: 0,location,location_raw,location_norm,location_status,country_code,country
0,China.Shanghai,China.Shanghai,china.shanghai,ambiguous,,
1,"Pangyo, Korea","Pangyo, Korea","pangyo, korea",valid,KR,"Korea, Republic of"
2,Tel Aviv,Tel Aviv,tel aviv,ambiguous,,
3,WH,WH,wh,ambiguous,,
4,"Live Oak, FL","Live Oak, FL","live oak, fl",valid,US,United States


In [37]:
df.to_csv('/home/chef/src/DAT6003/data/processed/author_locations_processed.csv', index=False)

In [57]:
(int(df['country'].isna().sum()) / len(df)) * 100

41.7910447761194

~42% of the given locations in the data cannot be mapped to countries with certainty. a lot of ambiguitiy makes fuzzy matching a possibility to increase this value.

In [59]:
# Re-get nodes, collecting location from author data now

nodes = []

for issue in issues_data:
    if issue['author'] == {} or issue['author'] is None:
        continue
    else:
        nodes.append((issue['author']['login'], issue['author'].get('location')))

    if "comments" in issue and len(issue["comments"]["nodes"]) > 0:
        for comment in issue["comments"]["nodes"]:
            if comment['author'] == {}:
                continue
            author = comment.get('author')
            if author and 'login' in author:
                nodes.append((author['login'], author.get('location')))

nodes = set(nodes)
len(nodes)
nodes

{('LianaN', 'Spain (Barcelona)'),
 ('aclifton314', None),
 ('HuangZhenyang', None),
 ('shijie-wu', 'Bethesda, MD'),
 ('wasiahmad', 'Santa Clara, CA, USA'),
 ('LostBenjamin', None),
 ('shenkev', None),
 ('punshriv', None),
 ('Sun-SunQian', None),
 ('Wingie', None),
 ('LincLabUCCS', None),
 ('RitaRan', 'Durham, NC'),
 ('thies1006', None),
 ('diego6662', 'Pereira - Colombia'),
 ('alessandrobessi', 'Firenze, ITALY'),
 ('Somabhadra', 'Bangalore'),
 ('ciwang', 'Brooklyn'),
 ('Apoorvgarg-creator', None),
 ('wholebuzz', 'Santa Cruz, CA'),
 ('soocheolnoh', None),
 ('aiwarrior-23', None),
 ('seanbenhur', None),
 ('mosheliv', None),
 ('delip', None),
 ('StephennFernandes', 'Goa , INDIA'),
 ('tonyhqanguyen', 'Vancouver, BC'),
 ('ninjalu', None),
 ('pglock', None),
 ('hscspring', None),
 ('jcw521', None),
 ('jamalex', 'San Diego'),
 ('christophschuhmann', 'Hamburg, Germany'),
 ('tospirits', None),
 ('tbaggu', None),
 ('Eric-Wallace', None),
 ('hackyon', 'New York'),
 ('MyBruso', None),
 ('MSMOON', 

In [60]:
nodes_df_updated = pd.DataFrame(nodes, columns=['id', 'location'])

In [62]:
nodes_df_updated

Unnamed: 0,id,location
0,LianaN,Spain (Barcelona)
1,aclifton314,
2,HuangZhenyang,
3,shijie-wu,"Bethesda, MD"
4,wasiahmad,"Santa Clara, CA, USA"
...,...,...
9379,AI678,
9380,hrdxwandg,shanghai
9381,13015517713,广州
9382,Rizhiy,"London, UK"


In [64]:
# Update the location column using the df's mapping for unclean location names
# Create a lookup dictionary from df
location_to_country = df.set_index('location')['country'].to_dict()
nodes_df_updated['country'] = nodes_df_updated['location'].map(location_to_country)

In [65]:
nodes_df_updated.head()

Unnamed: 0,id,location,country
0,LianaN,Spain (Barcelona),Spain
1,aclifton314,,
2,HuangZhenyang,,
3,shijie-wu,"Bethesda, MD",United States
4,wasiahmad,"Santa Clara, CA, USA",United States


In [69]:
n_df = nodes_df_updated.drop('location', axis=1)

In [70]:
n_df.to_csv('/home/chef/src/DAT6003/data/network/nodes_locations.csv', index=False)

In [71]:
comment_issue_data = [
    issue for issue in issues_data 
    if "comments" in issue and len(issue["comments"]["nodes"]) > 0
]

print(f"Filtered out {len(issues_data) - len(comment_issue_data)} issues.")

Filtered out 439 issues.


In [73]:
# re-get nodes from new data
nodes = []

for issue in comment_issue_data:
    if issue['author'] == {} or issue['author'] is None:
        continue
    else:
        nodes.append((issue['author']['login'], issue['author'].get('location')))

    if "comments" in issue and len(issue["comments"]["nodes"]) > 0:
        for comment in issue["comments"]["nodes"]:
            if comment['author'] == {}:
                continue
            author = comment.get('author')
            if author and 'login' in author:
                nodes.append((author['login'], author.get('location')))

nodes = set(nodes)
len(nodes)

9278

In [74]:
nodes_df_updated = pd.DataFrame(nodes, columns=['id', 'location'])
nodes_df_updated.head()

Unnamed: 0,id,location
0,LianaN,Spain (Barcelona)
1,aclifton314,
2,HuangZhenyang,
3,shijie-wu,"Bethesda, MD"
4,wasiahmad,"Santa Clara, CA, USA"


In [75]:
nodes_df_updated['country'] = nodes_df_updated['location'].map(location_to_country)
n_df = nodes_df_updated.drop('location', axis=1)
n_df.to_csv('/home/chef/src/DAT6003/data/network/nodes2.csv', index=False)