# Exploration and Analysis of GitHub User dataset

## Imports/Configuration

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%pip install wordcloud
from wordcloud import WordCloud
from ast import literal_eval
import json
import time
import requests

pd.set_option('display.max_columns', None)
pd.set_option("max_colwidth", 400)

Note: you may need to restart the kernel to use updated packages.


## Data Filtering/Cleaning

### Loading Dataset
Reading the json from the 10M GitHub Users dataset from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/T6ZRJT.

Due to the size of the dataset, it has to be loaded in chunks using the read_json() method from pandas.

In [188]:
# Read the line-delimited JSON lazily, 10,000 records per chunk, instead of
# loading the whole file at once. The returned reader is iterated by the
# filtering loop; presumably it is exhausted after one pass, so re-run this
# cell before re-running the filter (TODO confirm).
chunks = pd.read_json("../data/data.json", lines=True, chunksize=10000)

### Filtering Data
Each chunk is iterated over and filtered to keep only users that:
1. Are not suspicious
2. Are of the 'User' type
3. Have a bio
4. Follow at least 1 other user
5. Have at least 1 repo

The filtering rules were made to be strict so that the best data can be used for providing recommendations, as there is such a large amount of data to extract from. This will also save time for calculations/adjustments made to the data later on.

Only the required columns were kept in the final dataframe from this process: 'id', 'location', 'company', 'bio', 'follower_list', 'following_list' and 'repo_list'.

In [None]:
def _has_items(value):
    """Return True when *value* is a non-empty sized container.

    Missing cells (None or float NaN) have no length and count as empty,
    so the check never raises on them.
    """
    try:
        return len(value) > 0
    except TypeError:
        return False


# Running total of rows read so far (progress indicator only)
count = 0

# Initialise list of dataframes to be kept
chunk_list = []

# Only keep specified columns
kept_columns = ["id", "location", "company", "bio", "follower_list", "following_list", "repo_list"]

# Loop through all chunks
for chunk in chunks:
    # Count the rows actually read so the figure stays right for the final,
    # usually shorter, chunk.
    count += len(chunk)

    # `_has_items` already rejects missing lists, so no separate notna()
    # test is needed for following_list / repo_list.
    keep = (
        (chunk["is_suspicious"] == False)  # noqa: E712 (element-wise comparison)
        & (chunk["type"] == "User")
        & chunk["bio"].notna()
        & chunk["following_list"].map(_has_items)
        & chunk["repo_list"].map(_has_items)
    )
    chunk_list.append(chunk.loc[keep, kept_columns])
    print(count)

df = pd.concat(chunk_list)

df.to_csv("../data/filtered_data.csv", encoding='utf-8', index=False)

### Altering Data Columns

In [None]:
def _unique_languages(repos):
    """Return the distinct, non-null languages of *repos* in first-seen order.

    Assumes each element of *repos* is a dict-like repo record with an
    optional "language" key. Order is kept stable (unlike a set) so that
    re-running the notebook produces the same output.
    """
    languages = (repo.get("language") for repo in repos)
    return list(dict.fromkeys(lang for lang in languages if lang is not None))


# Replace every user's list of repo records with the languages used in them.
df["repo_list"] = df["repo_list"].map(_unique_languages)

df.rename(columns={"repo_list": "languages"}, inplace=True)

# Drop users for whom no language could be determined.
df = df.loc[df["languages"].map(len) > 0]

In [195]:
df = df[df["location"].notna()]

#### Removing absent users from following_list and follower_list

In [5]:
def existing_users(df, col):
    """Drop ids that are not present in ``df["id"]`` from every list in *col*.

    The frame is modified in place: each list in column *col* is replaced by
    a new list holding only the ids that also occur in ``df["id"]`` (order
    and duplicates preserved).

    Args:
        df: DataFrame with an "id" column and a list-valued column *col*.
            Ids are assumed to be hashable (e.g. integers).
        col: Name of the list-valued column to clean, e.g. "following_list".

    Returns:
        A DataFrame containing one copy of the matching user row(s) for every
        id that was found, in encounter order. An empty frame with the same
        columns is returned when no id matched.
    """
    # Build the id -> row-position lookup once instead of scanning the whole
    # frame for every single followed id.
    positions_by_id = {}
    for position, user_id in enumerate(df["id"]):
        positions_by_id.setdefault(user_id, []).append(position)

    found_users_list = []
    for i in df.index:
        kept = []
        for item in df.at[i, col]:
            positions = positions_by_id.get(item)
            if positions is None:
                continue  # id not in the dataset: drop it from the list
            found_users_list.append(df.iloc[positions])
            kept.append(item)
        df.at[i, col] = kept

    if not found_users_list:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df.iloc[0:0]
    return pd.concat(found_users_list)



In [None]:
existing_followed_users = existing_users(df, "following_list")
# existing_following_users = existing_users(df, "follower_list")

In [24]:
# Keep only users who still follow at least one user present in the dataset.
still_follows_someone = df["following_list"].map(len) > 0
df = df.loc[still_follows_someone]

In [6]:
# Turn each language into a "[lang_X]" token and join the tokens with spaces.
df["languages_str"] = df["languages"].map(
    lambda langs: " ".join(f"[lang_{lang}]" for lang in langs)
)

# Same idea for the location; users without a location get an empty string.
location_tokens = "[loc_" + df["location"] + "]"
df["location_str"] = location_tokens.fillna("")

# Text fed to the vectoriser: free-text bio followed by the language tokens.
df["clean_input"] = df["bio"] + " " + df["languages_str"]

### Extracting Job Titles

In [None]:
# Load the reference list of job titles; the context manager closes the file.
with open('job_titles.json', encoding="utf-8") as f:
    job_data = json.load(f)
job_data = job_data["job-titles"]

# Lower-case the titles once so they can be matched against the lower-cased
# bio (titles containing capitals could never match before).
job_titles_lower = [str(title).lower() for title in job_data]

# Number of rows processed so far (progress indicator only)
count = 0


def extract_job(row):
    """Return the longest known job title contained in the row's bio.

    Matching is a case-insensitive substring test against ``row["bio"]``.
    Each call also increments and prints the global progress counter.

    Args:
        row: Mapping/Series with a "bio" entry.

    Returns:
        The lower-cased job title (the first one wins among equally long
        matches), or None when no title occurs in the bio.
    """
    global count
    count += 1
    print(count)

    bio = str(row["bio"]).lower()
    matches = [title for title in job_titles_lower if title in bio]
    return max(matches, key=len, default=None)


df['job'] = df.apply(extract_job, axis=1)

### Lat/Long Columns

In [None]:
locations_dict = {}

In [None]:
# Number of located users processed so far (progress indicator only)
count = 0

geocode_url = "https://geocode.maps.co/search"

# Only users with a location can be geocoded. Boolean indexing already
# returns a new frame, so no explicit copy of the whole dataset is needed.
location_notna_df = df[df["location"].notna()]

for i, location in location_notna_df["location"].items():
    count += 1
    cache_key = location.lower()

    if cache_key in locations_dict:
        # Same location text seen before: reuse the stored answer.
        print(f"FOUND: {location}")
        match = locations_dict[cache_key]
    else:
        match = None
        try:
            # `params` URL-encodes the free-text location (spaces, '&', '#', ...)
            # and the timeout stops one stalled request from hanging the loop.
            response = requests.get(geocode_url, params={"q": location}, timeout=10)
        except requests.RequestException as err:
            print(f"Geocoding request failed for {location!r}: {err}")
        else:
            if response.status_code == 200:
                results = response.json()
                if results:
                    best = results[0]
                    match = {
                        "lat": best["lat"],
                        "lon": best["lon"],
                        "new_location": best["display_name"],
                    }
                    # Only successful lookups are cached, so failed ones are
                    # retried the next time the same text appears.
                    locations_dict[cache_key] = match
        # Pause after every real request to stay under the service rate limit.
        time.sleep(0.6)

    if match is None:
        df.at[i, 'lat'] = None
        df.at[i, 'lon'] = None
    else:
        df.at[i, 'lat'] = match["lat"]
        df.at[i, 'lon'] = match["lon"]
        df.at[i, 'new_location'] = match["new_location"]

    print(count)

In [26]:
df.to_csv("../data/data.csv", encoding='utf-8', index=False)

### Load cleaned data

In [2]:
# Parse the list-valued columns back from their CSV text form.
# NOTE(review): pd.eval evaluates the cell text as an expression, so only load
# files you trust. ast.literal_eval (already used for "languages") would be a
# stricter parser - TODO confirm it handles this file's list format.
df = pd.read_csv("../data/data_3.csv", delimiter=',', converters={"follower_list": pd.eval, "following_list": pd.eval, "languages": literal_eval})

# Normalise column names: trimmed, lower-case, underscores, no parentheses.
# regex=False makes the literal replacement explicit, so '(' is never read as
# a regular expression, whatever the pandas default is.
df.columns = (df.columns.str.strip().str.lower()
              .str.replace(' ', '_', regex=False)
              .str.replace('(', '', regex=False)
              .str.replace(')', '', regex=False))

# The pd.eval converter yields objects exposing .tolist(); convert them to
# plain Python lists.
df["follower_list"] = df["follower_list"].apply(lambda x: x.tolist())
df["following_list"] = df["following_list"].apply(lambda x: x.tolist())

  df.columns = (df.columns.str.strip().str.lower()
  df.columns = (df.columns.str.strip().str.lower()


## Data Analysis

### Wordcloud
Wordcloud generated from the bio descriptions of all users. It shows that terms like 'Computer Science', 'Developer' and 'University' are extremely common across bios, which means we probably don't want to recommend users based on these keywords.

In [None]:
# Concatenate every bio (each followed by a space) in one pass; repeated
# `+=` on a growing string is quadratic in the worst case.
text = "".join(str(bio) + " " for bio in df["bio"])

plt.figure(figsize=(8, 8))

# Circular mask on a 300x300 grid: cells outside the radius-130 circle get
# the value 255, cells inside get 0.
x, y = np.ogrid[:300, :300]

mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)

wc = WordCloud(
    width=500,
    height=500,
    background_color="white",
    min_font_size=6,
    repeat=True,
    mask=mask,
)
wc.generate(text)

plt.axis("off")
plt.imshow(wc, interpolation="bilinear")
plt.title("Most Used Words", fontsize=20)

### Visualisation of User Jobs

In [None]:
# Split users into three groups based on the extracted "job" column:
# students, users with any other job title, and users with no job detected.
is_student = df["job"] == "student"
has_other_job = df["job"].notna() & ~is_student

number_of_students = int(is_student.sum())
number_of_professionals = int(has_other_job.sum())
number_of_unemployed = len(df.index) - (number_of_students + number_of_professionals)

import matplotlib.pyplot as plt

categories = ['Student', 'Professional', 'Unemployed']
totals = [number_of_students, number_of_professionals, number_of_unemployed]

# Axes spanning the whole figure, one bar per group.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(categories, totals)
plt.show()

In [None]:
# The 20 most frequent job titles (value_counts sorts by frequency, descending).
all_jobs = df["job"].value_counts().head(20)

labels = all_jobs.index
sizes = all_jobs.tolist()

# One wedge per job title, sized by the number of users holding it.
fig, ax = plt.subplots(figsize=(8, 8))
wedges, texts = ax.pie(sizes, labels=labels)
plt.show()


### Plot of User Geographic Locations

In [3]:
# Users with a geocoded location; copy so the extra columns do not touch df.
map_plot_df = df[df["new_location"].notna()].copy()

# Per-column transforms instead of row-wise apply(axis=1): faster, and they
# still return a Series when the frame is empty.
# The country is the text after the last ", " of the geocoder's display name.
map_plot_df["country"] = map_plot_df["new_location"].map(
    lambda name: name.rsplit(", ", 1)[-1]
)
map_plot_df["follower_count"] = map_plot_df["follower_list"].map(len)
map_plot_df["following_count"] = map_plot_df["following_list"].map(len)

In [4]:
%pip install plotly
%pip install nbformat 
import plotly.express as px

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Marker size grows with the number of followed users; the +1 keeps users
# with a count of zero visible.
marker_sizes = map_plot_df["following_count"] + 1

fig = px.scatter_geo(
    map_plot_df,
    lat="lat",
    lon="lon",
    hover_name="id",
    size=marker_sizes,
)

# Centred title, borderless single-colour markers, country outlines.
fig.update_layout(title="World Map", title_x=0.5)
fig.update_traces(marker={"line": {"width": 0}, "color": "#157D9D"})
fig.update_geos(showcountries=True, countrycolor="Grey", resolution=110)
fig.show()

## Recommender System

In [60]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer#
# rs_df = df.copy()[["id", "following_list"]][0:500]

In [None]:
existing_users(df, "following_list")

In [9]:
from sklearn.metrics.pairwise import nan_euclidean_distances
from sklearn.preprocessing import normalize
from scipy.spatial.distance import cdist

def merge(list1, list2):
    """Pair up the elements of two sequences position by position.

    Returns a list with one ``(list1[k], list2[k])`` tuple for every position
    of *list1*. *list2* must be at least as long as *list1*. Values are passed
    through untouched (NaNs included).
    """
    return [(first, list2[position]) for position, first in enumerate(list1)]

# Pairwise distance between every pair of users, based on their (lat, lon).
# NOTE(review): this builds a dense n x n float64 matrix; the MemoryError
# captured below this cell shows it needs ~5 GiB for 26,033 users, so the
# cell does not complete on that dataset.
# NOTE(review): plain Euclidean distance on latitude/longitude degrees is only
# a rough proxy for geographic distance - TODO confirm this is intended.
lat_lon_values = merge(df["lat"].to_numpy(), df["lon"].to_numpy())
distances_matrix = nan_euclidean_distances(lat_lon_values, lat_lon_values)
# Replace any remaining NaN distances in place (the return value is unused
# because copy=False modifies distances_matrix directly).
np.nan_to_num(distances_matrix, copy=False)
# Scale every row so that its absolute values sum to 1.
normed_dist = normalize(distances_matrix, norm="l1")




MemoryError: Unable to allocate 5.05 GiB for an array with shape (26033, 26033) and data type float64

### Content-based Filtering

In [112]:
# TF-IDF model over the combined bio + language-token text: accents are
# stripped, English stop words removed, and terms occurring in fewer than
# 3 documents ignored (min_df=3).
vec = TfidfVectorizer(strip_accents="unicode", stop_words="english", min_df=3)
# One TF-IDF row per row of df, in df's order; the functions below index
# `vecs` by row position.
vecs = vec.fit_transform(df["clean_input"])

In [113]:
def similarity_by_content(sim):
    """Return the five best ``(position, score)`` pairs of *sim*.

    *sim* is any iterable of scores. Pairs are ordered from highest to lowest
    score; equal scores keep their original relative order.
    """
    ranked = list(enumerate(sim))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:5]

In [114]:
# Using User 2000

from sklearn.preprocessing import normalize

# def normalize(arr, t_min, t_max):
#     norm_arr = []
#     diff = t_max - t_min
#     diff_arr = max(arr) - min(arr)   
#     for i in arr:
#         temp = (((i - min(arr))*diff)/diff_arr) + t_min
#         norm_arr.append(temp)
#     return norm_arr

# 20% weight for each user
def top_5_similarities(following_list, weight=1):
    """Sum the weighted content similarity of all users to each followed user.

    For every row position in *following_list*, the cosine similarity between
    that user's TF-IDF row and all rows of ``vecs`` is computed, the followed
    user's own entry is zeroed, and the result is scaled by *weight*.

    Returns the sum of those similarity columns, or the integer 0 when
    *following_list* is empty.
    """
    total = 0
    for followed_pos in following_list:
        scores = cosine_similarity(vecs, vecs[followed_pos])
        # A followed user must not be recommended because of itself.
        scores[followed_pos] = 0
        # Only the content similarity contributes; the location-distance
        # term is currently disabled.
        total = total + scores * weight
    return total


def get_recent_follows(user):
    """Return the index labels of the users that *user* follows.

    *user* is an index label of ``df``. Followed ids that do not occur in
    ``df["id"]`` are skipped.
    """
    labels = []
    for followed_id in df.loc[user].following_list:
        matches = df[df["id"] == followed_id]
        if not matches.empty:
            labels.append(matches.index[0])
    return labels

# user_sim = top_5_similarities(get_recent_follows(16000), 0.2)

# similarity_by_content(user_sim)


[(11415, array([0.23369523])),
 (23452, array([0.23003008])),
 (17489, array([0.22491517])),
 (4245, array([0.22305085])),
 (12414, array([0.21943793]))]

In [34]:
df.iloc[3478]

id                                                                    26869558
location                                                       Berlin, Germany
company                                                                    NaN
bio                                            Full Stack JavaScript Developer
follower_list      [12866783, 26869552, 17891156, 12187795, 6364656, 29039300]
following_list                            [11701, 6364656, 12187795, 12866783]
languages                                                    [JavaScript, CSS]
languages_str                                     [lang_JavaScript] [lang_CSS]
location_str                                             [loc_Berlin, Germany]
clean_input       Full Stack JavaScript Developer [lang_JavaScript] [lang_CSS]
lat                                                                  52.517037
lon                                                                   13.38886
new_location                                        

In [11]:
# df.iloc[2000].following_list
df.iloc[6000].following_list

def get_idx_by_id(user):
    """Return the index labels of the users followed by the user at row *user*.

    *user* is a row position of ``df``. Followed ids that are not present in
    ``df["id"]`` are skipped instead of raising IndexError, matching
    ``get_recent_follows``.
    """
    following_list = df.iloc[user].following_list
    labels = []
    for followed_id in following_list:
        matches = df[df["id"] == followed_id]
        if not matches.empty:
            labels.append(matches.index[0])
    return labels

print(get_idx_by_id(6000))

[15944, 12014, 7422, 12475, 15616, 6047, 7605, 3403, 1588, 3702]


### Collaborative Filtering

In [24]:
rs_df_exploded = rs_df["following_list"].explode()

In [25]:
cf_df = pd.get_dummies(rs_df_exploded).groupby(level=0).sum()

  cf_df = pd.get_dummies(rs_df_exploded).groupby(level=0).sum()


In [3]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
# Create a MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Fit the object on the following column and transform it into a 0/1 indicator
# matrix: one row per user, one column per distinct followed id.
# NOTE: despite the variable name, the result is a dense array (see the
# array(...) output of the small example cell below), which can be very large
# for the full dataset.
sparse_matrix = mlb.fit_transform(df['following_list'])



In [29]:
test_df = pd.DataFrame({"id": [1, 2, 3, 4, 5], "following_list": [[3, 2], [], [1], [5, 3, 1], []]})
sparse_matrix = mlb.fit_transform(test_df['following_list'])

sparse_matrix

array([[0, 1, 1, 0],
       [0, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 1, 1],
       [0, 0, 0, 0]])

In [99]:

from scipy.sparse import csr_matrix

# Row position of every user id within df.
id_dict = {user_id: position for position, user_id in enumerate(df['id'])}

# Collect one (row, column) coordinate per "user follows user" relation where
# both ends are present in the dataset.
follower_rows = []
followed_cols = []
for user_id, following in zip(df['id'], df['following_list']):
    row_index = id_dict[user_id]
    for followed_id in following:
        col_index = id_dict.get(followed_id)
        if col_index is not None:
            follower_rows.append(row_index)
            followed_cols.append(col_index)

# Square users x users matrix with a 1 for every follow relation.
n_users = len(df)
n_items = len(df)
sparse_matrix = csr_matrix(
    (
        np.ones(len(follower_rows), dtype=np.int32),
        (np.array(follower_rows), np.array(followed_cols)),
    ),
    shape=(n_users, n_items),
    dtype=np.int32,
)

In [129]:
def recommend_by_following(userId):
    """Return the five users whose follow lists are most similar to *userId*'s.

    *userId* is a row position of ``sparse_matrix``. The result is a list of
    ``(position, similarity)`` pairs, best first, never containing the user.
    """
    profile_cs = cosine_similarity(sparse_matrix, sparse_matrix[userId])
    # Drop the user's own row first; the sort is stable, so the ranking of
    # everybody else is unaffected.
    candidates = [
        (position, score)
        for position, score in enumerate(profile_cs)
        if position != userId
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:5]

# recommend_by_following(6000)

In [130]:
recommend_by_following(100)

[(6871, array([0.40824829])),
 (0, array([0.])),
 (1, array([0.])),
 (2, array([0.])),
 (3, array([0.]))]

In [149]:
def recommend(user):
    """Return combined collaborative and content-based recommendations.

    Args:
        user: Row position / index label of the user in ``df`` (the helpers
            called here index ``df`` and the similarity matrices with it).

    Returns:
        A list of ``(position, score)`` pairs: the collaborative-filtering
        results followed by the content-based results. Both lists are also
        printed.

    NOTE(review): the original last line was an unfinished comprehension
    (a syntax error). Concatenating the two lists is an assumption about the
    intended combination - confirm, and consider de-duplicating users that
    appear in both.
    """
    # Each followed user contributes 20% of its content similarity.
    user_sim = top_5_similarities(get_recent_follows(user), 0.2)
    collaborative_f = recommend_by_following(user)
    content_f = similarity_by_content(user_sim)
    print(collaborative_f)
    print(content_f)
    return collaborative_f + content_f
recommend(7)

[(419, array([0.51639778])), (9232, array([0.4472136])), (9328, array([0.4472136])), (11942, array([0.4472136])), (15088, array([0.4472136]))]
[(19791, array([0.40097003])), (11890, array([0.35867227])), (759, array([0.34375507])), (2509, array([0.34199371])), (21407, array([0.33658565]))]


In [148]:
df.iloc[21407]

id                                                                                                                                                    22639736
location                                                                                                                                               Granada
company                                                                                                                                                    NaN
bio                                                                                                                  Mathematics and Computer Engineering. UGR
follower_list     [13903165, 17575711, 23178033, 22728603, 15173583, 11518373, 14114187, 22918075, 24752765, 22836496, 32845717, 23448652, 11302859, 22639789]
following_list                                                                                                                            [11518373, 13903165]
languages                                     

In [None]:
def get_n_recommendations(n, user_id):
    """Return the first *n* recommendations for *user_id* as a DataFrame.

    Expects ``recommend(user_id)`` to return a list of ``(position, score)``
    pairs, where position is a row position of ``df``.

    Args:
        n: Maximum number of recommendations to return.
        user_id: Passed through to ``recommend``.

    Returns:
        DataFrame with the recommended users' "following_list" and their
        "similarity" score, indexed like ``df``. It is empty when there are
        no recommendations.
    """
    top_n_list = recommend(user_id)[:n]
    positions = [pair[0] for pair in top_n_list]
    # Selecting by position also works for an empty list, unlike pd.concat.
    top_n_df = df.iloc[positions][["following_list"]].copy()
    top_n_df["similarity"] = [pair[1] for pair in top_n_list]
    return top_n_df


# NOTE(review): this cell used to call get_top_10(909, sim), which matches no
# function defined in this notebook; calling the function above for the same
# user is assumed to be what was intended.
top_10_df = get_n_recommendations(10, 909)
top_10_df

### Adding a new user

In [None]:

def add_bio(df, text):
    """Append a hard-coded sample user to *df* and score it against all others.

    The vectoriser ``vec`` is re-fitted on ``df["clean_input"]`` before the
    new row is appended, so the returned array holds one cosine similarity
    per row that was in *df* when the function was called.

    NOTE(review): *text* is never used; the appended user is the fixed
    11-value row below, which only fits a frame with exactly 11 columns in
    that order - TODO confirm the expected schema.
    NOTE(review): this mutates both *df* (new row) and the shared ``vec``
    (re-fitted).
    """

    # np.str_ coerces every entry (including missing values) to a string.
    queryTFIDF = vec.fit_transform(df["clean_input"].apply(lambda x: np.str_(x)))
    df.loc[len(df)] = [9999, 9999, 5462462, "Edinburgh", "CodeClan", "I like JavaScript", [], [], ["HTML", "CSS", "JavaScript"], "HTML CSS JavaScript", "I like JavaScript HTML CSS JavaScript"]
    # The row that was just appended.
    new_data = df.iloc[len(df)-1]
    queryTFIDF_2 = vec.transform([new_data["clean_input"]])
    cosine_similarities = cosine_similarity(queryTFIDF, queryTFIDF_2).flatten()
    # # print(cosine_similarities)
    return cosine_similarities
# new_sim = add_bio(professionals_df, "Hello I like web development")
# print(len(new_sim))

new_sim_con = np.vstack((cos_sim, new_sim))
new_sim = np.append(new_sim, 1)


# new_sim = new_sim.reshape(-1, 1)
# print(new_sim)
# sim = np.concatenate((new_sim_con, new_sim), axis=1)

In [None]:
high_follows_df = df.copy()

# One row per user, one 0/1 column per language: explode the language lists,
# one-hot encode them, then add the rows of each user back together.
# `.sum(level=0)` was removed in pandas 2.0; groupby(level=0) is the
# replacement. sort=False keeps the rows in high_follows_df's order, which
# the positional lookups further down rely on.
dum = pd.get_dummies(high_follows_df['languages'].explode()).groupby(level=0, sort=False).sum()

# Pairwise language similarity between all users (dense n x n matrix).
dum_sim = cosine_similarity(dum)


In [None]:
def recommend_by_languages(userId):
    """Rank all other users by language similarity to the user at *userId*.

    Args:
        userId: Row position of the user in ``high_follows_df`` / ``dum_sim``.

    Returns:
        List of ``(position, similarity)`` pairs for every other user, most
        similar first. The user's languages are printed as a side effect.
    """
    print("Languages of recommendee:")
    print(high_follows_df.iloc[userId].languages)
    # Exclude the user by position instead of dropping the first result:
    # another user with identical languages also scores 1.0 and can sort
    # ahead of the user themselves.
    scores = [
        (position, score)
        for position, score in enumerate(dum_sim[userId])
        if position != userId
    ]
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return scores

def get_top_10(rec):
    """Return the first ten recommendations of *rec* as a DataFrame.

    Args:
        rec: List of ``(position, similarity)`` pairs, best first, where
            position is a row position of ``high_follows_df``.

    Returns:
        DataFrame with the recommended users' "languages" and "similarity",
        indexed like ``high_follows_df``. It is empty when *rec* is empty.
    """
    top_10_list = rec[0:10]
    positions = [pair[0] for pair in top_10_list]
    # Selecting by position also works for an empty list, unlike pd.concat.
    top_10_df = high_follows_df.iloc[positions][["languages"]].copy()
    top_10_df["similarity"] = [pair[1] for pair in top_10_list]
    return top_10_df

rec = recommend_by_languages(10000)

print(" ")
print(" ")
print("Recommendations")
print(get_top_10(rec))

In [78]:
# Number of followers per user (length of each follower list).
high_follows_df["follower_count"] = high_follows_df["follower_list"].str.len()
# Order users from fewest to most followers.
# NOTE(review): this reorders the rows of high_follows_df, while dum_sim was
# computed from the frame's earlier order and recommend_by_languages /
# get_top_10 look rows up by position - re-running those after this cell may
# pair scores with the wrong users. TODO confirm the intended cell order.
high_follows_df = high_follows_df.sort_values(by='follower_count', ascending=True)

## Connections Plot

In [419]:
import networkx as nx
import math


# Directed "follower -> followed user" graph.
graph = nx.DiGraph()
sub_graph = high_follows_df

# Every user of the dataset becomes a node first, even without followers.
graph.add_nodes_from(sub_graph["id"])

# Then one edge per follower relation; followers that are not in the dataset
# are added as nodes implicitly by the edge insertion.
graph.add_edges_from(
    (follower, user_id)
    for user_id, followers in zip(sub_graph["id"], sub_graph["follower_list"])
    for follower in followers
)

In [None]:
from matplotlib import pylab



def save_graph(graph,file_name):
    """Draw *graph* with a spring layout and save it to *file_name*.

    Args:
        graph: networkx graph to draw.
        file_name: Output path; the extension selects the image format.
    """
    # Create exactly one figure and draw on its axes explicitly. Previously a
    # second plt.figure(1) call could select a different figure than the
    # 150x150 one just created, and the first figure was never closed.
    fig = plt.figure(figsize=(150, 150), dpi=100)
    try:
        ax = fig.add_subplot(111)
        ax.set_axis_off()
        pos = nx.spring_layout(graph)
        # Colour nodes by insertion order on the Reds colour map.
        nx.draw_networkx_nodes(
            graph, pos, ax=ax, node_color=list(range(len(graph))), cmap=plt.cm.Reds
        )
        nx.draw_networkx_edges(graph, pos, ax=ax, alpha=0.4, arrows=False)
        fig.savefig(file_name)
    finally:
        # Always release the (very large) figure, even if drawing fails.
        plt.close(fig)

#Assuming that the graph g has nodes and edges entered
print(len(graph))
save_graph(graph,"my_graph_7.svg")