# Exploration and Analysis of GitHub User dataset

## Imports/Configuration

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%pip install wordcloud
from wordcloud import WordCloud
from ast import literal_eval
import json
import time
import requests

# Show every column and allow up to 400 characters per cell when frames are displayed
pd.set_option('display.max_columns', None)
pd.set_option("max_colwidth", 400)

Note: you may need to restart the kernel to use updated packages.


## Data Filtering/Cleaning

### Loading Dataset
Reading the json from the 10M GitHub Users dataset from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/T6ZRJT.

Due to the size of the dataset, it has to be loaded in chunks using the read_json() method from pandas.

In [188]:
# With chunksize set, read_json returns an iterator yielding frames of up to 10000 lines each
chunks = pd.read_json("../data/data.json", lines=True, chunksize=10000)

### Filtering Data
Each chunk is iterated over and filtered so that only users meeting all of the following criteria are kept:
1. Not flagged as suspicious
2. Of the 'User' type
3. Has a bio
4. Follows at least 1 other user
5. Has at least 1 repo

The filtering rules were made to be strict so that the best data can be used for providing recommendations, as there is such a large amount of data to extract from. This will also save time for calculations/adjustments made to the data later on.

Only the required columns were kept in the final dataframe from this process, 'id', 'location', 'company', 'bio', 'follower_list', 'following_list' and 'repo_list'.

In [None]:
# Number of raw rows read so far (progress indicator only)
count = 0

# Initialise list of dataframes to be kept
chunk_list = []


def _list_len(value):
    """Return len(value), or 0 for missing entries (None / NaN) that have no length."""
    return len(value) if hasattr(value, "__len__") else 0


# Loop through all chunks.
# NOTE: `chunks` is a one-shot iterator; re-create it before re-running this
# cell, otherwise nothing is read and pd.concat() below raises on an empty list.
for chunk in chunks:
    # Count the rows actually read: the final chunk is usually shorter than 10000
    count += len(chunk)

    # Missing lists count as length 0, so they are rejected by the same test as empty ones
    has_following = chunk["following_list"].map(_list_len) > 0
    has_repos = chunk["repo_list"].map(_list_len) > 0

    keep = (
        has_following
        & has_repos
        & (chunk["is_suspicious"] == False)  # noqa: E712 - element-wise comparison on a Series
        & (chunk["type"] == "User")
        & chunk["bio"].notna()
    )

    # Only keep specified columns
    df_filtered = chunk.loc[
        keep,
        ["id", "location", "company", "bio", "follower_list", "following_list", "repo_list"],
    ]
    print(count)
    chunk_list.append(df_filtered)

df = pd.concat(chunk_list)

df.to_csv("../data/filtered_data.csv", encoding='utf-8', index=False)

### Altering Data Columns

In [None]:
# Collapse each user's repo_list into the distinct, non-null languages of their repos
for row_label in df.index:
    unique_languages = set(repo["language"] for repo in df.at[row_label, "repo_list"])
    df.at[row_label, 'repo_list'] = [lang for lang in unique_languages if lang is not None]
    print(row_label)

# The column now holds language names rather than repositories, so rename it
df.rename(columns={"repo_list": "languages"}, inplace=True)
# Drop users for whom no language could be determined
has_languages = df["languages"].map(len) > 0
df = df.loc[has_languages]

In [195]:
# Keep only users whose location is present
df = df[df["location"].notna()]

#### Removing absent users from following_list and follower_list

In [5]:
def existing_users(df, col):
    """Remove ids of users that are absent from ``df`` from every list in ``df[col]``.

    ``df`` is modified in place: each list in column ``col`` is replaced by a
    new list holding only the ids that occur in ``df["id"]``, in their
    original order.

    Args:
        df: Frame with an ``id`` column and a list-valued column ``col``.
        col: Name of the list column to clean (e.g. ``"following_list"``).

    Returns:
        A new DataFrame with one row per retained reference, in the order the
        references were encountered (a user referenced several times appears
        several times). Rows reflect the cleaned lists. The frame is empty
        when no referenced id exists in ``df``.
    """
    # Index the ids once instead of scanning the whole frame for every followed id
    positions_by_id = {}
    for position, user_id in enumerate(df["id"]):
        positions_by_id.setdefault(user_id, []).append(position)

    found_positions = []
    for i in df.index:
        kept = [item for item in df.at[i, col] if item in positions_by_id]
        for item in kept:
            found_positions.extend(positions_by_id[item])
        df.at[i, col] = kept

    # Positional selection copes with an empty selection, unlike pd.concat([])
    return df.iloc[found_positions]



In [None]:
# Cleans df["following_list"] in place and keeps the rows of the followed users that were found
existing_followed_users = existing_users(df, "following_list")
# existing_following_users = existing_users(df, "follower_list")

In [24]:
# Keep only users whose following_list is non-empty
still_following = df["following_list"].map(len) > 0
df = df.loc[still_following]

In [152]:
# Tag each language as "[lang_<name>]" and join a user's tags into one space-separated string
df["languages_str"] = [
    ' '.join(f"[lang_{name}]" for name in langs) for langs in df["languages"]
]

# Same tagging for the location; a missing location becomes an empty string
location_tag = "[loc_" + df["location"] + "]"
df["location_str"] = location_tag.fillna("")

# Text fed to the recommender: bio followed by the language tags
# (location_str is not part of it)
df["clean_input"] = df["bio"] + " " + df["languages_str"]

### Extracting Job Titles

In [None]:
# Load the list of known job titles; the context manager closes the file once it is parsed
with open('job_titles.json', encoding='utf-8') as f:
    job_data = json.load(f)["job-titles"]

# Progress counter, incremented by extract_job for every processed row
count = 0

def extract_job(row):
    """Return the longest known job title contained in ``row["bio"]``, or None.

    Matching is a case-insensitive substring test of every entry of the
    module-level ``job_data`` list against the bio. The module-level ``count``
    is incremented and printed as a progress indicator.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a ``"bio"`` entry.

    Returns:
        The lower-cased matching title with the most characters, or None when
        no title matches.
    """
    global count
    bio = str(row["bio"]).lower()
    # Lower-case the title *before* testing it, so mixed-case titles can match the lower-cased bio
    extracted_list = [
        title for title in (str(word).lower() for word in job_data) if title in bio
    ]
    count += 1
    print(count)
    if not extracted_list:
        return None
    return max(extracted_list, key=len)

# Row-wise pass over df; 'job' is None where no known title occurs in the bio
df['job'] = df.apply(extract_job, axis=1)

### Lat/Long Columns

In [None]:
# Geocoding cache keyed by lower-cased location string; each value holds lat, lon and new_location.
# NOTE(review): presumably kept in its own cell so that re-running the lookup loop reuses earlier results.
locations_dict = {}

In [None]:
# Number of located users processed so far (progress indicator only)
count = 0

# Only rows that have a location can be geocoded
location_notna_df = df[df["location"].notna()]

for i, location in location_notna_df["location"].items():
    count += 1
    cache_key = location.lower()
    geocoded = locations_dict.get(cache_key)

    if geocoded is None:
        results = []
        try:
            # params= lets requests URL-encode the query (locations may contain '&', '#', ...);
            # the timeout stops a single stalled call from hanging the whole loop
            response = requests.get(
                "https://geocode.maps.co/search", params={"q": location}, timeout=10
            )
            if response.status_code == 200:
                results = response.json()
        except (requests.RequestException, ValueError) as err:
            # Network failure or unparsable body: treat like "no result" and carry on
            print(f"LOOKUP FAILED: {location} ({err})")

        if isinstance(results, list) and results:
            best = results[0]
            geocoded = {"lat": best["lat"], "lon": best["lon"], "new_location": best["display_name"]}
            # Only successful lookups are cached, so failed ones are retried on the next occurrence
            locations_dict[cache_key] = geocoded
        # Pause after every request (presumably to respect the API rate limit)
        time.sleep(0.6)
    else:
        print(f"FOUND: {location}")

    if geocoded is None:
        df.at[i, 'lat'] = None
        df.at[i, 'lon'] = None
        df.at[i, 'new_location'] = None
    else:
        df.at[i, 'lat'] = geocoded["lat"]
        df.at[i, 'lon'] = geocoded["lon"]
        df.at[i, 'new_location'] = geocoded["new_location"]

    print(count)

In [26]:
# Persist the cleaned frame; list-valued columns are written as their string representation
df.to_csv("../data/data.csv", encoding='utf-8', index=False)

### Load cleaned data

In [150]:
# List columns were stored as text in the CSV, so they are parsed back while reading
df = pd.read_csv("../data/data_3.csv", delimiter=',', converters={"follower_list": pd.eval, "following_list": pd.eval, "languages": literal_eval})

# Normalise column names. regex=False makes the replacements literal on every
# pandas version: '(' and ')' are not valid patterns when treated as regexes.
df.columns = (df.columns.str.strip().str.lower()
              .str.replace(' ', '_', regex=False)
              .str.replace('(', '', regex=False)
              .str.replace(')', '', regex=False))

# The pd.eval converter yields array-like values; turn them into plain Python lists
df["follower_list"] = df["follower_list"].apply(lambda x: x.tolist())
df["following_list"] = df["following_list"].apply(lambda x: x.tolist())

  df.columns = (df.columns.str.strip().str.lower()
  df.columns = (df.columns.str.strip().str.lower()


## Data Analysis

### Wordcloud
A wordcloud generated from the bio descriptions of all users. It shows that words such as 'Computer Science', 'Developer' and 'University' are extremely common across bios, which means we probably don't want to recommend users based on these keywords.

In [None]:
# Build the corpus in one pass; str() guards against non-string bios such as NaN
text = " ".join(str(bio) for bio in df["bio"])

plt.figure(figsize=(8, 8))

# Circular mask: every pixel further than 130 from the centre of a 300x300 grid is set to 255
x, y = np.ogrid[:300, :300]

mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)

# NOTE(review): per the WordCloud documentation, width/height are ignored when a mask
# is given and the mask's shape is used instead - confirm before relying on them.
wc = WordCloud(
    width=500,
    height=500,
    background_color="white",
    min_font_size=6,
    repeat=True,
    mask=mask,
)
wc.generate(text)

plt.axis("off")
plt.imshow(wc, interpolation="bilinear")
plt.title("Most Used Words", fontsize=20)

### Visualisation of User Jobs

In [None]:
import matplotlib.pyplot as plt

# Split users into students, users with any other detected job, and users with no detected job
job_column = df["job"]
is_student = job_column == "student"
number_of_students = len(df[is_student].index)
number_of_professionals = len(df[job_column.notna() & ~is_student].index)
number_of_unemployed = len(df.index) - (number_of_students + number_of_professionals)

categories = ['Student', 'Professional', 'Unemployed']
category_counts = [number_of_students, number_of_professionals, number_of_unemployed]

# Axes spanning the whole figure
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.bar(categories, category_counts)
plt.show()

In [None]:
# The twenty most frequent job titles (value_counts orders by frequency, most common first)
all_jobs = df["job"].value_counts().iloc[:20]
labels = all_jobs.index
sizes = all_jobs.values.tolist()

fig, ax = plt.subplots(figsize=(8, 8))

# One wedge per job title, sized by the number of users holding it
wedges, texts = ax.pie(sizes, labels=labels)
plt.show()


### Plot of User Geographic Locations

In [3]:
# Only users with a geocoded location can be placed on the map; copy so df itself is untouched
map_plot_df = df[df["new_location"].notna()].copy()
# The country is the last comma-separated part of the geocoder's display name
map_plot_df["country"] = map_plot_df["new_location"].str.split(", ").str[-1]
# Vectorised list lengths: unlike a row-wise apply these also work on an empty frame
map_plot_df["follower_count"] = map_plot_df["follower_list"].str.len()
map_plot_df["following_count"] = map_plot_df["following_list"].str.len()

In [4]:
%pip install plotly
%pip install nbformat 
import plotly.express as px

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Marker size is the number of followed users plus one
marker_sizes = map_plot_df["following_count"] + 1
fig = px.scatter_geo(map_plot_df, lat="lat", lon="lon", hover_name="id", size=marker_sizes)
fig.update_layout(title="World Map", title_x=0.5)

# Borderless markers in a single colour
marker_style = dict(line=dict(width=0), color="#157D9D")
fig.update_traces(marker=marker_style)

geo_options = {"showcountries": True, "countrycolor": "Grey", "resolution": 110}
fig.update_geos(**geo_options)
fig.show()

## Recommender System

In [156]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer#
# rs_df = df.copy()[["id", "following_list"]][0:500]

In [None]:
# Called for its side effect only: prunes following_list in place (the returned frame is discarded)
existing_users(df, "following_list")

In [9]:
from sklearn.metrics.pairwise import nan_euclidean_distances
from sklearn.preprocessing import normalize
from scipy.spatial.distance import cdist

def merge(list1, list2):
    """Pair up two sequences position by position.

    Returns a list with one ``(list1[k], list2[k])`` tuple for every position
    of ``list1``; ``list2`` must be at least as long as ``list1``.
    """
    return [(first, list2[position]) for position, first in enumerate(list1)]

# One (lat, lon) tuple per row of df
lat_lon_values = merge(df["lat"].to_numpy(), df["lon"].to_numpy())
# Dense pairwise distance matrix with one row and one column per user.
# NOTE(review): the recorded output of this cell is a MemoryError (5.05 GiB for a
# 26033 x 26033 float64 array), so the statements below presumably never completed.
distances_matrix = nan_euclidean_distances(lat_lon_values, lat_lon_values)
# Replace any remaining NaN distances in place
np.nan_to_num(distances_matrix, copy=False)
# L1-normalise the matrix (each row's absolute values then sum to 1)
normed_dist = normalize(distances_matrix, norm="l1")




MemoryError: Unable to allocate 5.05 GiB for an array with shape (26033, 26033) and data type float64

### Content-based Filtering

In [169]:
def similarity_by_content(sim):
    """Return the five best ``(position, score)`` pairs of ``sim``.

    ``sim`` is any iterable of comparable scores. Pairs are ordered from the
    highest score down; equal scores keep their original relative order.
    Fewer than five pairs are returned when ``sim`` is shorter than that.
    """
    ranked = sorted(enumerate(sim), key=lambda pair: pair[1], reverse=True)
    return ranked[:5]

In [170]:
from sklearn.preprocessing import normalize

def top_5_similarities(following_list, weight=1):
    """Sum the weighted TF-IDF cosine similarity of every user to a set of users.

    A TF-IDF model is fitted on ``df["clean_input"]`` on every call. For each
    position in ``following_list`` the similarity of all users to that user is
    computed, the user's similarity to itself is zeroed, and the weighted
    results are added up.

    Args:
        following_list: Row positions (into the TF-IDF matrix) of the users to
            compare against.
        weight: Factor applied to every similarity before summing.

    Returns:
        Array of shape ``(n_users, 1)`` with the accumulated scores; all zeros
        when ``following_list`` is empty.
    """
    vec = TfidfVectorizer(strip_accents="unicode", stop_words="english", min_df=3)
    vecs = vec.fit_transform(df["clean_input"].map(str))

    # Starting from zeros (rather than sum()'s integer 0) keeps the result an
    # array even when there is nobody to compare against
    total = np.zeros((vecs.shape[0], 1))
    for user in following_list:
        sim = cosine_similarity(vecs, vecs[user])
        # A user should not be recommended because they resemble themselves
        sim[user] = 0
        total = total + sim * weight
    return total


def get_recent_follows(user):
    """Return the index labels in ``df`` of the users followed by ``user``.

    Args:
        user: Index label of the user in ``df``.

    Returns:
        For every id in the user's ``following_list`` that occurs in
        ``df["id"]``, the index label of its first matching row, in list
        order. Ids that are not present in ``df`` are skipped.
    """
    following = df.loc[user].following_list
    # One pass over the frame instead of two full scans per followed id;
    # setdefault keeps the first label when an id occurs more than once
    first_label_by_id = {}
    for label, user_id in zip(df.index, df["id"]):
        first_label_by_id.setdefault(user_id, label)
    return [first_label_by_id[i] for i in following if i in first_label_by_id]


### Collaborative Filtering

In [171]:
from scipy.sparse import csr_matrix

def recommend_by_following(userId):
    """Rank users by how similar their followed-user sets are to ``userId``'s.

    Builds a sparse users x users matrix whose entry (r, c) counts how often
    the user at row position r follows the user at position c, then compares
    every row with row ``userId`` using cosine similarity.

    Returns:
        Up to five ``(position, score)`` pairs, best first, never including
        ``userId`` itself.
    """
    total_users = len(df)
    position_of = dict(zip(df['id'], range(total_users)))

    # Coordinates of every "follows" relation whose target is part of the dataset
    rows, cols = [], []
    for user_id, following in zip(df['id'], df['following_list']):
        source = position_of[user_id]
        for followed in following:
            if followed in position_of:
                rows.append(source)
                cols.append(position_of[followed])

    follow_matrix = csr_matrix(
        (np.ones(len(rows), dtype=np.int32), (np.array(rows), np.array(cols))),
        shape=(total_users, total_users),
        dtype=np.int32,
    )

    profile_cs = cosine_similarity(follow_matrix, follow_matrix[userId])
    ranked = sorted(enumerate(profile_cs), key=lambda pair: pair[1], reverse=True)
    others = [pair for pair in ranked if pair[0] != userId]
    return others[0:5]

# recommend_by_following(6000)

In [179]:
def recommend(user):
    """Interleave collaborative and content-based recommendations for ``user``.

    Returns a list alternating between the collaborative result and the
    content-based result of the same rank: collaborative first, then content,
    and so on for as many ranks as both sources provide.
    """
    # Content scores are accumulated over the users that `user` follows, weighted by 0.2
    user_sim = top_5_similarities(get_recent_follows(user), 0.2)
    collaborative_f = recommend_by_following(user)
    content_f = similarity_by_content(user_sim)

    combined = []
    for same_rank_pair in zip(collaborative_f, content_f):
        combined.extend(same_rank_pair)
    return combined
recommend(7)

[(17155, array([0.81649658])),
 (3335, array([0.19997614])),
 (1133, array([0.57735027])),
 (13462, array([0.19647863])),
 (2031, array([0.57735027])),
 (3735, array([0.19122548])),
 (2134, array([0.57735027])),
 (22992, array([0.1893497])),
 (4829, array([0.57735027])),
 (2530, array([0.18898637]))]

In [208]:
def get_n_recommendations(user_id, n=10):
    """Print a user's profile and return a table of their top recommendations.

    Args:
        user_id: Row position of the user in ``df``.
        n: Maximum number of recommendations to return. ``recommend`` yields
            at most ten, so the default keeps all of them.

    Returns:
        DataFrame with the columns ``id``, ``location``, ``bio`` and
        ``similarity``, one row per recommendation in ranking order; empty
        when there is nothing to recommend.
    """
    user_row = df.iloc[user_id]
    print(user_row[["id", "following_list", "location", "clean_input"]])
    # Followed ids on one line, each followed by ", "
    print("".join(f"{i}, " for i in user_row["following_list"]), end='', flush=True)

    top_n = recommend(user_id)[:n]
    positions = [pair[0] for pair in top_n]
    # Positional selection also copes with an empty recommendation list
    top_n_df = df.iloc[positions][["id", "location", "bio"]].copy()
    top_n_df["similarity"] = [pair[1] for pair in top_n]
    return top_n_df[["id", "location", "bio", "similarity"]]

    
top_10_df = get_n_recommendations(32493)
top_10_df

id                                                                                                                                                                                                                                                                                                                                                                                                                        2605401
following_list    [68807, 200477, 202450, 376820, 435209, 506438, 574575, 624770, 686864, 737319, 808759, 1040151, 1255483, 1285850, 1483925, 1519100, 2165397, 2178968, 2593840, 2664036, 2857896, 2866211, 2992033, 3063500, 3258646, 4196457, 4818419, 4857149, 5186093, 5368944, 5621393, 5807827, 6145194, 6207220, 7268597, 7271917, 7500298, 7642826, 7833788, 8012065, 8440900, 8475606, 9081832, 9162319, 9811859, 10...
location                                                                                                                                                            

Unnamed: 0,id,location,bio,similarity
6053,28510687,Indonesia,Full-Stack Developer | Android Developer | Game Developer,[0.5128812124138867]
20333,2040348,"Beijing, China",一个默默无闻的PHP从业者。,[3.486235283024907]
4680,11882869,"Kyoto, Japan","I love doing stuff that I haven't done before, so if you are reading this then you are probably following me. I just want to say thank you ;)",[0.3309652594769213]
537,7944237,beijing,唯有跑步和代码不可辜负……,[3.4832098397712907]
31285,10668756,"Scotland, UK",Freelance IT Consultant/Web Developer \r\nand Data Science Student,[0.26111648393354675]
20753,3054812,Jalandhar,A physiological ambivert ;),[3.4832098397712907]
23142,18615729,Indonesia,Front-end web developer,[0.23836564731139806]
11500,11427457,"Shenzhen, China",🇨🇳 🎵 🏓 ⌨️ 🍵 ☕️,[3.4654373514514574]
32224,13572961,"Tokyo,Japan",dan1122bluered@gmail.com,[0.22613350843332267]
25613,8913358,smvdu,Bughunter,[3.422561355787301]


In [195]:
# Users following more than 20 people (.str.len() gives the length of each list)
df[df["following_list"].str.len() > 20]

Unnamed: 0,id,location,company,bio,follower_list,following_list,languages,languages_str,location_str,clean_input,lat,lon,new_location
283,706206,"Ho Chi Minh City, Vietnam",vmtri.com,"coffee, books, machine learning, deep learning 🚀","[1831367, 2398264, 5716193, 1456842, 10237571, 12580303, 14210885, 10179239, 844215, 11807680, 21350445, 108806, 9999958, 9509132, 6673982, 6514093, 19983012, 15663770, 15168345, 10865446, 5107290, 16810887, 1487658, 17018802, 22243642, 3646388, 20609853, 7142025, 11909428, 16146873, 4859453, 8471029, 5878421, 5334715, 3196970, 10631614, 629029, 2184721, 8535306, 55102, 13154805, 4550606, 2262...","[303270, 654346, 855763, 968172, 1367529, 1610102, 1658742, 1684732, 1890549, 2412413, 3340388, 4473110, 5209532, 6041942, 6784567, 6843015, 6949363, 7101850, 7226751, 7544235, 7935808, 8115763, 8695838, 9861437, 10160626, 10266208, 10503729, 10846896, 11813717, 11946321, 12767206, 12807876, 13488275, 13730405, 14815269, 15006217, 17217068, 18532657, 20217403, 22739177, 24451931, 24511419, 265...","[HTML, Jupyter Notebook]",[lang_HTML] [lang_Jupyter Notebook],"[loc_Ho Chi Minh City, Vietnam]","coffee, books, machine learning, deep learning 🚀 [lang_HTML] [lang_Jupyter Notebook]",10.771551,106.698380,"Thành phố Hồ Chí Minh, Việt Nam"
690,1963546,NYC,,"Blippar, We Are Social","[937826, 8196581, 5754073, 107614, 10654028, 93859, 6816583, 6443378, 662996, 11982209, 6602059, 193030, 1553055, 2405946, 5037675, 534887, 9162757, 273379, 43899, 6207220, 17663473, 12552758, 7314531, 24266815, 1503874, 3369033, 4001940, 8454286, 8229425, 110165, 25972276, 114122, 7587896, 25685337, 24416962, 335121, 17685540, 10056444, 418638, 14251570, 7357679, 9830365, 33904562, 21250813, ...","[31209, 40227, 125889, 346540, 806129, 848087, 1008308, 1405568, 1902353, 2587873, 2704515, 3615408, 3880963, 4227097, 6137136, 6577375, 6948067, 6996668, 8590999, 8775460, 8820723, 9315769, 10132540]","[Objective-C++, Go, Shell, CoffeeScript, C, Scala, Julia, Ruby, C#, Java, C++, OpenEdge ABL, Objective-C, HTML, Lua, CMake, Python, Jupyter Notebook, HCL, JavaScript]",[lang_Objective-C++] [lang_Go] [lang_Shell] [lang_CoffeeScript] [lang_C] [lang_Scala] [lang_Julia] [lang_Ruby] [lang_C#] [lang_Java] [lang_C++] [lang_OpenEdge ABL] [lang_Objective-C] [lang_HTML] [lang_Lua] [lang_CMake] [lang_Python] [lang_Jupyter Notebook] [lang_HCL] [lang_JavaScript],[loc_NYC],"Blippar, We Are Social [lang_Objective-C++] [lang_Go] [lang_Shell] [lang_CoffeeScript] [lang_C] [lang_Scala] [lang_Julia] [lang_Ruby] [lang_C#] [lang_Java] [lang_C++] [lang_OpenEdge ABL] [lang_Objective-C] [lang_HTML] [lang_Lua] [lang_CMake] [lang_Python] [lang_Jupyter Notebook] [lang_HCL] [lang_JavaScript]",40.712728,-74.006015,"New York, United States"
727,1480950,Beijing,,ideaOS,"[1324616, 12042518, 5877145, 5772211, 4069972, 9153294, 17778197, 16549630, 20633364, 892742, 18028533, 3348522, 7036706, 29301838, 8679134, 4165054, 3730716, 12518061, 418638, 1774898, 14251570, 7357679, 23460812, 31426614, 33884947, 1464084, 6252528, 3275586]","[7958, 62295, 251980, 474246, 506025, 836893, 882822, 1158274, 1468993, 1606842, 1814071, 2075801, 3436659, 3991481, 5007149, 6072743, 6110163, 6186284, 6859696, 7480780, 7744927, 7770035, 8268122, 8285047, 8390081, 11258736, 11262741, 12420577, 12581809, 12692358, 12775019, 13112992, 13552664, 16360684, 17438617]","[CSS, Python, PHP, Swift, Objective-C, JavaScript, HTML, Vue]",[lang_CSS] [lang_Python] [lang_PHP] [lang_Swift] [lang_Objective-C] [lang_JavaScript] [lang_HTML] [lang_Vue],[loc_Beijing],ideaOS [lang_CSS] [lang_Python] [lang_PHP] [lang_Swift] [lang_Objective-C] [lang_JavaScript] [lang_HTML] [lang_Vue],39.906217,116.391276,"北京市, 东城区, 北京市, 100010, 中国"
798,8888188,"Hollidaysburg, PA",TechHiring.com,Job platform for Computer Science & Information Technology. Providing services & resources to assist job seekers in the process of finding career opportunities,"[5877145, 14025333, 9084699, 11949511, 6241939, 5231771, 302192, 5664768, 9999958, 427220, 23435604, 12578931, 4219058, 25875040, 225633, 16832353, 6903227, 10283811, 15067935, 10950817, 17304286, 12629649, 18502952, 10980722, 22209744, 8057829, 8644248, 22035945, 19317913, 15186858, 418638, 10523517, 6508763, 8969346, 14251570, 31850587, 18353464, 28358166, 25314586, 9143084, 21978330, 127039...","[29512, 212058, 225633, 365914, 586511, 602559, 1273691, 2440089, 2871552, 2942664, 3657251, 3946178, 3988672, 4048656, 5664768, 5963934, 6392644, 6490972, 6505165, 6961979, 7518001, 8008527, 8451336, 8646214, 8833582, 9338658, 9370448, 9439807, 10606399, 10688959, 11710043, 11970000, 12703975, 13250774, 13984734, 15067935, 15173843, 15948501, 16740388, 17831460, 18502952, 19291544, 21963725, ...","[Java, CSS, C++, Python, PHP, HTML, Ruby, JavaScript, C#]",[lang_Java] [lang_CSS] [lang_C++] [lang_Python] [lang_PHP] [lang_HTML] [lang_Ruby] [lang_JavaScript] [lang_C#],"[loc_Hollidaysburg, PA]",Job platform for Computer Science & Information Technology. Providing services & resources to assist job seekers in the process of finding career opportunities [lang_Java] [lang_CSS] [lang_C++] [lang_Python] [lang_PHP] [lang_HTML] [lang_Ruby] [lang_JavaScript] [lang_C#],40.430774,-78.390083,"Hollidaysburg, Blair County, Pennsylvania, United States"
918,12510899,Nairobi,,"I'm an optimistic ,ambitious person who really enjoys programming and web development.Python Developer","[13433385, 13447301, 2420182, 10762793, 12864384, 2166051, 7640693, 4698934, 2271973, 16535377, 5633006, 25636288, 7533831, 5968269, 17765176, 26500955, 18521977, 6360991, 27367862, 19211407, 242966, 586490, 28964409, 20838437, 17453166, 25599665, 937340, 14915525, 1010556, 6210056, 418638, 17970203, 29044688, 30701246, 32088504, 29328923, 25929791, 14251570, 8969346, 21126731, 29545757, 30392...","[121679, 253434, 294861, 500169, 586490, 1761197, 2040614, 3388731, 3425066, 5135662, 5353178, 5633006, 5779791, 5803250, 5856560, 5968269, 5986979, 6486733, 6711217, 7844509, 10030028, 11217077, 11882869, 12715894, 16223682, 16631356, 17286899, 17970203, 18420962, 18521977, 19430113, 19547833, 19914657, 20207643, 22425140, 22851567, 23236275, 25636288, 26159140, 26257001, 27367862, 28865739, ...","[Java, CSS, Makefile, C++, Go, Python, PHP, HTML, PLpgSQL, Ruby, JavaScript, C]",[lang_Java] [lang_CSS] [lang_Makefile] [lang_C++] [lang_Go] [lang_Python] [lang_PHP] [lang_HTML] [lang_PLpgSQL] [lang_Ruby] [lang_JavaScript] [lang_C],[loc_Nairobi],"I'm an optimistic ,ambitious person who really enjoys programming and web development.Python Developer [lang_Java] [lang_CSS] [lang_Makefile] [lang_C++] [lang_Go] [lang_Python] [lang_PHP] [lang_HTML] [lang_PLpgSQL] [lang_Ruby] [lang_JavaScript] [lang_C]",-1.283253,36.817245,"Nairobi, Kenya"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32065,6948067,Rajkot,,"Indian,Tech. Entrepreneur, Android, Ios, Windows, Opensource Stuff, Design,IOT,Beacon,","[4572642, 1931778, 7663246, 5696448, 5872045, 9447028, 5705888, 10236771, 4267266, 10480629, 2400629, 1597860, 5443125, 6527398, 6134885, 10432282, 4519354, 5147253, 4577029, 4647461, 10930303, 8478702, 5053620, 11643472, 9879836, 10356431, 7036706, 8563626, 12166229, 1940316, 5113478, 4216947, 11497588, 4253712, 7369323, 12608421, 2311972, 5143361, 6658179, 8357630, 7982834, 12504512, 1185275...","[306, 6264, 8944, 16604, 18065, 18282, 19003, 23027, 24283, 25312, 25424, 50357, 53433, 54480, 55348, 75418, 87319, 87605, 92138, 107834, 131628, 146166, 158265, 160168, 163333, 177527, 181342, 190671, 194200, 200448, 220619, 224770, 239384, 242295, 244068, 251980, 264912, 274623, 280190, 290864, 295446, 295870, 304544, 311077, 319726, 324076, 335765, 338824, 343172, 343845, 361649, 368462, 37...","[Dart, Objective-C++, Go, Emacs Lisp, Shell, C, Eagle, Groff, Scala, Makefile, Erlang, Ruby, XSLT, Groovy, C#, Java, CSS, C++, FreeMarker, PHP, Arduino, GCC Machine Description, Objective-C, HTML, Cirru, AutoIt, IDL, Python, TeX, PLSQL, Swift, JavaScript, Kotlin]",[lang_Dart] [lang_Objective-C++] [lang_Go] [lang_Emacs Lisp] [lang_Shell] [lang_C] [lang_Eagle] [lang_Groff] [lang_Scala] [lang_Makefile] [lang_Erlang] [lang_Ruby] [lang_XSLT] [lang_Groovy] [lang_C#] [lang_Java] [lang_CSS] [lang_C++] [lang_FreeMarker] [lang_PHP] [lang_Arduino] [lang_GCC Machine Description] [lang_Objective-C] [lang_HTML] [lang_Cirru] [lang_AutoIt] [lang_IDL] [lang_Python] [lan...,[loc_Rajkot],"Indian,Tech. 
Entrepreneur, Android, Ios, Windows, Opensource Stuff, Design,IOT,Beacon, [lang_Dart] [lang_Objective-C++] [lang_Go] [lang_Emacs Lisp] [lang_Shell] [lang_C] [lang_Eagle] [lang_Groff] [lang_Scala] [lang_Makefile] [lang_Erlang] [lang_Ruby] [lang_XSLT] [lang_Groovy] [lang_C#] [lang_Java] [lang_CSS] [lang_C++] [lang_FreeMarker] [lang_PHP] [lang_Arduino] [lang_GCC Machine Description] ...",22.305326,70.802838,"Rajkot, Rajkot Taluka, Rajkot, Gujarat, 360001, India"
32339,8662447,beijing china,Intel,i'm a Full Stack Developer.I very love my cat.I like to collect materials for all kinds of development needs.,"[2678063, 18158726, 6673982, 7576876, 5790649, 22917484, 23097360, 7347813, 20527108, 19618339, 1493571, 2184721, 25549033, 9761138, 848817, 20694445, 19061261, 3076393, 28582405, 4520412, 1587870, 19382312, 16503228, 11600822, 4224436, 6684599, 11816788, 76903, 418638, 861292, 4980738, 529003, 10236771, 5789813, 6508763, 31426614, 202450, 14251570, 12897006, 11921874, 16574582, 3633628, 32831...","[9520, 29411, 29839, 39294, 57670, 76903, 95989, 100993, 109408, 139364, 202450, 281000, 303752, 345050, 404105, 435118, 559262, 570861, 582313, 610615, 952574, 1051085, 1075861, 1120875, 1169715, 1204591, 1235813, 1236304, 1240920, 1329093, 1329974, 1332069, 1443513, 1468993, 1489725, 1533128, 1640839, 1650692, 1684732, 2174072, 2768977, 2857896, 3074688, 3110267, 3206028, 3245280, 4060222, 4...","[VimL, Emacs Lisp, API Blueprint, Scheme, Brainfuck, Perl, Nim, Scala, ooc, ActionScript, Forth, Oz, C++, PHP, KiCad, Objective-C, IDL, Swift, Vala, Max, Cuda, Verilog, SMT, Web Ontology Language, M4, Haxe, Groff, Turing, Makefile, Chapel, Gherkin, R, Julia, TypeScript, Agda, Vim script, CSS, Assembly, Elixir, Arduino, OpenEdge ABL, Mathematica, Lua, Roff, Logos, Factor, Jupyter Notebook, POV-...",[lang_VimL] [lang_Emacs Lisp] [lang_API Blueprint] [lang_Scheme] [lang_Brainfuck] [lang_Perl] [lang_Nim] [lang_Scala] [lang_ooc] [lang_ActionScript] [lang_Forth] [lang_Oz] [lang_C++] [lang_PHP] [lang_KiCad] [lang_Objective-C] [lang_IDL] [lang_Swift] [lang_Vala] [lang_Max] [lang_Cuda] [lang_Verilog] [lang_SMT] [lang_Web Ontology Language] [lang_M4] [lang_Haxe] [lang_Groff] [lang_Turing] [lang_M...,[loc_beijing china],i'm a Full Stack Developer.I very love my cat.I like to collect materials for all kinds of development needs. 
[lang_VimL] [lang_Emacs Lisp] [lang_API Blueprint] [lang_Scheme] [lang_Brainfuck] [lang_Perl] [lang_Nim] [lang_Scala] [lang_ooc] [lang_ActionScript] [lang_Forth] [lang_Oz] [lang_C++] [lang_PHP] [lang_KiCad] [lang_Objective-C] [lang_IDL] [lang_Swift] [lang_Vala] [lang_Max] [lang_Cuda] [...,39.906217,116.391276,"北京市, 东城区, 北京市, 100010, 中国"
32341,4116872,Macau,,"Native Language is Cantonese, A Programmer live in Macau","[4500588, 478921, 861292, 5338746, 5528943, 5397581, 4323584, 526959, 375116, 5516167, 8554998, 9197550, 20527108, 11289349, 7314531, 25685337, 13523904, 3446627, 10236771, 9513894, 3076393, 31426614, 14251570, 30257640, 14243883, 418638, 9830365, 32831059, 23258790, 13495487, 34407716, 2354218, 26090768, 26328262, 21250813, 7511789, 15853190, 16224591, 29916957, 250445, 15626022, 26346422, 23...","[5849, 10736, 31844, 107834, 343204, 625264, 952574, 1067743, 1128933, 1658742, 1730398, 2016679, 2507027, 3731558, 3749170, 3880963, 4643504, 5589040, 6154626, 6158206, 6441756, 6970508, 8012205, 9245070, 9897012, 10172439, 10295295, 11577776, 12696900, 12701036, 13488275, 14179822, 15100009, 15272722, 15798030, 16224591, 16754253, 16767288, 18215991, 22273555, 22428579, 22593407, 22956341, 2...","[Go, Shell, Brainfuck, Verilog, C, LLVM, Groff, Coq, Scala, Matlab, Makefile, Erlang, OCaml, Prolog, Haskell, Ruby, C#, Java, Assembly, C++, Yacc, HTML, Lua, CMake, Python, TeX, Jupyter Notebook, Eagle, Crystal, Rust]",[lang_Go] [lang_Shell] [lang_Brainfuck] [lang_Verilog] [lang_C] [lang_LLVM] [lang_Groff] [lang_Coq] [lang_Scala] [lang_Matlab] [lang_Makefile] [lang_Erlang] [lang_OCaml] [lang_Prolog] [lang_Haskell] [lang_Ruby] [lang_C#] [lang_Java] [lang_Assembly] [lang_C++] [lang_Yacc] [lang_HTML] [lang_Lua] [lang_CMake] [lang_Python] [lang_TeX] [lang_Jupyter Notebook] [lang_Eagle] [lang_Crystal] [lang_Rust],[loc_Macau],"Native Language is Cantonese, A Programmer live in Macau [lang_Go] [lang_Shell] [lang_Brainfuck] [lang_Verilog] [lang_C] [lang_LLVM] [lang_Groff] [lang_Coq] [lang_Scala] [lang_Matlab] [lang_Makefile] [lang_Erlang] [lang_OCaml] [lang_Prolog] [lang_Haskell] [lang_Ruby] [lang_C#] [lang_Java] [lang_Assembly] [lang_C++] [lang_Yacc] [lang_HTML] [lang_Lua] [lang_CMake] [lang_Python] [lang_TeX] [lang_...",22.175760,113.551414,"澳門 Macau, 中国"
32470,2605401,Japan,,I like Win32 API and assembly language.,"[1309148, 2256100, 6120055, 15140871, 2124614, 52265, 17972943, 3207647, 23435604, 7911232, 17880159, 7533831, 20743967, 937340, 7515099, 24356280, 7173528, 5807827, 7551680, 758325, 30332015, 20459035, 2978905, 6207220, 14113526, 11867058, 20284373, 16503228, 18520338, 26343254, 418638, 10482123, 13447250, 8012065, 10179239, 18145625, 11600822, 411852, 1483925, 19571809, 3076393, 5789813, 969...","[68807, 200477, 202450, 376820, 435209, 506438, 574575, 624770, 686864, 737319, 808759, 1040151, 1255483, 1285850, 1483925, 1519100, 2165397, 2178968, 2593840, 2664036, 2857896, 2866211, 2992033, 3063500, 3258646, 4196457, 4818419, 4857149, 5186093, 5368944, 5621393, 5807827, 6145194, 6207220, 7268597, 7271917, 7500298, 7642826, 7833788, 8012065, 8440900, 8475606, 9081832, 9162319, 9811859, 10...","[CSS, PowerShell, C++, Swift, JavaScript, C]",[lang_CSS] [lang_PowerShell] [lang_C++] [lang_Swift] [lang_JavaScript] [lang_C],[loc_Japan],I like Win32 API and assembly language. [lang_CSS] [lang_PowerShell] [lang_C++] [lang_Swift] [lang_JavaScript] [lang_C],36.574844,139.239418,日本


### Adding a new user

In [None]:

def add_bio(df, text):
    """Append a hard-coded sample user to ``df`` and score every earlier user against it.

    ``df`` is modified in place (one row is added). The return value is a flat
    array holding the TF-IDF cosine similarity between each row that existed
    before the append and the appended row.

    NOTE(review): ``text`` is not used in the body. ``vec`` is not defined in
    this function; in the visible notebook it only exists as a local variable
    of ``top_5_similarities``, so this presumably raises NameError unless a
    module-level ``vec`` is defined elsewhere.
    """

    # Fit the vocabulary on the rows present before the new user is added
    queryTFIDF = vec.fit_transform(df["clean_input"].apply(lambda x: np.str_(x)))
    # NOTE(review): hard-coded row of 11 values; assumes df has exactly 11 columns in this order - confirm
    df.loc[len(df)] = [9999, 9999, 5462462, "Edinburgh", "CodeClan", "I like JavaScript", [], [], ["HTML", "CSS", "JavaScript"], "HTML CSS JavaScript", "I like JavaScript HTML CSS JavaScript"]
    new_data = df.iloc[len(df)-1]
    # Vectorise the appended row with the vocabulary fitted above
    queryTFIDF_2 = vec.transform([new_data["clean_input"]])
    cosine_similarities = cosine_similarity(queryTFIDF, queryTFIDF_2).flatten()
    # # print(cosine_similarities)
    return cosine_similarities
# new_sim = add_bio(professionals_df, "Hello I like web development")
# print(len(new_sim))

# NOTE(review): `cos_sim` is not assigned anywhere in the visible notebook and `new_sim` only
# appears in the commented-out add_bio call, so this presumably raises NameError as written.
new_sim_con = np.vstack((cos_sim, new_sim))
new_sim = np.append(new_sim, 1)


# new_sim = new_sim.reshape(-1, 1)
# print(new_sim)
# sim = np.concatenate((new_sim_con, new_sim), axis=1)

In [None]:
high_follows_df = df.copy()

# One-hot encode the languages: explode() gives one row per (user, language) and
# groupby(level=0).sum() folds them back into one indicator row per user.
# (The former .sum(level=0) spelling was removed in pandas 2.0.)
dum = pd.get_dummies(high_follows_df['languages'].explode()).groupby(level=0).sum()
# Dense users x users similarity matrix; NOTE(review): may be very large for the full dataset
dum_sim = cosine_similarity(dum)


In [None]:
def recommend_by_languages(userId):
    """Rank all other users by the similarity of their language sets to ``userId``'s.

    Prints the recommendee's languages, then returns ``(position, score)``
    pairs for every other user, best first.

    Args:
        userId: Non-negative row position of the user in ``high_follows_df``
            and ``dum_sim``.
    """
    print("Languages of recommendee:")
    print(high_follows_df.iloc[userId].languages)
    scores = enumerate(dum_sim[userId])
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    # Exclude the user by position rather than by rank: another user with an
    # identical language set ties at the top and may be sorted ahead of them
    return [pair for pair in sorted_scores if pair[0] != userId]

def get_top_10(rec):
    """Tabulate the first ten ``(position, score)`` pairs of ``rec``.

    Looks each position up in ``high_follows_df`` and returns a frame with the
    user's ``languages`` and the pair's score as ``similarity``.
    """
    leading = rec[0:10]
    rows = [high_follows_df.iloc[pair[0]] for pair in leading]
    # Each looked-up row is a Series; stack them side by side and flip back to one row per user
    top_10_df = pd.concat(rows, axis=1).transpose()
    top_10_df["similarity"] = [pair[1] for pair in leading]
    return top_10_df[["languages", "similarity"]]

rec = recommend_by_languages(10000)

# Two spacer lines between the recommendee's languages and the results table
print(" ", " ", sep="\n")
print("Recommendations")
print(get_top_10(rec))

In [78]:
# Number of followers per user, then order the frame from fewest to most followers
follower_counts = high_follows_df["follower_list"].str.len()
high_follows_df["follower_count"] = follower_counts
high_follows_df = high_follows_df.sort_values('follower_count')

## Connections Plot

In [419]:
import networkx as nx
import math


graph = nx.DiGraph()
sub_graph = high_follows_df

# Register every user as a node first, so users without followers are part of the graph too
graph.add_nodes_from(sub_graph["id"])

# One edge per follow relation, pointing from the follower to the followed user
graph.add_edges_from(
    (follower, user_id)
    for user_id, followers in zip(sub_graph["id"], sub_graph["follower_list"])
    for follower in followers
)

In [None]:
from matplotlib import pylab



def save_graph(graph,file_name):
    """Draw ``graph`` with a spring layout and write the drawing to ``file_name``.

    Args:
        graph: networkx graph to draw; nodes are coloured by their position in
            the graph's node order using the Reds colour map.
        file_name: Output path; the format is chosen by matplotlib from the
            file extension.
    """
    # Keep a handle on the figure created here. Looking it up again through
    # plt.figure(1) would draw on whichever figure happens to have number 1.
    fig = plt.figure(num=None, figsize=(150, 150), dpi=100)
    try:
        plt.axis('off')
        pos = nx.spring_layout(graph)
        nx.draw_networkx_nodes(graph,pos,node_color=range(len(graph)),cmap=plt.cm.Reds)
        nx.draw_networkx_edges(graph,pos,alpha=0.4,arrows=False)
        fig.savefig(file_name)
    finally:
        # Release the (very large) figure even when drawing or saving fails
        plt.close(fig)

#Assuming that the graph g has nodes and edges entered
# len(graph) is the number of nodes
print(len(graph))
save_graph(graph,"my_graph_7.svg")