# Imports

In [1]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Loading the dataset

In [2]:
with open('resources/final_dataset.json' , 'r') as f:
    ob = json.load(f)
data = ob['data']
len(data)


20111

# Getting all the languages avaialble

In [6]:
languages = set()
for user in data:
    for repo in user['repo_details']:
        languages.add(repo['language'])

print(languages)

{'VCL', 'Idris', 'SQLPL', 'UnrealScript', 'Imba', 'JSON', 'Vim Snippet', 'Perl', 'Handlebars', 'Nemerle', 'Logos', 'Limbo', 'PureBasic', 'ObjectScript', 'PogoScript', 'QMake', 'Nunjucks', 'Kaitai Struct', 'Io', 'Swift', 'C#', 'Asymptote', 'MDX', 'Mathematica', 'Objective-J', 'SourcePawn', 'Protocol Buffer', 'LOLCODE', 'Nushell', 'Erlang', 'SystemVerilog', 'Dylan', 'CMake', 'xBase', 'Puppet', 'Java', 'DCPU-16 ASM', 'Scala', 'Monkey C', 'Sass', 'Groff', 'MQL4', 'C++', 'YARA', 'CLIPS', 'Fancy', 'Procfile', 'GLSL', 'TL-Verilog', 'TLA', '1C Enterprise', 'J', 'Boo', 'Ada', 'Marko', 'mcfunction', 'LiveScript', 'ABAP', 'ApacheConf', 'OpenSCAD', 'Dhall', 'Bicep', 'Mirah', 'Visual Basic .NET', 'Pony', 'XS', 'Python', 'VimL', 'Nextflow', 'Cool', 'SuperCollider', 'DIGITAL Command Language', 'XC', 'OpenEdge ABL', 'Volt', 'Visual Basic', 'AutoHotkey', 'Isabelle', 'Monkey', 'Xtend', 'Yacc', 'DTrace', 'OCaml', 'Zig', 'LookML', 'Jinja', 'Shen', 'ASL', 'JetBrains MPS', 'ATS', "Cap'n Proto", 'Ragel in Ru

# Data preprocessing
- counting all the aggregate features for labeling

In [3]:
for user in data:
    user['total_stars'] = 0
    user['total_forks'] = 0
    user['total_watchers'] = 0
    user['languages'] = set()
    user['repo_count'] = len(user['repo_details'])
    for repo in user['repo_details'] :
        if repo['language'] is not None:
            user['languages'].add(repo['language'])
        user['total_stars'] += repo['stars']
        user['total_forks'] += repo['forks']
        user['total_watchers'] += repo['watchers']
        

print(data[1])

{'login': 'stiff', 'user_url': 'https://api.github.com/users/stiff', 'html_url': 'https://github.com/stiff', 'repos_url': 'https://api.github.com/users/stiff/repos', 'followers': 14, 'repo_details': [{'language': 'Ruby', 'stars': 0, 'forks': 0, 'watchers': 0}, {'language': 'Ruby', 'stars': 0, 'forks': 0, 'watchers': 0}, {'language': 'Ruby', 'stars': 64, 'forks': 16, 'watchers': 64}, {'language': 'Ruby', 'stars': 1, 'forks': 0, 'watchers': 1}, {'language': 'CoffeeScript', 'stars': 0, 'forks': 0, 'watchers': 0}, {'language': 'JavaScript', 'stars': 0, 'forks': 1, 'watchers': 0}, {'language': 'JavaScript', 'stars': 1, 'forks': 0, 'watchers': 1}, {'language': 'Ruby', 'stars': 1, 'forks': 0, 'watchers': 1}, {'language': 'JavaScript', 'stars': 0, 'forks': 1, 'watchers': 0}, {'language': 'Ruby', 'stars': 1, 'forks': 0, 'watchers': 1}, {'language': 'JavaScript', 'stars': 0, 'forks': 0, 'watchers': 0}, {'language': 'Ruby', 'stars': 6, 'forks': 0, 'watchers': 6}, {'language': 'JavaScript', 'stars

# Removing the samples without any repositories

In [4]:
new_data = [user for user in data if user['repo_details']]
print(len(new_data))


15551


# Splitting the dataset into a small one for labeling

In [5]:
req_data = new_data[:3000]
for user in req_data:
    user['label'] = 0


# Loose labeling logic

In [6]:
top_tier_languages = {'Python','C', 'C++', 'JavaScript', 'TypeScript', 'Ruby', 'Rust', 'Lua', 'Assembly', 'Zig'}
for user in req_data:
    if user['followers'] >= 40 or user['total_stars'] >= 40 or user['total_forks'] >= 40 or user['total_watchers'] >= 40:
        user['label'] = 0
    else:
        if len(user['languages'] & top_tier_languages) >= 4:
            user['label'] = 0
        else:
            if user['repo_count'] >= 10:
                user['label'] = 0
            user['label'] = 1
        user['label'] = 1
print(len([user for user in req_data if user['label']]))

1788


# Converting to dataframes for training

In [11]:
df = pd.DataFrame(req_data)
df.drop(columns=['repo_details']).to_csv('resources/req_dataset.csv', header=True, index=False)

In [12]:
df[['followers', 'total_stars', 'total_forks', 'total_watchers', 'repo_count']]

Unnamed: 0,followers,total_stars,total_forks,total_watchers,repo_count
0,14,84,21,84,21
1,490,56,27,56,30
2,5,0,0,0,3
3,5,8,3,8,12
4,8,13,6,13,9
...,...,...,...,...,...
2995,114,1751,173,1751,25
2996,7,2,0,2,10
2997,13,36,3,36,30
2998,109,36,5,36,30


# Training the model using random forest

In [14]:
X = df[['followers', 'total_stars', 'total_forks', 'total_watchers', 'repo_count']]
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

unlabeled_data = pd.DataFrame(new_data[9000:])
X_unlabeled = unlabeled_data[['followers', 'total_stars', 'total_forks', 'total_watchers', 'repo_count']]
unlabeled_data['label'] = clf.predict(X_unlabeled)

unlabeled_data.to_csv('resources/labeled_data.csv', index=False)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       220
           1       1.00      1.00      1.00       380

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600



# Labeled dataset

In [15]:
new_labeled_data = pd.read_csv('resources/labeled_data.csv')
df = pd.concat([df,new_labeled_data])
df

Unnamed: 0,login,user_url,html_url,repos_url,followers,repo_details,total_stars,total_forks,total_watchers,languages,repo_count,label
0,stiff,https://api.github.com/users/stiff,https://github.com/stiff,https://api.github.com/users/stiff/repos,14,"[{'language': 'Ruby', 'stars': 0, 'forks': 0, ...",84,21,84,"{Python, CoffeeScript, Ruby, JavaScript}",21,0
1,jadonk,https://api.github.com/users/jadonk,https://github.com/jadonk,https://api.github.com/users/jadonk/repos,490,"[{'language': 'JavaScript', 'stars': 3, 'forks...",56,27,56,"{C, C++, PHP, Shell, JavaScript, OpenSCAD, Pyt...",30,0
2,danBerman,https://api.github.com/users/danBerman,https://github.com/danBerman,https://api.github.com/users/danBerman/repos,5,"[{'language': None, 'stars': 0, 'forks': 0, 'w...",0,0,0,{C},3,1
3,afrojas,https://api.github.com/users/afrojas,https://github.com/afrojas,https://api.github.com/users/afrojas/repos,5,"[{'language': 'JavaScript', 'stars': 2, 'forks...",8,3,8,"{Ruby, HTML, JavaScript}",12,1
4,august,https://api.github.com/users/august,https://github.com/august,https://api.github.com/users/august/repos,8,"[{'language': 'JavaScript', 'stars': 0, 'forks...",13,6,13,"{PHP, Objective-C, JavaScript}",9,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6546,dzhang,https://api.github.com/users/dzhang,https://github.com/dzhang,https://api.github.com/users/dzhang/repos,3,"[{'language': 'Python', 'stars': 0, 'forks': 0...",0,0,0,"{'Visual Basic', 'Jupyter Notebook', 'Python'}",5,1
6547,gmiroshnykov,https://api.github.com/users/gmiroshnykov,https://github.com/gmiroshnykov,https://api.github.com/users/gmiroshnykov/repos,47,"[{'language': 'Python', 'stars': 0, 'forks': 0...",36,12,36,{'Python'},30,0
6548,ajbrown,https://api.github.com/users/ajbrown,https://github.com/ajbrown,https://api.github.com/users/ajbrown/repos,26,"[{'language': 'SCSS', 'stars': 0, 'forks': 0, ...",106,66,106,"{'Java', 'SCSS', 'HTML', 'Groovy', 'PHP', 'Jav...",30,0
6549,benpflaum,https://api.github.com/users/benpflaum,https://github.com/benpflaum,https://api.github.com/users/benpflaum/repos,3,"[{'language': 'Python', 'stars': 0, 'forks': 0...",0,0,0,{'Python'},1,1


In [143]:
df.to_csv('resources/labeled_data.csv', index=False)


In [146]:
df.loc[df['label'] == 1].shape[0]

10367

# Sample prediction

In [16]:
clf.predict([[4, 0, 0, 0, 13]])



array([1])