In [1]:
import json
import numpy
import re
import os
import numpy as np
import gensim
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from gensim.models import Doc2Vec

## Load data

In [2]:
input_file ='../data/result_13k.json'
with open(input_file) as f:
    data = json.load(f)
print(f'repos nums: {len(data)}')

need_to_remove = []
for k,v in data.items():
    if 'No dependency' in v:
        need_to_remove.append(k)
print(f'repos which have no dependency files: {len(need_to_remove)}')

for k in need_to_remove:
    del data[k]
print(f'repos with dependency files: {len(data)}')

rep_list,dep_list = [],[]
for k,v in data.items():
    rep_list.append(k)
    dep_list.append(v)
    
dep_dict = {}
for deps in data.values():
    for i in deps:
        dep_dict[i] = dep_dict.get(i,0)+1

print(f'Distinct dependency file: {len(dep_dict)}',end='\n\n')

repos nums: 3012
repos which have no dependency files: 0
repos with dependency files: 3012
Distinct dependency file: 15663



In [4]:
dep_list,len(dep_list)

([['ipython',
   'jupyter-sphinx',
   'nbformat',
   'nbsphinx',
   'path-py',
   'six',
   'sphinx',
   'sphinx-hoverxref',
   'sphinx-rtd-theme'],
  ['pypng', 'requests'],
  ['bleach',
   'click',
   'docutils',
   'joblib',
   'keras',
   'matplotlib',
   'numpy',
   'opencv-python',
   'pandas',
   'pillow',
   'pkginfo',
   'plotly',
   'pygments',
   'scikit-image',
   'scipy',
   'seaborn',
   'setuptools',
   'tensorboard',
   'tensorflow',
   'tqdm'],
  ['numpy'],
  ['biopython'],
  ['libconfig',
   'matplotlib',
   'numpy',
   'pandas',
   'rstoolbox',
   'scipy',
   'seaborn',
   'transforms3d'],
  ['argparse',
   'astropy',
   'matplotlib',
   'mwa-pb',
   'mwa-voltage',
   'numpy',
   'psrqpy',
   'pytest'],
  ['coverage',
   'mkdocs',
   'mkdocs-material',
   'tox',
   'loguru',
   'mypy',
   'pyrallel-lib',
   'rdflib',
   'redis',
   'rltk',
   'seaborn',
   'sentence-transformers',
   'simplejson',
   'sklearn',
   'sparqlwrapper',
   'tqdm',
   'attrs',
   'etk',
   '

## Doc Embedding

In [5]:
LabeledSentence1 = gensim.models.doc2vec.TaggedDocument
all_content_train = []
j=0
for em in dep_list:
    all_content_train.append(LabeledSentence1(em,[j]))
    j+=1
print('Number of repos processed: ', j)

Number of repos processed:  3012


In [6]:
d2v_model = Doc2Vec(all_content_train, 
                    size = 100, 
                    window = 10, 
                    min_count = 1, 
                    workers=7, 
                    dm = 1,
                    alpha=0.025, 
                    min_alpha=0.001)
d2v_model.train(all_content_train, 
                total_examples=d2v_model.corpus_count, 
                epochs=10, 
                start_alpha=0.002, 
                end_alpha=-0.016)

d2v_model[0]



array([ 0.00458806,  0.03768101,  0.01253925, -0.04197038, -0.02798572,
        0.02302323, -0.03229389, -0.0131154 , -0.06217861, -0.00044862,
        0.0476163 , -0.0166369 ,  0.00543086,  0.01682888,  0.01729672,
        0.00422281,  0.03290429,  0.00039571,  0.02714512, -0.00975056,
       -0.0057607 ,  0.04694798,  0.0007891 , -0.00273268,  0.04533802,
        0.05385239, -0.02966216,  0.01982142, -0.00948914,  0.00283305,
       -0.02114621, -0.00951162, -0.00365996, -0.0178862 ,  0.05752014,
       -0.01462043, -0.02994952,  0.0020511 ,  0.03393682, -0.00038157,
        0.01067804,  0.01912075,  0.01222714,  0.0031634 , -0.00867806,
       -0.04176655, -0.01957559, -0.02376308, -0.01774316,  0.03373904,
       -0.0553568 , -0.00011773,  0.03944528,  0.0190468 ,  0.00881731,
       -0.00263658, -0.0203399 ,  0.00198226, -0.0179656 ,  0.00077303,
        0.0376535 , -0.0106577 ,  0.03003998,  0.00855911, -0.02845825,
        0.00961545,  0.03849403,  0.0047895 ,  0.01076723,  0.04

## Clustering - Kmeans

In [15]:
kmeans_model = KMeans(n_clusters=10, init='k-means++', max_iter=500) 
X = kmeans_model.fit(d2v_model.docvecs.doctag_syn0)
labels=kmeans_model.labels_

  


In [16]:
topic_dict = {}
for index,label in enumerate(labels):
    topic_id = label
    print(topic_id, '--->', rep_list[index])
    
    topic_dict[label] = topic_dict.get(label,[])
    topic_dict[label].append(rep_list[index])

1 ---> https://github.com/AgriculturalModelExchangeInitiative/Crop2ML
1 ---> https://github.com/natcap/natgeo-dams
1 ---> https://github.com/hlgirard/CrystalML
1 ---> https://github.com/houghb/ligpy
1 ---> https://github.com/oschwengers/referenceseeker
1 ---> https://github.com/LPDI-EPFL/trivalent_cocktail
1 ---> https://github.com/CIRA-Pulsars-and-Transients-Group/vcstools
5 ---> https://github.com/usc-isi-i2/kgtk
1 ---> https://github.com/garciagenrique/template_project_escape
8 ---> https://github.com/gwu-libraries/sfm-docker
5 ---> https://github.com/javaparser/javaparser
1 ---> https://github.com/williamjameshandley/anesthetic
1 ---> https://github.com/yardencsGitHub/tweetynet
1 ---> https://github.com/jagalindo/A-Python-QX-implementation
1 ---> https://github.com/jbkinney/mavenn
1 ---> https://github.com/GeoCode-polymtl/Seis_float16
1 ---> https://github.com/sebp/scikit-survival
1 ---> https://github.com/similitude/sumo-simmer
8 ---> https://github.com/GlobalNamesArchitecture/gnr

4 ---> https://github.com/machawk1/warcreate
1 ---> https://github.com/materialsvirtuallab/monty
1 ---> https://github.com/kgullikson88/Telluric-Fitter
1 ---> https://github.com/Urban-Meteorology-Reading/SUEWS
1 ---> https://github.com/serazing/xscale
1 ---> https://github.com/pauleve/mpbn
1 ---> https://github.com/ericmjl/flu-gibson
8 ---> https://github.com/MoritzStefaner/ach-ingen-zell
0 ---> https://github.com/MAGIC-nexus/nis-backend
1 ---> https://github.com/pycalphad/scheil
1 ---> https://github.com/vanheeringen-lab/genomepy
1 ---> https://github.com/deparkes/OOMMFTools
1 ---> https://github.com/sarisabban/RamaNet
1 ---> https://github.com/nano-sippe/dispersion
1 ---> https://github.com/hallamlab/pathway2vec
1 ---> https://github.com/SCM-NV/nano-qmflows
1 ---> https://github.com/bburan/NeuroBehavior
1 ---> https://github.com/PhenixCollaboration/web
1 ---> https://github.com/Bubblbu/crawling-framework
1 ---> https://github.com/smarr/ReBench
1 ---> https://github.com/Alerovere/Pale

1 ---> https://github.com/matt-long/xpersist
1 ---> https://github.com/bast/smeshing
9 ---> https://github.com/Scifabric/pybossa
1 ---> https://github.com/ds-wizard/docs
1 ---> https://github.com/abelcarreras/phonolammps
1 ---> https://github.com/jason-zl190/sisr_medical
1 ---> https://github.com/mdshw5/pyfaidx
8 ---> https://github.com/wiebket/delprocess
1 ---> https://github.com/holoviz/datashader
1 ---> https://github.com/ISA-tools/mzml2isa
1 ---> https://github.com/ajefweiss/HelioSat
1 ---> https://github.com/houghb/savvy
1 ---> https://github.com/fnl/gnamed
1 ---> https://github.com/jjgomera/iapws
1 ---> https://github.com/dicom/rtp-connect
8 ---> https://github.com/moonso/genmod
1 ---> https://github.com/zafarali/emdp
1 ---> https://github.com/phydev/trajpy
1 ---> https://github.com/IMMM-SFA/im3py
1 ---> https://github.com/yadage/yadage-schemas
1 ---> https://github.com/karenadam/Mixed-Bandlimited-Time-Encoding
1 ---> https://github.com/spacetelescope/gwcs
1 ---> https://github.c

1 ---> https://github.com/scikit-hep/boost-histogram
0 ---> https://github.com/vsoch/askci
1 ---> https://github.com/boutiques/boutiques
1 ---> https://github.com/nansencenter/nansat
1 ---> https://github.com/cdanielmachado/reframed
1 ---> https://github.com/LightForm-group/matflow
1 ---> https://github.com/Capitains/Hook
1 ---> https://github.com/arokem/ISBI2015
1 ---> https://github.com/a-slide/NanoSnake
9 ---> https://github.com/laplizard/infoplot
1 ---> https://github.com/hugadams/PAME
1 ---> https://github.com/torressa/cspy
1 ---> https://github.com/ConservationInternational/trends.earth
1 ---> https://github.com/OpenChemistry/avogadrolibs
1 ---> https://github.com/ondrolexa/pywerami
9 ---> https://github.com/Sulstice/datacity
1 ---> https://github.com/bootphon/phonemizer
1 ---> https://github.com/chrisgorgo/alleninf
8 ---> https://github.com/opentox/lazar-rest
6 ---> https://github.com/SeqWare/seqware
1 ---> https://github.com/msmbuilder/osprey
8 ---> https://github.com/urschrei/

1 ---> https://github.com/PySCeS/pysces
1 ---> https://github.com/DamCB/tyssue
8 ---> https://github.com/lsmo-epfl/discover-curated-cofs
1 ---> https://github.com/caltechlibrary/eprints2bags
1 ---> https://github.com/proycon/foliapy
5 ---> https://github.com/bird-house/twitcher
8 ---> https://github.com/ganga-devs/ganga
1 ---> https://github.com/kotik-coder/PULsE
1 ---> https://github.com/mozillazg/pinyin-data
1 ---> https://github.com/MicroPasts/EgyptExplorationSocBuhenPottery
1 ---> https://github.com/luispedro/imread
5 ---> https://github.com/PCMSolver/pcmsolver
8 ---> https://github.com/klout/brickhouse
2 ---> https://github.com/Dash-Industry-Forum/dash.js
1 ---> https://github.com/ProjectDrawdown/spatial-aez
2 ---> https://github.com/cytoscape/cytoscape.js-popper
8 ---> https://github.com/ecohealthalliance/pubcrawler
1 ---> https://github.com/underworldcode/stripy
1 ---> https://github.com/pycroscopy/pyUSID
1 ---> https://github.com/jjnp/dss20-ue1
8 ---> https://github.com/pytroll

8 ---> https://github.com/rinde/pdptw-dataset-generator
8 ---> https://github.com/CambridgeSemiticsLab/BH_time_collocations
1 ---> https://github.com/erwinkendo/polaruob
8 ---> https://github.com/geneontology/obographs
5 ---> https://github.com/speckleworks/SpeckleCore
1 ---> https://github.com/rjw57/videosequence
1 ---> https://github.com/tylerjereddy/diffusion_analysis_MD_simulations
5 ---> https://github.com/luphysics/PyMODA
1 ---> https://github.com/NatLibFi/Annif
1 ---> https://github.com/adbar/trafilatura
1 ---> https://github.com/ForeverZyh/DEBAR
8 ---> https://github.com/danchubb/CanVar
1 ---> https://github.com/JonathonMSmith/growin
1 ---> https://github.com/SoftwareDevEngResearch/flexWecDesignOpt
1 ---> https://github.com/RubenImhoff/Large_Sample_Nowcasting_Evaluation
5 ---> https://github.com/TreeCmp/TreeCmpWEB
1 ---> https://github.com/TaufiqHassan/acccmip6
1 ---> https://github.com/clsb/miles
8 ---> https://github.com/kip-hart/MicroStructPy
1 ---> https://github.com/hls-fp

In [69]:
for k in sorted(topic_dict.keys()):
    print(f'topic {k} : repos num: {len(topic_dict[k])}')

topic 0 : repos num: 2004
topic 1 : repos num: 59
topic 2 : repos num: 37
topic 3 : repos num: 118
topic 4 : repos num: 3
topic 5 : repos num: 202
topic 6 : repos num: 83
topic 7 : repos num: 11
topic 8 : repos num: 36
topic 9 : repos num: 459


## Clustering - GMM

In [82]:
GMM = GaussianMixture(n_components=10).fit(d2v_model.docvecs.doctag_syn0)
probs = GMM.predict_proba(d2v_model.docvecs.doctag_syn0)

probs.shape,probs

  """Entry point for launching an IPython kernel.
  


((3012, 10),
 array([[3.83853395e-005, 1.60064351e-253, 0.00000000e+000, ...,
         5.89829455e-014, 0.00000000e+000, 9.99961615e-001],
        [7.79539839e-006, 2.16666431e-252, 0.00000000e+000, ...,
         1.78288693e-012, 0.00000000e+000, 9.99992205e-001],
        [9.67671120e-006, 0.00000000e+000, 0.00000000e+000, ...,
         4.93476052e-016, 0.00000000e+000, 9.99990323e-001],
        ...,
        [4.93449671e-005, 0.00000000e+000, 0.00000000e+000, ...,
         1.54121993e-014, 0.00000000e+000, 9.99950655e-001],
        [9.69154455e-007, 5.86529082e-279, 0.00000000e+000, ...,
         1.22051508e-017, 0.00000000e+000, 9.99999031e-001],
        [3.26109095e-007, 1.39024139e-312, 0.00000000e+000, ...,
         2.73686617e-019, 0.00000000e+000, 9.99999674e-001]]))