# Exploring Data
This notebook is aimed to explore cargo repo data

In [1]:
import pandas
import matplotlib
import seaborn
import collections
import itertools
import numpy as np
import warnings
import subprocess
import requests


import sys
sys.path.append('..')

from helper import load_data , load_repo , GIT_API
warnings.filterwarnings('ignore')

%matplotlib inline

# Load Cargo

In [2]:
cargo = load_repo('Cargo')

In [3]:
cargo_git = cargo[cargo.Repository_URL.notnull()]
cargo_git = cargo_git[cargo_git.Repository_URL.str.contains('github')]
cargo_git = cargo_git.drop_duplicates(subset='Repository_URL')

# Load Dependencies

In [4]:
packages, dependencies = load_data('Cargo')
dependencies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 302834 entries, 0 to 302833
Data columns (total 4 columns):
package       302834 non-null object
version       302834 non-null object
target        302834 non-null object
constraint    302834 non-null object
dtypes: object(4)
memory usage: 11.6+ MB


All unique cargo packages and adding a column as package-version combination

In [5]:
unique_cargo_packages = packages.drop_duplicates(subset='package',keep = 'last')
unique_cargo_packages['pack_ver'] = unique_cargo_packages['package']+unique_cargo_packages['version']
unique_cargo_packages.head(10)

Unnamed: 0,package,version,date,pack_ver
0,acorn,1.0.0,2014-11-21 01:13:02,acorn1.0.0
1,a,0.0.1,2014-11-21 00:06:54,a0.0.1
9,acacia,0.1.2,2017-02-26 16:14:51,acacia0.1.2
19,abort_on_panic,2.0.0,2017-11-29 21:58:03,abort_on_panic2.0.0
20,adamantium,0.0.1,2014-11-21 06:48:26,adamantium0.0.1
21,aio,0.0.1,2015-01-04 21:30:57,aio0.0.1
29,advapi32-sys,0.2.0,2016-02-08 01:05:06,advapi32-sys0.2.0
49,alfred,4.0.1,2017-11-13 19:46:50,alfred4.0.1
62,algebloat,0.0.8,2015-04-17 03:48:15,algebloat0.0.8
75,algebloat_macros,0.0.8,2015-04-17 03:48:07,algebloat_macros0.0.8


All dependencies

In [7]:
dependencies['pack_ver'] = dependencies['package']+dependencies['version']
all_unique_dependencies = dependencies[dependencies['pack_ver'].isin(unique_cargo_packages['pack_ver'])]
all_unique_dependencies = all_unique_dependencies.drop('pack_ver',axis=1).reset_index().drop('index',axis=1)

In [8]:
all_unique_dependencies.head(10)

Unnamed: 0,package,version,target,constraint
0,acacia,0.1.2,itertools,^0.5
1,acacia,0.1.2,num,^0.1
2,acacia,0.1.2,rand,^0.3
3,aio,0.0.1,event,*
4,aio,0.0.1,mio,*
5,aio,0.0.1,nix,*
6,advapi32-sys,0.2.0,winapi,^0.2.5
7,alfred,4.0.1,serde_json,^1.0
8,algebloat,0.0.8,algebloat_macros,= 0.0.8
9,allegro_acodec,0.0.8,allegro_acodec-sys,= 0.0.8


# Related Git URLs
finding git url that each package contributor communicates with 

In [9]:
repos = cargo_git[['Name','Repository_URL']].set_index('Name')
repos.head()

Unnamed: 0_level_0,Repository_URL
Name,Unnamed: 1_level_1
acacia,https://github.com/aepsil0n/acacia
abort_on_panic,https://github.com/emk/abort_on_panic-rs
aio,https://github.com/reem/rust-aio
advapi32-sys,https://github.com/retep998/winapi-rs
alfred,https://github.com/kballard/alfred-rs


For each package which github repositories data should be retrieved

In [10]:
all_unique_dependencies['target_url'] = repos.loc[all_unique_dependencies['target']].reset_index()[['Repository_URL']]
all_unique_dependencies.head()

Unnamed: 0,package,version,target,constraint,target_url
0,acacia,0.1.2,itertools,^0.5,https://github.com/bluss/rust-itertools
1,acacia,0.1.2,num,^0.1,https://github.com/rust-num/num
2,acacia,0.1.2,rand,^0.3,https://github.com/rust-lang-nursery/rand
3,aio,0.0.1,event,*,https://github.com/reem/rust-event
4,aio,0.0.1,mio,*,https://github.com/carllerche/mio


Dependency origins’ repositories

In [11]:
all_self_repo = repos.loc[all_unique_dependencies['package']].reset_index().Repository_URL.unique()

Dependent package repositories

In [12]:
all_depend_repo = all_unique_dependencies.dropna(subset=['target_url'], how='all').target_url.unique()

# unique repositories
to find all unique repository addresses

In [13]:
all_repos = np.append(all_depend_repo,all_self_repo)
df = pandas.DataFrame(all_repos,columns=['repository'])
final_repo_list = df.drop_duplicates(keep = 'last').dropna().reset_index().drop('index',axis=1).repository.unique()
len(final_repo_list)

7971

# Method 1 
With pygithub the downside is that it takes to much time to (although as I tracked with fiddler, for each repository there is just one request for all commit history from Github.com)

In [23]:
from github import Github

#g = Github("git key")

all_commits = pandas.DataFrame(columns=["project_name","commit_hash","commit_date","author_name","author_email","committer_name","committer_email"])

def new_row(commit,repo):
    row = {
        "project_name": repo,
        "commit_hash": commit.sha,
        "commit_date": commit.commit.committer.date , # if commit.commit.committer is not None else '',
        "author_name": commit.author.name, # if commit.author is not None else '',
        "author_email": commit.author.email, # if commit.author is not None else '',
        "committer_name": commit.committer.name, # if commit.committer is not None else '',
        "committer_email": commit.committer.email # if commit.committer is not None else ''
    }
    return row

prevlen = 0
# iterating all repositories to retrieve commit history
for repo in final_repo_list:
    # try to create repo if exist
    try:
        repos = g.get_repo(repo.split("https://github.com/")[1])
        print(repos)
        # try to get commits from repo and add to dataframe
        repo_commits = repos.get_commits()
        print(repo_commits.totalCount)
        for commit in repo_commits:
            print(commit)
            all_commits = all_commits.append(new_row(commit,repo),ignore_index=True)

        # print sizes
        print('{} - {} , {}'.format(repo , (len(all_commits) - prevlen), len(all_commits)))
        prevlen = len(all_commits)
    except:
        print('{} -- has problem'.format(repo.split("https://github.com/")[1]))
        continue

Jurily/rust-checked-cast -- has problem
rust-lang/libc -- has problem
sfackler/rust-openssl -- has problem
rust-num/num-traits -- has problem
SimonSapin/rust-typed-arena -- has problem
rust-lang/rustc-serialize -- has problem
rust-lang-nursery/bitflags -- has problem
reem/stainless -- has problem
BurntSushi/byteorder -- has problem
serde-rs/serde -- has problem
contain-rs/bit-vec -- has problem
rust-builder/buildable -- has problem
unicode-rs/unicode-normalization -- has problem
conduit-rust/route-recognizer.rs -- has problem
SimonSapin/rust-std-candidates -- has problem
alexcrichton/openssl-probe -- has problem
rust-lang/glob -- has problem
chris-morgan/anymap -- has problem
brendanzab/approx -- has problem
dguo/strsim-rs -- has problem
emk/abort_on_panic-rs -- has problem
emk/cesu8-rs -- has problem
tomaka/clock_ticks -- has problem
HeroesGrave/cereal -- has problem
freebroccolo/morphism.rs -- has problem
epsilonz/tailrec.rs -- has problem
epsilonz/monad.rs -- has problem
freebroccol

HdrHistogram/HdrHistogram_rust -- has problem
terminalcloud/thrift -- has problem
ZeroCostGoods/procure -- has problem
ANLAB-KAIST/rust-ilog2 -- has problem
tiffany352/susurrus -- has problem
crumblingstatue/rust-imgur -- has problem
iron/params -- has problem
Kerosene2000/random-wheel-rs -- has problem
jmacdonald/bloodhound -- has problem
ANLAB-KAIST/rust-bitalloc -- has problem
ivanceras/codegenta -- has problem
oli-obk/rust-pandoc -- has problem
softprops/atty -- has problem
SiegeLord/SLRConfig -- has problem
johannhof/text-diff.rs -- has problem
Stebalien/acl-sys -- has problem
Stebalien/xattr -- has problem
rustyhorde/barnacl -- has problem
rustyhorde/barnacl-sys -- has problem
rustyhorde/libgitmask -- has problem
contain-rs/linear-map -- has problem
shepmaster/cupid -- has problem
covertness/coap-rs -- has problem
DoumanAsh/clipboard-win -- has problem
emabee/flexi_logger -- has problem
SimonPersson/ease -- has problem
BurntSushi/chan-signal -- has problem
Stebalien/slug-rs -- ha

ustulation/mio -- has problem
andresilva/cask -- has problem
gimli-rs/gimli -- has problem
gtk-rs/gio -- has problem
badboy/signify-rs -- has problem
tov/succinct-rs -- has problem
muktakosh/unicorn -- has problem
tredoe/combid -- has problem
shssoichiro/iron-valid -- has problem
ashleygwilliams/asserts -- has problem
kaegi/bluetooth-serial-port -- has problem
ashleygwilliams/corroder -- has problem
Rahix/eagre-asn1 -- has problem
jramapuram/hal -- has problem
cfallin/rust-immutable-arena -- has problem
Fiedzia/iron-middleware-mysql -- has problem
casey/j -- has problem
softprops/jamal -- has problem
metafetish/libtrancevibe-rs -- has problem
maidsafe/libsodium_seeded_prng -- has problem
andrewjstone/amy -- has problem
OpenSourceOrg/rust-opensource -- has problem
casey/brev -- has problem
sunjay/lion -- has problem
staktrace/mailparse -- has problem
kornelski/rust-rgb -- has problem
phsym/shrust -- has problem
0nkery/celly -- has problem
inejge/env_proxy -- has problem
dsgriffin/freefa

BenWiederhake/phf_mut -- has problem
sfackler/scheduled-thread-pool -- has problem
oyvindln/gzip-header -- has problem
iopq/gtp-parser-generator -- has problem
nsheremet/getsb-cli -- has problem
phaazon/spectra -- has problem
dongsupark/nsutils-rs -- has problem
lemonrock/bearssl-sys -- has problem
cwoodall/stm32f0xx -- has problem
tdolist/tdo-rs -- has problem
RomanAkberov/smaragd -- has problem
Ruin0x11/caca-rs -- has problem
acmumn/stl-bin-parser -- has problem
slide-rs/hibitset -- has problem
mount-research/sentiment -- has problem
tiziano88/scroll-phat-hd-rs -- has problem
tynril/rgoap -- has problem
evestera/json_typegen -- has problem
SecurityInsanity/text-to-polly-ssml -- has problem
inejge/ldap3 -- has problem
jtomschroeder/hammer -- has problem
Matthew-Maclean/safe-builder-derive -- has problem
PistonDevelopers/split_controller -- has problem
jamespole/rust-wireless -- has problem
yuval-k/opc-rust -- has problem
est31/numbers-rs -- has problem
anthonynguyen/dok -- has problem

nickjer/handlebars_switch -- has problem
repi/shadertoy-browser -- has problem
kchmck/cai_cyclic.rs -- has problem
kchmck/moving_avg.rs -- has problem
m4b/lazy_transducer -- has problem
aaron-lebo/amigo -- has problem
totem3/ofuton -- has problem
rustyhorde/repomon -- has problem
kpcyrd/tr1pd -- has problem
zovt/anne -- has problem
tversteeg/rocket-game -- has problem
kchmck/iq_osc.rs -- has problem
asonix/tokio-zmq -- has problem
mwylde/rumble -- has problem
Mic92/cntr-nix -- has problem
Mic92/cntr-fuse -- has problem
quadrupleslap/x264 -- has problem
jit-y/spreadsheet_textconv -- has problem
lakelezz/hey_listen -- has problem
ensc/unix-fd -- has problem
Debily/fuzzmutator-rs -- has problem
trezm/fanta -- has problem
anglerud/hootie -- has problem
rust-dsp/rust-vst -- has problem
Pistahh/ostdl -- has problem
fschutt/simd-runtime-check-x64 -- has problem
etairi/sidh-rs -- has problem
threatstack/shush -- has problem
adwhit/diesel-derive-enum -- has problem
grahame/dividebatur2 -- has p

# Method 2 
with subprocess the downside is that we have to clone the repository before running get log

In [25]:
#For first time

# a dataframe to store commit data                                
all_commits = pandas.DataFrame(columns=["project_name","commit_hash","commit_date","author_name","author_email","committer_name","committer_email"])

# a dataframe to store commit data                                
processed_repos = pandas.DataFrame(columns=["project_name"])
not_processed_repos = pandas.DataFrame(columns=["project_name"])

In [26]:
#if there is a file and some repos had been processed before

all_commits = pandas.read_csv('../data/commitlogs1.csv.gz')
processed_repos = pandas.read_csv('../data/processedrepos.csv')
#not_processed_repos = pandas.read_csv('../data/notprocessedrepos.csv')
all_commits.shape

(925649, 7)

In [27]:
# creates new row of dataframe
def newrow(row,project):
    rowdata ={
        "project_name": project,
        "commit_hash": row[0],
        "commit_date": row[1],
        "author_name": row[2],
        "author_email": row[3],
        "committer_name": row[4],
        "committer_email": row[5] }
    return rowdata
    
# get log of a spesific repo
def get_repo_commits(repo):
    
    #clone repository to a bare repo to work with gitlog
    repo_name = repo.split('/')[3] + repo.split('/')[4]
    commands = ["git", "clone" ,"--bare", repo , '.\\tempclone\\'+repo_name]
    process = subprocess.Popen(commands,stdout=subprocess.PIPE)
    output = process.communicate()[0]

    #get git log result by subprocess
    commands = ["git", "log" , '--pretty=format:%H";"%ai";"%an";"%ae";"%cn";"%ce:%#$GLZDH']
    process = subprocess.Popen(commands,  cwd=r'.\\tempclone\\'+repo_name ,stdout=subprocess.PIPE)
    output = process.communicate()[0]
    output = str(output)
    
    return output

# read commit data from gitlog result and add it to dataframe
def add_to_dataset(commits,project,df):
    #find the main text from git log commit string
    commits = commits[2:len(commits)-10]
    for commit in commits.split(':%#$GLZDH\\n'):
        #if commit length is 0
        if len(commit) == 0:
            return df

        #split to find the parts and if there is problem with some parts we return 
        rowdata = commit.split('";"')
        if len(rowdata) < 6:
            print(rowdata)
            return df

        #append data to dataframe
        df = df.append(newrow(rowdata,project),ignore_index=True)
        
    return df

prevlen = 0

# iterating all repositories to retrieve commit history
for repo in final_repo_list:
    if repo in processed_repos.project_name.unique():
        continue
        
    try:
        all_commits = add_to_dataset(get_repo_commits(repo),repo,all_commits)
        processed_repos = processed_repos.append({"project_name": repo},ignore_index=True)
        
        print('{} - {} , {}'.format(repo , (len(all_commits) - prevlen), len(all_commits)))
        prevlen = len(all_commits)
    except Exception as e:
        not_processed_repos  = not_processed_repos.append({"project_name": repo},ignore_index=True)
        print('problem fetching : ' + repo)
        #print("problem processing repo: " + repo)

problem fetching : https://github.com/Jurily/rust-checked-cast
problem fetching : https://github.com/daggerbot/mm_math
problem fetching : https://github.com/qrlpx/qdowncast
problem fetching : https://github.com/alfiedotwtf/file-lock
problem fetching : https://github.com/pixel27/shareable
problem fetching : https://github.com/LinusU/rust-emoji-commit-type
problem fetching : https://github.com/clarcharr/extra-default
problem fetching : https://github.com/Daggerbot/aurum
problem fetching : https://github.com/kchmck/subslice_index
problem fetching : https://github.com/TeXitoi/par-iterator-type
problem fetching : https://github.com/WatchDG/rust-converter
problem fetching : https://github.com/ZakCodes/fast_io
problem fetching : https://github.com/purpliminal/rust-iterslide
problem fetching : https://github.com/Jurily/rust-allocator
problem fetching : https://github.com/purpliminal/rust-dotenv
problem fetching : https://github.com/jxny/julius-rs
problem fetching : https://github.com/jemcroft/

problem fetching : https://github.com/vtduncan/canonical_json
problem fetching : https://github.com/clarcharr/multistr
problem fetching : https://github.com/andete/philips_hue_client
problem fetching : https://github.com/lambdastackio/http2hpack
problem fetching : https://github.com/fwrs/sigil
problem fetching : https://github.com/kdy1997/must
problem fetching : https://github.com/clarcharr/parse-hosts
problem fetching : https://github.com/clarcharr/swc-hosts
problem fetching : https://github.com/liamstask/exif-rs
problem fetching : https://github.com/afonso360/discogs-rs
problem fetching : https://github.com/sebasgarcep/crates-api
problem fetching : https://github.com/heartsh/cpr
problem fetching : https://github.com/kofron/ophir-rs
problem fetching : https://github.com/quadrupleslap/vice
problem fetching : https://github.com/nfjinjing/mtcp
problem fetching : https://github.com/ramn/currency-rs
problem fetching : https://github.com/AlexanderThaller/csvstore
problem fetching : https://

https://github.com/crawford/efm32gg11b820 - 15 , 897783
https://github.com/gnzlbg/slice_deque - 145 , 897928
https://github.com/phil-opp/bootimage - 114 , 898042
https://github.com/chocol4te/stm32f411xx - 2 , 898044
https://github.com/ioncodes/sam - 65 , 898109
https://github.com/valeriansaliou/vigil - 122 , 898231
https://github.com/Erk-/spotrust - 22 , 898253
https://github.com/1aim/rust-efr32x12p - 4 , 898257
https://github.com/germangb/steam-audio-sys - 27 , 898284
https://github.com/azuqua/wombo.rs - 10 , 898294
https://github.com/1aim/rust-efr32xg12p - 4 , 898298
https://github.com/jsonnull/rust-wasm - 32 , 898330
https://github.com/easy-semver/calcver-rs - 41 , 898371
https://github.com/sharkdp/hyperfine - 161 , 898532
https://github.com/FauxFaux/fapt - 268 , 898800
https://github.com/rusticata/x509-parser - 39 , 898839
https://github.com/dcrewi/typenum-prime-rs - 7 , 898846
https://github.com/astro/tokio-xmpp - 89 , 898935
problem fetching : https://github.com/rhysd/uni
https:/

https://github.com/rando-rs/rando.rs - 14 , 907505
https://github.com/vitiral/termstyle - 7 , 907512
https://github.com/kyledunne/rug2d - 22 , 907534
https://github.com/sile/fibers_inotify - 17 , 907551
https://github.com/vvilhonen/hyper-socks-async - 13 , 907564
https://github.com/awestlake87/sc2-rs - 293 , 907857
https://github.com/kdy1/rust-pmutil - 9 , 907866
problem fetching : https://github.com/stephaneyfx/enum-iterator-derive
https://github.com/strake/ft2.rs - 372 , 908238
https://github.com/fitzgen/wasm-snip - 35 , 908273
https://github.com/d-e-s-o/dictcc-cli - 34 , 908307
https://github.com/calavera/netlify-toml-rs - 2 , 908309
https://github.com/valarauca/system_dns - 3 , 908312
https://github.com/nabijaczleweli/registry.pol-rs - 11 , 908323
https://github.com/manuel-rhdt/harfbuzz_rs - 101 , 908424
https://github.com/sgeisler/cargo-remote - 26 , 908450
https://github.com/Patryk27/pwr-airly - 17 , 908467
https://github.com/despawnerer/truecase - 51 , 908518
https://github.com/

https://github.com/Tommoa/rs-ipc - 5 , 919672
https://github.com/polachok/iata-types - 10 , 919682
https://github.com/andete/quick_atom - 9 , 919691
https://github.com/polachok/sirena-types - 7 , 919698
https://github.com/jamesmunns/nrf52-hal - 31 , 919729
https://github.com/fredrikroos/graf - 5 , 919734
https://github.com/vityafx/serde-aux - 14 , 919748
https://github.com/Pirh/dymod - 10 , 919758
https://github.com/mgattozzi/lift-fail - 1 , 919759
https://github.com/kpcyrd/rshijack - 27 , 919786
https://github.com/csssuf/pretty-good - 69 , 919855
https://github.com/DuckLogic/TwoSidedVec - 9 , 919864
https://github.com/someguynamedmatt/initial_conditions - 65 , 919929
https://github.com/advancedresearch/pocket_prover-set - 9 , 919938
https://github.com/AprliRainkun/copra - 60 , 919998
https://github.com/rust-bio/rust-bio-tools - 91 , 920089
https://github.com/benashford/lwactors - 15 , 920104
https://github.com/OneSignal/zk-4lw - 2 , 920106
https://github.com/coderbot16/cemconv - 11 , 

In [16]:
ds = all_unique_dependencies.groupby('target').size().reset_index(name='counts').sort_values('counts').tail(100)[::-1]
all_unique_dependencies[all_unique_dependencies['target'].isin(ds['target'])].groupby('target_url').size().reset_index(name='counts').sort_values('counts').tail(100)[::-1].to_csv('../data/selected_repos.csv')

In [76]:
selected = all_commits.groupby('project_name').size().reset_index(name='counts').sort_values('counts').tail(100)[::-1]
selected = all_unique_dependencies[all_unique_dependencies['target'].isin(ds['target'])].groupby('target_url').size().reset_index(name='counts').sort_values('counts').tail(100)[::-1]