# Exploring Data
This notebook is aimed to explore cargo repo data

In [1]:
import pandas
import matplotlib
import seaborn
import collections
import itertools

import sys
sys.path.append('..')

from helper import load_data , load_repo

%matplotlib inline

# Load Cargo

In [2]:
cargo = load_repo('Cargo')

In [3]:
cargo_git = cargo[cargo.Repository_URL.notnull()]
cargo_git = cargo_git[cargo_git.Repository_URL.str.contains('github')]
cargo_git = cargo_git.drop_duplicates(subset='Repository_URL')

# Load Dependencies

In [4]:
packages, dependencies = load_data('Cargo')

In [5]:
packages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82335 entries, 0 to 82336
Data columns (total 3 columns):
package    82335 non-null object
version    82335 non-null object
date       82335 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 2.5+ MB


All unique cargo packages and adding a column as package-version combination

In [6]:
unique_cargo_packages = packages.drop_duplicates(subset='package',keep = 'last')
unique_cargo_packages['pack_ver'] = unique_cargo_packages['package']+unique_cargo_packages['version']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


All dependencies

In [7]:
dependencies['pack_ver'] = dependencies['package']+dependencies['version']
all_unique_dependencies = dependencies[dependencies['pack_ver'].isin(unique_cargo_packages['pack_ver'])]
all_unique_dependencies = all_unique_dependencies.drop('pack_ver',axis=1).reset_index().drop('index',axis=1)

In [8]:
all_unique_dependencies.head(10)

Unnamed: 0,package,version,target,constraint
0,acacia,0.1.2,itertools,^0.5
1,acacia,0.1.2,num,^0.1
2,acacia,0.1.2,rand,^0.3
3,aio,0.0.1,event,*
4,aio,0.0.1,mio,*
5,aio,0.0.1,nix,*
6,advapi32-sys,0.2.0,winapi,^0.2.5
7,alfred,4.0.1,serde_json,^1.0
8,algebloat,0.0.8,algebloat_macros,= 0.0.8
9,allegro_acodec,0.0.8,allegro_acodec-sys,= 0.0.8


# Related Git URLs
finding git url that each package contributor communicates with 

In [9]:
repos = cargo_git[['Name','Repository_URL']].set_index('Name')
repos.head()

Unnamed: 0_level_0,Repository_URL
Name,Unnamed: 1_level_1
acacia,https://github.com/aepsil0n/acacia
abort_on_panic,https://github.com/emk/abort_on_panic-rs
aio,https://github.com/reem/rust-aio
advapi32-sys,https://github.com/retep998/winapi-rs
alfred,https://github.com/kballard/alfred-rs


In [10]:
ser = all_unique_dependencies['target']
all_unique_dependencies['target_url'] = repos.loc[all_unique_dependencies['target']].reset_index()[['Repository_URL']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


For each package which github repositories data should be retrieved

In [11]:
all_unique_dependencies.head(30)

Unnamed: 0,package,version,target,constraint,target_url
0,acacia,0.1.2,itertools,^0.5,https://github.com/bluss/rust-itertools
1,acacia,0.1.2,num,^0.1,https://github.com/rust-num/num
2,acacia,0.1.2,rand,^0.3,https://github.com/rust-lang-nursery/rand
3,aio,0.0.1,event,*,https://github.com/reem/rust-event
4,aio,0.0.1,mio,*,https://github.com/carllerche/mio
5,aio,0.0.1,nix,*,https://github.com/nix-rust/nix
6,advapi32-sys,0.2.0,winapi,^0.2.5,
7,alfred,4.0.1,serde_json,^1.0,https://github.com/serde-rs/json
8,algebloat,0.0.8,algebloat_macros,= 0.0.8,
9,allegro_acodec,0.0.8,allegro_acodec-sys,= 0.0.8,


# unique repositories
to find all unique repository addresses

In [12]:
final_repo_list = all_unique_dependencies.dropna(subset=['target_url'], how='all').target_url.unique()

# Method 1 
with subprocess the downside is that we have to clone the repository before running get log

In [None]:
import subprocess

# creates new row of dataframe
def newrow(row,project):
    rowdata ={
        "project_name": project,
        "commit_hash": row[0],
        "commit_date": row[1],
        "author_name": row[2],
        "author_email": row[3],
        "committer_name": row[4],
        "committer_email": row[5] }
    return rowdata
    
# get log of a spesific repo
def get_repo_commits(repo):
    
    #clone repository to a bare repo to work with gitlog
    repo_name = repo.split('/')[3] + repo.split('/')[4]
    commands = ["git", "clone" ,"--bare", repo , '.\\tempclone\\'+repo_name]
    process = subprocess.Popen(commands,stdout=subprocess.PIPE)
    output = process.communicate()[0]

    #get git log result by subprocess
    commands = ["git", "log" , "--pretty=format:%H;-):%ai;-):%an;-):%ae;-):%cn;-):%ce@:#@/"]
    process = subprocess.Popen(commands,  cwd=r'.\\tempclone\\'+repo_name ,stdout=subprocess.PIPE)
    output = process.communicate()[0]
    output = str(output)
    return output

# read commit data from gitlog result and add it to dataframe
def add_to_dataset(commits,project,df):
    for commit in commits.split("@:#@/\\"):
        #if commit length is 0
        if len(commit) == 0:
            return df
        
        #find the main text from git log commit string
        row = commit[2:len(commit)-1]
        
        #split to find the parts and if there is problem with some parts we return 
        rowdata = row.split(';-):')
        if len(rowdata) < 6:
            print(rowdata)
            return df

        #append data to dataframe
        df = df.append(newrow(rowdata,project),ignore_index=True)
        
    return df

# a dataframe to store commit data                                
all_commits = pandas.DataFrame(columns=["project_name","commit_hash","commit_date","author_name","author_email","committer_name","committer_email"])

prevlen = 0

# iterating all repositories to retrieve commit history
for repo in final_repo_list:
    try:
        all_commits = add_to_dataset(get_repo_commits(repo),repo,all_commits)
    except:
        print("except")
        
    print('{} - {} , {}'.format(repo , (len(all_commits) - prevlen), len(all_commits)))
    prevlen = len(all_commits)
    
    # pauses process for 2 second although its not necessary
    matplotlib.pyplot.pause(2)
    
    
all_commits.to_csv('../data/commitlogs.csv')

https://github.com/bluss/rust-itertools - 1026 , 1026
https://github.com/rust-num/num - 736 , 1762


# Method 2 
with pygithub the downside is that it takes to much time to (although data for each repository there is just one request for all commit history from Github.com)

In [None]:
from github import Github

g = Github("096ba2a94d1339614995ca0f8b1de94211c34705")

all_commits = pandas.DataFrame(columns=["project_name","commit_hash","commit_date","author_name","author_email","committer_name","committer_email"])

def new_row(commit,repo):
    row = {
        "project_name": repo,
        "commit_hash": commit.sha,
        "commit_date": commit.commit.committer.date , # if commit.commit.committer is not None else '',
        "author_name": commit.author.name, # if commit.author is not None else '',
        "author_email": commit.author.email, # if commit.author is not None else '',
        "committer_name": commit.committer.name, # if commit.committer is not None else '',
        "committer_email": commit.committer.email # if commit.committer is not None else ''
    }
    return row

prevlen = 0
# iterating all repositories to retrieve commit history
for repo in final_repo_list:
    # try to create repo if exist
    try:
        repos = g.get_repo(repo.split("https://github.com/")[1])
    except:
        print('{} -- has problem'.format(repo.split("https://github.com/")[1]))
        continue
        
    print(repos)
    # try to get commits from repo and add to dataframe
    repo_commits = repos.get_commits()
    print(repo_commits.totalCount)
    for commit in repo_commits:
        print(commit)
        all_commits = all_commits.append(new_row(commit,repo),ignore_index=True)
        
    # print sizes
    print('{} - {} , {}'.format(repo , (len(all_commits) - prevlen), len(all_commits)))
    prevlen = len(all_commits)

bluss/rust-itertools -- has problem
rust-num/num -- has problem
rust-lang-nursery/rand -- has problem
reem/rust-event -- has problem
carllerche/mio -- has problem
nix-rust/nix -- has problem
serde-rs/json -- has problem
SiegeLord/RustAllegro -- has problem
Jurily/rust-checked-cast -- has problem
rust-lang/libc -- has problem
sfackler/rust-openssl -- has problem
servo/rust-url -- has problem
rust-lang/log -- has problem
sebasmagri/env_logger -- has problem
Antti/rust-amq-proto -- has problem
rust-lang-nursery/lazy-static.rs -- has problem
rust-lang/time -- has problem
rust-num/num-traits -- has problem
SimonSapin/rust-typed-arena -- has problem
rust-lang/rustc-serialize -- has problem
rust-lang-nursery/bitflags -- has problem
reem/stainless -- has problem
BurntSushi/byteorder -- has problem
serde-rs/serde -- has problem
blas-lapack-rs/blas-sys -- has problem
contain-rs/bit-vec -- has problem
iron/iron -- has problem
iron/persistent -- has problem
reem/rust-plugin -- has problem
Stebalie