# Exploring Data
This notebook is aimed to explore cargo repo data

In [1]:
import pandas
import matplotlib
import seaborn
import collections
import itertools
import numpy as np
import warnings
import subprocess


import sys
sys.path.append('..')

from helper import load_data , load_repo
warnings.filterwarnings('ignore')

%matplotlib inline

# Load Cargo

In [2]:
cargo = load_repo('Cargo')

In [3]:
cargo_git = cargo[cargo.Repository_URL.notnull()]
cargo_git = cargo_git[cargo_git.Repository_URL.str.contains('github')]
cargo_git = cargo_git.drop_duplicates(subset='Repository_URL')

# Load Dependencies

In [4]:
packages, dependencies = load_data('Cargo')
dependencies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 302834 entries, 0 to 302833
Data columns (total 4 columns):
package       302834 non-null object
version       302834 non-null object
target        302834 non-null object
constraint    302834 non-null object
dtypes: object(4)
memory usage: 11.6+ MB


All unique cargo packages and adding a column as package-version combination

In [5]:
unique_cargo_packages = packages.drop_duplicates(subset='package',keep = 'last')
unique_cargo_packages['pack_ver'] = unique_cargo_packages['package']+unique_cargo_packages['version']
unique_cargo_packages.head(10)

Unnamed: 0,package,version,date,pack_ver
0,acorn,1.0.0,2014-11-21 01:13:02,acorn1.0.0
1,a,0.0.1,2014-11-21 00:06:54,a0.0.1
9,acacia,0.1.2,2017-02-26 16:14:51,acacia0.1.2
19,abort_on_panic,2.0.0,2017-11-29 21:58:03,abort_on_panic2.0.0
20,adamantium,0.0.1,2014-11-21 06:48:26,adamantium0.0.1
21,aio,0.0.1,2015-01-04 21:30:57,aio0.0.1
29,advapi32-sys,0.2.0,2016-02-08 01:05:06,advapi32-sys0.2.0
49,alfred,4.0.1,2017-11-13 19:46:50,alfred4.0.1
62,algebloat,0.0.8,2015-04-17 03:48:15,algebloat0.0.8
75,algebloat_macros,0.0.8,2015-04-17 03:48:07,algebloat_macros0.0.8


All dependencies

In [6]:
dependencies['pack_ver'] = dependencies['package']+dependencies['version']
all_unique_dependencies = dependencies[dependencies['pack_ver'].isin(unique_cargo_packages['pack_ver'])]
all_unique_dependencies = all_unique_dependencies.drop('pack_ver',axis=1).reset_index().drop('index',axis=1)

In [7]:
all_unique_dependencies.head(10)

Unnamed: 0,package,version,target,constraint
0,acacia,0.1.2,itertools,^0.5
1,acacia,0.1.2,num,^0.1
2,acacia,0.1.2,rand,^0.3
3,aio,0.0.1,event,*
4,aio,0.0.1,mio,*
5,aio,0.0.1,nix,*
6,advapi32-sys,0.2.0,winapi,^0.2.5
7,alfred,4.0.1,serde_json,^1.0
8,algebloat,0.0.8,algebloat_macros,= 0.0.8
9,allegro_acodec,0.0.8,allegro_acodec-sys,= 0.0.8


# Related Git URLs
finding git url that each package contributor communicates with 

In [8]:
repos = cargo_git[['Name','Repository_URL']].set_index('Name')
repos.head()

Unnamed: 0_level_0,Repository_URL
Name,Unnamed: 1_level_1
acacia,https://github.com/aepsil0n/acacia
abort_on_panic,https://github.com/emk/abort_on_panic-rs
aio,https://github.com/reem/rust-aio
advapi32-sys,https://github.com/retep998/winapi-rs
alfred,https://github.com/kballard/alfred-rs


For each package which github repositories data should be retrieved

In [9]:
all_unique_dependencies['target_url'] = repos.loc[all_unique_dependencies['target']].reset_index()[['Repository_URL']]
all_unique_dependencies.head()

Unnamed: 0,package,version,target,constraint,target_url
0,acacia,0.1.2,itertools,^0.5,https://github.com/bluss/rust-itertools
1,acacia,0.1.2,num,^0.1,https://github.com/rust-num/num
2,acacia,0.1.2,rand,^0.3,https://github.com/rust-lang-nursery/rand
3,aio,0.0.1,event,*,https://github.com/reem/rust-event
4,aio,0.0.1,mio,*,https://github.com/carllerche/mio


Dependency origins’ repositories

In [10]:
all_self_repo = repos.loc[all_unique_dependencies['package']].reset_index().Repository_URL.unique()

Dependent package repositories

In [11]:
all_depend_repo = all_unique_dependencies.dropna(subset=['target_url'], how='all').target_url.unique()

# unique repositories
to find all unique repository addresses

In [12]:
all_repos = np.append(all_depend_repo,all_self_repo)
df = pandas.DataFrame(all_repos,columns=['repository'])
final_repo_list = df.drop_duplicates(keep = 'last').dropna().reset_index().drop('index',axis=1).repository.unique()
len(final_repo_list)

7971

# Method 1 
With pygithub the downside is that it takes to much time to (although as I tracked with fiddler, for each repository there is just one request for all commit history from Github.com)

In [None]:
from github import Github

#g = Github("git key")

all_commits = pandas.DataFrame(columns=["project_name","commit_hash","commit_date","author_name","author_email","committer_name","committer_email"])

def new_row(commit,repo):
    row = {
        "project_name": repo,
        "commit_hash": commit.sha,
        "commit_date": commit.commit.committer.date , # if commit.commit.committer is not None else '',
        "author_name": commit.author.name, # if commit.author is not None else '',
        "author_email": commit.author.email, # if commit.author is not None else '',
        "committer_name": commit.committer.name, # if commit.committer is not None else '',
        "committer_email": commit.committer.email # if commit.committer is not None else ''
    }
    return row

prevlen = 0
# iterating all repositories to retrieve commit history
for repo in final_repo_list:
    # try to create repo if exist
    try:
        repos = g.get_repo(repo.split("https://github.com/")[1])
        print(repos)
        # try to get commits from repo and add to dataframe
        repo_commits = repos.get_commits()
        print(repo_commits.totalCount)
        for commit in repo_commits:
            print(commit)
            all_commits = all_commits.append(new_row(commit,repo),ignore_index=True)

        # print sizes
        print('{} - {} , {}'.format(repo , (len(all_commits) - prevlen), len(all_commits)))
        prevlen = len(all_commits)
    except:
        print('{} -- has problem'.format(repo.split("https://github.com/")[1]))
        continue

# Method 2 
with subprocess the downside is that we have to clone the repository before running get log

In [18]:
#For first time

# a dataframe to store commit data                                
all_commits = pandas.DataFrame(columns=["project_name","commit_hash","commit_date","author_name","author_email","committer_name","committer_email"])

# a dataframe to store commit data                                
processed_repos = pandas.DataFrame(columns=["project_name"])

In [13]:
#if there is a file and some repos had been processed before

all_commits = pandas.read_csv('../data/commitlogs.csv')
processed_repos = pandas.read_csv('../data/processedrepos.csv')
all_commits.shape

(529975, 7)

In [None]:
# creates new row of dataframe
def newrow(row,project):
    rowdata ={
        "project_name": project,
        "commit_hash": row[0],
        "commit_date": row[1],
        "author_name": row[2],
        "author_email": row[3],
        "committer_name": row[4],
        "committer_email": row[5] }
    return rowdata
    
# get log of a spesific repo
def get_repo_commits(repo):
    
    #clone repository to a bare repo to work with gitlog
    repo_name = repo.split('/')[3] + repo.split('/')[4]
    commands = ["git", "clone" ,"--bare", repo , '.\\tempclone\\'+repo_name]
    process = subprocess.Popen(commands,stdout=subprocess.PIPE)
    output = process.communicate()[0]

    #get git log result by subprocess
    commands = ["git", "log" , '--pretty=format:%H";"%ai";"%an";"%ae";"%cn";"%ce:%#$GLZDH']
    process = subprocess.Popen(commands,  cwd=r'.\\tempclone\\'+repo_name ,stdout=subprocess.PIPE)
    output = process.communicate()[0]
    output = str(output)
    
    return output

# read commit data from gitlog result and add it to dataframe
def add_to_dataset(commits,project,df):
    #find the main text from git log commit string
    commits = commits[2:len(commits)-10]
    for commit in commits.split(':%#$GLZDH\\n'):
        #if commit length is 0
        if len(commit) == 0:
            return df

        #split to find the parts and if there is problem with some parts we return 
        rowdata = commit.split('";"')
        if len(rowdata) < 6:
            print(rowdata)
            return df

        #append data to dataframe
        df = df.append(newrow(rowdata,project),ignore_index=True)
        
    return df

prevlen = 0

# iterating all repositories to retrieve commit history
for repo in final_repo_list:
    if repo in processed_repos.project_name.unique():
        continue
        
    try:
        all_commits = add_to_dataset(get_repo_commits(repo),repo,all_commits)
        processed_repos = processed_repos.append({"project_name": repo},ignore_index=True)
        
        print('{} - {} , {}'.format(repo , (len(all_commits) - prevlen), len(all_commits)))
        prevlen = len(all_commits)
    except Exception as e:
        print('Failed to upload to ftp: '+ str(e))
        #print("problem processing repo: " + repo)

Failed to upload to ftp: [WinError 267] The directory name is invalid
Failed to upload to ftp: [WinError 267] The directory name is invalid
Failed to upload to ftp: [WinError 267] The directory name is invalid
Failed to upload to ftp: [WinError 267] The directory name is invalid
Failed to upload to ftp: [WinError 267] The directory name is invalid
Failed to upload to ftp: [WinError 267] The directory name is invalid
Failed to upload to ftp: [WinError 267] The directory name is invalid
Failed to upload to ftp: [WinError 267] The directory name is invalid
Failed to upload to ftp: [WinError 267] The directory name is invalid
Failed to upload to ftp: [WinError 267] The directory name is invalid
Failed to upload to ftp: [WinError 267] The directory name is invalid
Failed to upload to ftp: [WinError 267] The directory name is invalid
Failed to upload to ftp: [WinError 267] The directory name is invalid
https://github.com/cardoe/oxerun - 529975 , 529975
https://github.com/reem/rust-aio - 35 ,