# Exploring Data
This notebook is aimed to explore cargo repo data

In [1]:
import pandas
import matplotlib
import seaborn
import collections
import itertools

import sys
sys.path.append('..')

from helper import load_data , load_repo

%matplotlib inline

# Load Cargo

In [2]:
cargo = load_repo('Cargo')

In [3]:
cargo_git = cargo[cargo.Repository_URL.notnull()]
cargo_git = cargo_git[cargo_git.Repository_URL.str.contains('github')]
cargo_git = cargo_git.drop_duplicates(subset='Repository_URL')

# Load Dependencies

In [4]:
packages, dependencies = load_data('Cargo')

In [5]:
packages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82335 entries, 0 to 82336
Data columns (total 3 columns):
package    82335 non-null object
version    82335 non-null object
date       82335 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 2.5+ MB


All unique cargo packages and adding a column as package-version combination

In [6]:
unique_cargo_packages = packages.drop_duplicates(subset='package',keep = 'last')
unique_cargo_packages['pack_ver'] = unique_cargo_packages['package']+unique_cargo_packages['version']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


All dependencies

In [7]:
dependencies['pack_ver'] = dependencies['package']+dependencies['version']
all_unique_dependencies = dependencies[dependencies['pack_ver'].isin(unique_cargo_packages['pack_ver'])]
all_unique_dependencies = all_unique_dependencies.drop('pack_ver',axis=1).reset_index().drop('index',axis=1)

In [8]:
all_unique_dependencies.head(10)

Unnamed: 0,package,version,target,constraint
0,acacia,0.1.2,itertools,^0.5
1,acacia,0.1.2,num,^0.1
2,acacia,0.1.2,rand,^0.3
3,aio,0.0.1,event,*
4,aio,0.0.1,mio,*
5,aio,0.0.1,nix,*
6,advapi32-sys,0.2.0,winapi,^0.2.5
7,alfred,4.0.1,serde_json,^1.0
8,algebloat,0.0.8,algebloat_macros,= 0.0.8
9,allegro_acodec,0.0.8,allegro_acodec-sys,= 0.0.8


# Related Git URLs
finding git url that each package contributor communicates with 

In [9]:
repos = cargo_git[['Name','Repository_URL']].set_index('Name')
repos.head()

Unnamed: 0_level_0,Repository_URL
Name,Unnamed: 1_level_1
acacia,https://github.com/aepsil0n/acacia
abort_on_panic,https://github.com/emk/abort_on_panic-rs
aio,https://github.com/reem/rust-aio
advapi32-sys,https://github.com/retep998/winapi-rs
alfred,https://github.com/kballard/alfred-rs


In [10]:
ser = all_unique_dependencies['target']
all_unique_dependencies['target_url'] = repos.loc[all_unique_dependencies['target']].reset_index()[['Repository_URL']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


For each package which github repositories data should be retrieved

In [11]:
all_unique_dependencies.head(30)

Unnamed: 0,package,version,target,constraint,target_url
0,acacia,0.1.2,itertools,^0.5,https://github.com/bluss/rust-itertools
1,acacia,0.1.2,num,^0.1,https://github.com/rust-num/num
2,acacia,0.1.2,rand,^0.3,https://github.com/rust-lang-nursery/rand
3,aio,0.0.1,event,*,https://github.com/reem/rust-event
4,aio,0.0.1,mio,*,https://github.com/carllerche/mio
5,aio,0.0.1,nix,*,https://github.com/nix-rust/nix
6,advapi32-sys,0.2.0,winapi,^0.2.5,
7,alfred,4.0.1,serde_json,^1.0,https://github.com/serde-rs/json
8,algebloat,0.0.8,algebloat_macros,= 0.0.8,
9,allegro_acodec,0.0.8,allegro_acodec-sys,= 0.0.8,


# unique repositories
to find all unique repository addresses

In [12]:
final_repo_list = all_unique_dependencies.dropna(subset=['target_url'], how='all').target_url.unique()

# Method 1 
With pygithub the downside is that it takes to much time to (although as I tracked with fiddler, for each repository there is just one request for all commit history from Github.com)

In [None]:
from github import Github

g = Github("f01369c3dd13beb4d69f9ee115f9835e30002d67")

all_commits = pandas.DataFrame(columns=["project_name","commit_hash","commit_date","author_name","author_email","committer_name","committer_email"])

def new_row(commit,repo):
    row = {
        "project_name": repo,
        "commit_hash": commit.sha,
        "commit_date": commit.commit.committer.date , # if commit.commit.committer is not None else '',
        "author_name": commit.author.name, # if commit.author is not None else '',
        "author_email": commit.author.email, # if commit.author is not None else '',
        "committer_name": commit.committer.name, # if commit.committer is not None else '',
        "committer_email": commit.committer.email # if commit.committer is not None else ''
    }
    return row

prevlen = 0
# iterating all repositories to retrieve commit history
for repo in final_repo_list:
    # try to create repo if exist
    try:
        repos = g.get_repo(repo.split("https://github.com/")[1])
    except:
        print('{} -- has problem'.format(repo.split("https://github.com/")[1]))
        continue
        
    print(repos)
    # try to get commits from repo and add to dataframe
    repo_commits = repos.get_commits()
    print(repo_commits.totalCount)
    for commit in repo_commits:
        print(commit)
        all_commits = all_commits.append(new_row(commit,repo),ignore_index=True)
        
    # print sizes
    print('{} - {} , {}'.format(repo , (len(all_commits) - prevlen), len(all_commits)))
    prevlen = len(all_commits)

## Method 2 
with subprocess the downside is that we have to clone the repository before running get log

In [13]:
import subprocess

# creates new row of dataframe
def newrow(row,project):
    rowdata ={
        "project_name": project,
        "commit_hash": row[0],
        "commit_date": row[1],
        "author_name": row[2],
        "author_email": row[3],
        "committer_name": row[4],
        "committer_email": row[5] }
    return rowdata
    
# get log of a spesific repo
def get_repo_commits(repo):
    
    #clone repository to a bare repo to work with gitlog
    repo_name = repo.split('/')[3] + repo.split('/')[4]
    commands = ["git", "clone" ,"--bare", repo , '.\\tempclone\\'+repo_name]
    process = subprocess.Popen(commands,stdout=subprocess.PIPE)
    output = process.communicate()[0]

    #get git log result by subprocess
    commands = ["git", "log" , '--pretty=format:%H";"%ai";"%an";"%ae";"%cn";"%ce:%#$GLZDH']
    process = subprocess.Popen(commands,  cwd=r'.\\tempclone\\'+repo_name ,stdout=subprocess.PIPE)
    output = process.communicate()[0]
    output = str(output)
    
    return output

# read commit data from gitlog result and add it to dataframe
def add_to_dataset(commits,project,df):
    #find the main text from git log commit string
    commits = commits[2:len(commits)-10]
    for commit in commits.split(':%#$GLZDH\\n'):
        #if commit length is 0
        if len(commit) == 0:
            return df

        #split to find the parts and if there is problem with some parts we return 
        rowdata = commit.split('";"')
        if len(rowdata) < 6:
            print(rowdata)
            return df

        #append data to dataframe
        df = df.append(newrow(rowdata,project),ignore_index=True)
        
    return df

# a dataframe to store commit data                                
all_commits = pandas.DataFrame(columns=["project_name","commit_hash","commit_date","author_name","author_email","committer_name","committer_email"])

prevlen = 0

# iterating all repositories to retrieve commit history
for repo in final_repo_list:
    try:
        all_commits = add_to_dataset(get_repo_commits(repo),repo,all_commits)
    except:
        print("except")
        
    print('{} - {} , {}'.format(repo , (len(all_commits) - prevlen), len(all_commits)))
    prevlen = len(all_commits)
    
    if len(all_commits) >4000:
        break
        
    # pauses process for 2 second although its not necessary
    matplotlib.pyplot.pause(2)
    

https://github.com/bluss/rust-itertools - 1026 , 1026
https://github.com/rust-num/num - 736 , 1762
https://github.com/rust-lang-nursery/rand - 1860 , 3622
https://github.com/reem/rust-event - 59 , 3681
https://github.com/carllerche/mio - 664 , 4345


In [None]:
all_commits.head()
all_commits.to_csv('../data/commitlogs.csv')

In [None]:
import requests
all_comments = pandas.DataFrame(columns=["commit_hash","user_name","user_type","comment_author_assoc","comment_created_at"])
not_processed = list()

In [None]:
for i in range(0,len(all_commits)):
    
    if(all_commits.loc[i].commit_hash in all_comments.commit_hash):
        continue
    
    # Create comment url + client info to increase the request limit
    comment_url = 'https://api.github.com/repos/{}/{}/commits/{}/comments?client_id={}&client_secret={}'.format(
        all_commits.loc[i].project_name.split('/')[3] ,
        all_commits.loc[i].project_name.split('/')[4] ,
        all_commits.loc[i].commit_hash ,
        'ClientID' , 'ClientSecret')
    
    # try to get request
    try:
        req = requests.get(comment_url)
        print(req.headers['X-RateLimit-Remaining'])
    except:
        not_processed.append(comment_url)
    
    # if request was successful read data
    data = req.json()
    
    # if data is empty
    if data == []:
        all_comments = all_comments.append({
            "commit_hash": all_commits.loc[i].commit_hash,
            "user_name": '',
            "user_type": '',
            "comment_author_assoc": '',
            "comment_created_at": '' },ignore_index=True)
    else:
        for comment in data:
            all_comments = all_comments.append({
                "commit_hash": comment['commit_id'],
                "user_name": comment['user']['login'],
                "user_type": comment['user']['type'],
                "comment_author_assoc": comment['author_association'],
                "comment_created_at": comment['created_at']},ignore_index=True)
    
    print(len(all_comments))
    # pauses process for 2 second
    matplotlib.pyplot.pause(2)



4937
10
4936
11
4935
12
4934
13
4933
14
4932
15
4931
16
4930
17
4929
18
4928
19
4927
20
4926
21
4925
22
4924
23
4923
24
4922
25
4921
26
4920
27
4919
28
4918
29
4917
30
4916
31
4915
32
4914
33
4913
34
4912
35
4911
36
4910
37
4909
38
4908
39
4907
40
4906
41
4905
42
4904
43
4903
44
4902
45
4901
46
4900
47
4899
48
4898
49
4897
50
4896
51
4895
52
4894
53
4893
54
4892
55
4891
56
4890
57
4889
58
4888
59
4887
60
4886
61
4885
62
4884
63
4883
64
4882
65
4881
66
4880
67
4879
68
4878
69
4877
70
4876
71
4875
72
4874
73
4873
74
4872
75
4871
76
4870
77
4869
78
4868
79
4867
80
4866
81
4865
82
4864
83
4863
84
4862
85
4861
86
4860
87
4859
88
4858
89
4857
90
4856
91
4855
92
4854
93
4853
94
4852
95
4851
96
4850
97
4999
98
4998
99
4997
100
4996
101
4995
102
4994
103
4993
104
4992
105
4991
106
4990
107
4989
108
4988
109
4987
110
4986
111
4985
112
4984
113
4983
114
4982
115
4981
116
4980
117
4979
118
4978
119
4977
120
4976
121
4975
122
4974
123
4973
124
4972
125
4971
126
4970
127
4969
128
4968
129
4967
130
4

In [21]:
all_comments

Unnamed: 0,commit_hash,user_name,user_type,comment_author_assoc,comment_created_at
0,8bb5c66a7ed6aa42158a0186f322a472872da6b3,,,,
1,841a4b56ae11783c166bc9415d72b479991201c5,,,,
2,bfe61670944ab2a32f7b86daae12515de1ec617c,,,,
3,1ee4d1622ad7623d9c26502202b576ca121e47a5,,,,
4,710d9f248b50a70adbfbff5824b5710d6d315d7a,,,,
5,bcb12f090dd0d030cbcd783220c37ceeb4548e2b,,,,
6,eb0ab69565d9bbe01e5b7700b0d978bf1c60c47f,,,,
7,cb3b0748767d3bb4b8abae636c524ff38fb2eb26,,,,
8,12f05afeebb7c32366c75a481fb3ae693b1da95b,,,,
