# Exploring Data
This notebook is aimed to explore cargo repo data

In [2]:
import pandas
import matplotlib
import seaborn
import collections
import itertools

import sys
sys.path.append('..')

from helper import load_data , load_repo

%matplotlib inline

# Load Cargo

In [3]:
cargo = load_repo('Cargo')

In [4]:
cargo_git = cargo[cargo.Repository_URL.notnull()]
cargo_git = cargo_git[cargo_git.Repository_URL.str.contains('github')]
cargo_git = cargo_git.drop_duplicates(subset='Repository_URL')

# Load Dependencies

In [5]:
packages, dependencies = load_data('Cargo')

In [6]:
packages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82335 entries, 0 to 82336
Data columns (total 3 columns):
package    82335 non-null object
version    82335 non-null object
date       82335 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 2.5+ MB


All unique cargo packages and adding a column as package-version combination

In [7]:
unique_cargo_packages = packages.drop_duplicates(subset='package',keep = 'last')
unique_cargo_packages['pack_ver'] = unique_cargo_packages['package']+unique_cargo_packages['version']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


All dependencies

In [8]:
dependencies['pack_ver'] = dependencies['package']+dependencies['version']
all_unique_dependencies = dependencies[dependencies['pack_ver'].isin(unique_cargo_packages['pack_ver'])]
all_unique_dependencies = all_unique_dependencies.drop('pack_ver',axis=1).reset_index().drop('index',axis=1)

In [9]:
all_unique_dependencies.head(10)

Unnamed: 0,package,version,target,constraint
0,acacia,0.1.2,itertools,^0.5
1,acacia,0.1.2,num,^0.1
2,acacia,0.1.2,rand,^0.3
3,aio,0.0.1,event,*
4,aio,0.0.1,mio,*
5,aio,0.0.1,nix,*
6,advapi32-sys,0.2.0,winapi,^0.2.5
7,alfred,4.0.1,serde_json,^1.0
8,algebloat,0.0.8,algebloat_macros,= 0.0.8
9,allegro_acodec,0.0.8,allegro_acodec-sys,= 0.0.8


# Related Git URLs
finding git url that each package contributor communicates with 

In [11]:
repos = cargo_git[['Name','Repository_URL']].set_index('Name')
repos.head()

Unnamed: 0_level_0,Repository_URL
Name,Unnamed: 1_level_1
acacia,https://github.com/aepsil0n/acacia
abort_on_panic,https://github.com/emk/abort_on_panic-rs
aio,https://github.com/reem/rust-aio
advapi32-sys,https://github.com/retep998/winapi-rs
alfred,https://github.com/kballard/alfred-rs


In [45]:
ser = all_unique_dependencies['target']
all_unique_dependencies['target_url'] = repos.loc[all_unique_dependencies['target']].reset_index()[['Repository_URL']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


In [48]:
all_unique_dependencies.head(30)


Unnamed: 0,package,version,target,constraint,target_url
0,acacia,0.1.2,itertools,^0.5,https://github.com/bluss/rust-itertools
1,acacia,0.1.2,num,^0.1,https://github.com/rust-num/num
2,acacia,0.1.2,rand,^0.3,https://github.com/rust-lang-nursery/rand
3,aio,0.0.1,event,*,https://github.com/reem/rust-event
4,aio,0.0.1,mio,*,https://github.com/carllerche/mio
5,aio,0.0.1,nix,*,https://github.com/nix-rust/nix
6,advapi32-sys,0.2.0,winapi,^0.2.5,
7,alfred,4.0.1,serde_json,^1.0,https://github.com/serde-rs/json
8,algebloat,0.0.8,algebloat_macros,= 0.0.8,
9,allegro_acodec,0.0.8,allegro_acodec-sys,= 0.0.8,


Unnamed: 0_level_0,Repository_URL
Name,Unnamed: 1_level_1
