In [3]:
import pydriller
import pandas as pd
from tqdm import tqdm
import re
import shutil
import jsonlines
import os
import json
import jsonlines

from langdetect import detect

In [5]:
# Load the repository listing from results.csv (decoded as latin-1) and preview
# the first rows; the `name` column is what the following cells build URLs from.
df = pd.read_csv('results.csv', encoding='latin-1')
df.head()

Unnamed: 0,name,isFork,commits,branches,defaultBranch,releases,contributors,license,watchers,stargazers,...,homepage,mainLanguage,totalIssues,openIssues,totalPullRequests,openPullRequests,lastCommit,lastCommitSHA,hasWiki,isArchived
0,knadh/ml2en,False,23,1,master,0,3.0,,5,56,...,,Python,4.0,1.0,4.0,0.0,2021-12-10T08:10:03,dd706167d52f8c9f556ee15869e00749c71df677,True,False
1,er10yi/magicude,False,98,2,main,12,1.0,,12,423,...,,Python,9.0,2.0,2.0,0.0,2022-01-10T03:28:29,b20f98939e928847f323d903f358221380a1ea74,True,False
2,keguoyu/minijvm,False,11,3,main,0,0.0,,1,62,...,,Python,,,,,2021-02-20T07:13:03,4da519cadead7164de37fb9b8913360dd45f9193,True,False
3,nordnet/next-api-v2-examples,False,35,2,master,0,10.0,MIT License,17,32,...,https://www.nordnet.se/externalapi/docs,Python,0.0,0.0,10.0,0.0,2022-03-10T11:18:36,73ed85404f10d879d8e5959a88a19dfad3b41827,False,False
4,aubreyrjones/parasol,False,70,1,master,0,1.0,BSD 2-Clause Simplified License,3,18,...,,Python,0.0,0.0,0.0,0.0,2021-04-26T02:09:20,3110aec5b8824f10d324e35d02997b762f0436f4,True,False


In [6]:
# (rows, columns) of the loaded repository listing.
df.shape

(152562, 25)

In [7]:
# The `name` column holds "<owner>/<repo>" slugs. `urls` is the GitHub clone URL
# for each slug and `names` is the part before the first '/' (the owner).
repo_slugs = df['name']
urls = repo_slugs.apply(lambda slug: "https://github.com/" + slug)
names = repo_slugs.apply(lambda slug: slug.partition('/')[0])
urls[0], names[0]

('https://github.com/knadh/ml2en', 'knadh')

In [8]:
# URLs of repositories that already have at least one record in the output file.
# The file does not exist before the first crawl, so a missing file simply means
# "nothing processed yet" instead of a FileNotFoundError.
processed_repos = set()
if os.path.exists("collected_info.jsonl"):
    with jsonlines.open("collected_info.jsonl") as reader:  # opened for reading
        processed_repos.update(obj['rep_url'] for obj in reader)

In [9]:
# Scratch directory that repositories are cloned into. `exist_ok=True` makes the
# cell safe to re-run without a separate existence check (os is imported in the
# first cell).
os.makedirs('tmp_repos', exist_ok=True)

In [10]:
def get_dir_size(path='.'):
    """Recursively measure a directory tree.

    Args:
        path: Directory to scan (defaults to the current directory).

    Returns:
        A tuple ``(size_in_bytes, file_count, py_file_count)`` accumulated over
        ``path`` and every directory below it. ``py_file_count`` counts files
        whose name ends in ``.py``.
    """
    size_bytes = 0
    n_files = 0
    n_py_files = 0
    with os.scandir(path) as entries:
        for item in entries:
            if item.is_file():
                size_bytes += item.stat().st_size
                n_files += 1
                if item.name.endswith('.py'):
                    n_py_files += 1
                continue
            if not item.is_dir():
                # Neither a file nor a directory: ignored.
                continue
            # Fold the totals of the sub-directory into the running totals.
            sub_size, sub_files, sub_py_files = get_dir_size(item.path)
            size_bytes += sub_size
            n_files += sub_files
            n_py_files += sub_py_files
    return size_bytes, n_files, n_py_files

In [11]:
def get_modules_from_py(file_text):
    """Return the unindented import statements found in Python source text.

    Args:
        file_text: Source code as a single string.

    Returns:
        A list of matching lines: every line starting with the keyword
        ``import`` first (in file order), followed by every line starting with
        the keyword ``from`` (in file order). Indented imports are not matched.
    """
    # `\b` requires the keyword to end right there, so identifiers that merely
    # start with the same letters (`important = 1`, `from_date = ...`) are not
    # reported as imports.
    plain_imports = re.findall(r'^import\b.*', file_text, re.M)
    from_imports = re.findall(r'^from\b.*', file_text, re.M)
    return plain_imports + from_imports

# Ad-hoc check of get_modules_from_py on a small snippet: the result lists the
# `import ...` lines first, then the `from ...` lines, each group in file order.
a = """import re
import numpy as np
np.random.seed(10)
from tqdm import tqdm
b=10
from math import log
print(log(b))
import os
os.path.append('.')

from sklearn.metrics import r2_score
import sys
import pandas as pd
"""
get_modules_from_py(a)

['import re',
 'import numpy as np',
 'import os',
 'import sys',
 'import pandas as pd',
 'from tqdm import tqdm',
 'from math import log',
 'from sklearn.metrics import r2_score']

In [12]:
# urls and names are zipped together by the crawl loop, so they must line up.
assert len(urls) == len(names)

In [14]:
# Field names of the per-commit records.
# NOTE(review): this list is not referenced by any other visible cell, and it is
# out of sync with the keys the crawl loop writes (e.g. 'com_py2mod_lines' and
# 'com_py2imports' are never written, while 'com_py2add_lines',
# 'com_py2del_lines' and 'com_py2imports_before'/'_after' are missing here) —
# confirm whether it is still needed.
json_keys = [
    'rep_url', 'rep_size_on_disk', 'rep_total_files', 'rep_total_py_files', 'com_num_files',
    'com_msg', 'com_msg_lang', 'com_num_py_files', 'com_py2mod_lines', 'com_py2nloc', 
    'com_py2lines_after', 'com_py2imports', 'com_hash', 'com_author', 'com_committer', 
    'committer_date', 'com_branches', 'com_in_main_branch', 'com_merge', 'com_deletions',
    'com_insertions', 'com_lines', 'com_dmm_unit_size', 'com_dmm_unit_complexity',
    'com_dmm_unit_interfacing', 
]

In [15]:
def _read_source(mod_file, attr):
    """Decode ``getattr(mod_file, attr)`` as UTF-8, returning '' on any failure.

    Deliberately best-effort: the attribute may be None or hold bytes that are
    not valid UTF-8, and neither case should abort the crawl.
    """
    try:
        return getattr(mod_file, attr).decode("utf-8")
    except Exception:
        return ''


def _build_commit_record(commit, repo_fields):
    """Build the JSON-serialisable record written for one commit.

    Args:
        commit: A commit object yielded by ``Repository.traverse_commits()``.
        repo_fields: Repository-level fields copied to the start of the record.

    Returns:
        A dict with the repository fields, the commit metadata and, for every
        modified file whose name ends in ``.py``, per-file statistics keyed by
        ``"<index>_<filename>"``.
    """
    new_obj = dict(repo_fields)

    msg = commit.msg if commit.msg else ''
    new_obj['com_msg'] = msg

    # NOTE(review): despite the key name this stores a boolean "message is not
    # English" flag, or '' when detection fails — confirm that downstream
    # consumers expect that before changing it.
    com_lang = ''
    try:
        com_lang = detect(commit.msg) != 'en'
    except Exception:
        pass
    new_obj['com_msg_lang'] = com_lang
    new_obj['com_num_files'] = commit.files

    new_obj['com_hash'] = commit.hash
    new_obj['com_author'] = commit.author.name
    new_obj['com_committer'] = commit.committer.name
    new_obj['committer_date'] = str(commit.committer_date)
    new_obj['com_branches'] = list(commit.branches)
    new_obj['com_in_main_branch'] = commit.in_main_branch
    new_obj['com_merge'] = commit.merge
    new_obj['com_deletions'] = commit.deletions
    new_obj['com_insertions'] = commit.insertions
    new_obj['com_lines'] = commit.lines
    new_obj['com_dmm_unit_size'] = commit.dmm_unit_size
    new_obj['com_dmm_unit_complexity'] = commit.dmm_unit_complexity
    new_obj['com_dmm_unit_interfacing'] = commit.dmm_unit_interfacing

    len_mod_files = 0
    num_py_files = 0
    py2add_lines = {}
    py2del_lines = {}
    py2nloc = {}
    py2lines_before = {}
    py2lines_after = {}
    py2imports_before = {}
    py2imports_after = {}
    py2paths = {}
    py2ch_type = {}
    for mod_id, mod_file in enumerate(commit.modified_files):
        len_mod_files += 1
        if not mod_file.filename.endswith('.py'):
            continue
        num_py_files += 1
        # The index prefix keeps same-named files in different folders apart.
        file_name = f'{mod_id}_{mod_file.filename}'
        py2nloc[file_name] = mod_file.nloc
        py2add_lines[file_name] = mod_file.added_lines
        py2del_lines[file_name] = mod_file.deleted_lines
        py2paths[file_name] = [mod_file.old_path, mod_file.new_path]
        py2ch_type[file_name] = mod_file.change_type.name

        # Read THIS file's contents (previously the first modified file of the
        # commit was read for every entry).
        code_before = _read_source(mod_file, 'content_before')
        code_after = _read_source(mod_file, 'content')

        # Newline count + 1 after stripping, so an empty text still counts as 1.
        py2lines_before[file_name] = code_before.strip().count('\n') + 1
        py2lines_after[file_name] = code_after.strip().count('\n') + 1
        py2imports_before[file_name] = get_modules_from_py(code_before)
        py2imports_after[file_name] = get_modules_from_py(code_after)

    new_obj['com_len_mod_files'] = len_mod_files
    new_obj['com_num_py_files'] = num_py_files
    new_obj['com_py2add_lines'] = py2add_lines
    new_obj['com_py2del_lines'] = py2del_lines
    new_obj['com_py2paths'] = py2paths
    new_obj['com_py2nloc'] = py2nloc
    new_obj['com_py2lines_after'] = py2lines_after
    new_obj['com_py2lines_before'] = py2lines_before
    new_obj['com_py2imports_before'] = py2imports_before
    new_obj['com_py2imports_after'] = py2imports_after
    new_obj['com_py2ch_type'] = py2ch_type
    return new_obj


pbar = tqdm(zip(urls, names), total=len(urls), position=0, leave=True)
# Start with an empty log of visited URLs.
with open('urls_logger', 'w') as f:
    pass

for url, name in pbar:
    with open('urls_logger', 'a') as f:
        f.write(url+'\n')
    # Repositories that already have records in collected_info.jsonl are skipped.
    if url in processed_repos:
        continue
    print(url)
    repo_dir = f'tmp_repos/{name}'
    os.makedirs(repo_dir, exist_ok=True)
    # The file-type filter is passed as a list, as documented by PyDriller;
    # a bare string would be split into single-character suffixes.
    repo = pydriller.Repository(url, clone_repo_to=repo_dir, only_modifications_with_file_types=['.py'],
                                num_workers=6)
    try:
        try:
            commits = list(repo.traverse_commits())
        except Exception:
            # Cloning or walking the history failed: skip this repository
            # (the clone directory is removed by the `finally` below).
            continue

        try:
            repo_size, total_files, total_py_files = get_dir_size(repo_dir)
        except Exception:
            # Size statistics are best-effort; -1 marks "unknown".
            repo_size, total_files, total_py_files = -1, -1, -1

        repo_fields = {
            'rep_url': url,
            'rep_size_on_disk': repo_size,
            'rep_total_files': total_files,
            'rep_total_py_files': total_py_files,
        }

        # Open the output once per repository instead of once per commit.
        with jsonlines.open("collected_info.jsonl", "a") as writer:
            for commit in commits:
                writer.write(_build_commit_record(commit, repo_fields))
    finally:
        # Always remove the clone, including on Ctrl-C or an unexpected error.
        shutil.rmtree(repo_dir)

  0%|          | 0/152562 [00:00<?, ?it/s]

https://github.com/knadh/ml2en


  0%|          | 1/152562 [00:02<90:20:30,  2.13s/it]

https://github.com/er10yi/magicude


  0%|          | 2/152562 [00:07<126:42:17,  2.99s/it]

https://github.com/keguoyu/minijvm


  0%|          | 3/152562 [00:09<116:01:46,  2.74s/it]

https://github.com/nordnet/next-api-v2-examples


  0%|          | 4/152562 [00:10<101:09:32,  2.39s/it]

https://github.com/aubreyrjones/parasol


  0%|          | 5/152562 [00:18<170:06:41,  4.01s/it]

https://github.com/grassmunk/chicago95_extras


  0%|          | 6/152562 [00:23<177:10:34,  4.18s/it]

https://github.com/mjansson/mdns_lib


  0%|          | 7/152562 [00:27<172:13:46,  4.06s/it]

https://github.com/weixiao-huang/silver-spoon


  0%|          | 8/152562 [00:28<134:36:36,  3.18s/it]

https://github.com/xingshuo/python-kcp


  0%|          | 9/152562 [00:30<120:50:11,  2.85s/it]

https://github.com/friendlyarm/bakebit


KeyboardInterrupt: 

In [None]:
# NOTE(review): looks like a leftover scratch expression — confirm it can be deleted.
1