In [1]:
import os
import git2net

#path to copy of virgin db:
sqlite_db_file = '/home/luc/pip/git2net/group_work_1/git2net_tutorial (Kopie).db'

#set variable to whatever folder the to-be-mined git repository is located:
repo_dir = '/home/luc/pip/git2net/ds_gw_1/TLP'

In [2]:
#uncomment this block, if mining is wished

#check, if database file is there; remove if yes
#if os.path.exists(sqlite_db_file):
#    os.remove(sqlite_db_file)
#    print('sqlite_db_file removed!')

#mining process: 
#repo_dir: to-be-mined git repository
#sqlite_db_file: to-be-stored values in database

#git2net.mine_git_repo(repo_dir, sqlite_db_file)

#git2net.mining_state_summary(repo_dir, sqlite_db_file)

In [3]:
import pandas as pd
import sqlite3

#connect to database file:
con = sqlite3.connect(sqlite_db_file)
c = con.cursor()


#create a new table with the desired column names:
query0="""
    CREATE TABLE IF NOT EXISTS df2(
    hash TEXT PRIMARY KEY,
    committer_name TEXT,
    committer_date TEXT,
    original_commit_deletion TEXT,
    levenshtein_dist INTEGER);
"""

#select only rows of table "edits", where edit_type='replacement', 
#merge this with table "commits" along the key 'hash',
#group this by unique hashes, taking the sum of the levenshtein dist of same hashes
#finally, insert this into a new table "df2" to avoid key words as column names:
query1="""
    INSERT INTO df2(hash, committer_name, committer_date, original_commit_deletion, levenshtein_dist)
    SELECT hash, committer_name, committer_date, original_commit_deletion, "SUM(levenshtein_dist)"
    FROM
    (
    SELECT 
    hash,
    committer_name,
    committer_date,
    original_commit_deletion,
    SUM(levenshtein_dist)
    FROM(SELECT * FROM edits INNER JOIN commits ON commits.hash = edits.commit_hash WHERE edit_type = 'replacement')
    GROUP BY hash
    );
"""

#count the distinct committer names:
query1_1="""
    SELECT count(DISTINCT committer_name) FROM df2;
"""

#select the distinct committer names
query1_2="""
    SELECT DISTINCT committer_name FROM df2;
"""


#execute the two queries on the database:
c.execute(query0)
c.execute(query1)
c.execute(query1_1)

#store amount of distinct committer names as int...
number_committers = c.fetchone()[0]

#...before executing the next query:
c.execute(query1_2)

#store names of committers in list:
committer_list = c.fetchall()


#save the queries:
con.commit()

#close the database to make it accessible for others:
con.close()



In [4]:
#create new empty table:
query01="""
    CREATE TABLE IF NOT EXISTS df3(
    hash TEXT PRIMARY KEY,
    committer_name TEXT,
    committer_date TEXT,
    original_commit_deletion TEXT,
    levenshtein_dist INTEGER);
"""

#sort committer_date ascending:
query2="""
    INSERT INTO df3(hash, committer_name, committer_date, original_commit_deletion, levenshtein_dist)
    SELECT *
    FROM
    (
    SELECT * 
    FROM df2 
    ORDER BY committer_date ASC
    );
"""

#delete table df2:
query3="""
    DROP TABLE df2;
"""

#delete table df3:
#query4="""
#    DROP TABLE df3;
#"""


#ordering:
#query0, query2_edited, query3
#then:
#query01, query2_edited, query4


con = sqlite3.connect(sqlite_db_file)
c = con.cursor()

#execute the queries:
c.execute(query01)
c.execute(query2)
c.execute(query3)
#result: df3

print(c.fetchmany(5))

con.commit()
con.close()

[]


In [5]:
con = sqlite3.connect(sqlite_db_file)

#output df2 as a pandas dataframe
df = pd.read_sql_query("SELECT * FROM df3;", con=con)

#turn Pandas dataframe into numpy array:
df = df.values

In [6]:
import numpy as np

#function for adding a full-length '0'-column to the dataframe:
def add_column(df):
    new_column = np.array(np.zeros(df.shape[0]))
    return np.column_stack((df, new_column))

In [7]:
#add column:
df = add_column(df)

In [8]:
#make date stamps readible, by turning them into a datetime-object:

from datetime import datetime

def date_to_int(date_str):
    return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')

print(date_to_int(df[0][2]))

2010-01-22 18:16:44


In [9]:
#calculate amount of time (in seconds) since last commit for every authors commit; save values in the newly created column

for i in range(df.shape[0]-1):
    #grab the author and his commit_time:
    author = df[i][1]
    commit_time = df[i][2]
    for j in range(i+1,df.shape[0]):
        #search for the his next commit:
        author_next = df[j][1]
        commit_time_next = df[j][2]
        #if there is a "next commit", calculate the commit_time differences (in total seconds) and save the value in the last column:
        if author == author_next:
                df[j][5] = (date_to_int(commit_time_next) - date_to_int(commit_time)).total_seconds()
                break
            

In [10]:
#add empty 6th column:
df = add_column(df)
print(df.shape)

(942, 7)


In [11]:
#calculate a measure for productivity:
#levenshtein_dist / time_between_commits = levenshtein_dist per second

for i in range(df.shape[0]):
        #if time_between_commits = 0, then set productivity to 0
    if df[i][5] == 0:
        df[i][6] = 0
        #else, calculate productivity measure and store in 6th column
    else:     
        df[i][6] = df[i][4] / df[i][5]

In [12]:
#add empty 7th column:
df = add_column(df)

In [13]:
#find out, if author collaborated or not:
#'1' if his parents hash is another authors commit hash,
#'0' if not
#add this boolean in a separate column:

for i in range(df.shape[0]):
    parent_hash = df[i][3]
    #set orig_author to default 'nobody'
    #(important for genesis commit hashes)
    orig_author = 'nobody'
    for j in range(i):
        #check whether any other prior hash is from the same author
        if df[j][0] == parent_hash:
            #update orig_author with real author name:
            orig_author = df[j][1]
            #if original hash found, then break:
            break
    #set a place holder in the 7th column:
    df[i][7] = 'no parent_hash'
    #if author collaborated, save '1', if not, save '0':
    #only iff orig_author was another author, did the author collaborate:
    if orig_author != df[i][1] and orig_author != 'nobody':
        df[i][7] = 1
    else:
        df[i][7] = 0

In [14]:
#delete commit_hash:
df = np.delete(df,0,1)
#delete commit_time:
df = np.delete(df,1,1)
#delete parent_hash:
df = np.delete(df,1,1)
#delete levensthein_dist:
df = np.delete(df,1,1)
#delete time_between_commits:
df = np.delete(df,1,1)

In [15]:
#turn numpy array into pandas dataframe:
#insert first column of df as 'committer_names':
pdf = pd.DataFrame({'committer_names': df[:,0]})
#insert second column of df as 'productivity_per_second':
pdf['productivity_per_second'] = df[:,1]
#insert third column of df as 'collab_bool':
pdf['collab_bool'] = df[:,2]

print(pdf.shape)

(942, 3)


In [16]:
import statsmodels.formula.api as sm

In [19]:
pdf['productivity_per_second'].shape

(942,)

In [17]:
pdf['productivity_per_second'].shape

ValueError: shapes (942,922) and (942,922) not aligned: 922 (dim 1) != 942 (dim 0)

In [None]:
model = sm.ols(formula='productivity_per_second ~ collab_bool + committer_names', data = pdf)
fit = model.fit()

print(fit.summary())

In [14]:
model = sm.ols(formula='productivity_per_second ~ collab_bool + committer_names', data = pdf)
fit = model.fit()

print(fit.summary())

(942, 19)


In [15]:
#delete all the other columns that are not needed for the regression analysis:
#from df, delete row/column number 0, with axis column ('1'): 
df = np.delete(df,0,1)
df = np.delete(df,0,1)
df = np.delete(df,0,1)
df = np.delete(df,0,1)
df = np.delete(df,0,1)
df = np.delete(df,0,1)

In [16]:
#turn numpy array into pandas dataframe:
#insert first column of df as 'productivity_per_second':
pdf = pd.DataFrame({'productivity_per_second': df[:,0]})
#insert second column of df as 'collab_bool':
pdf['collab_bool'] = df[:,1]


for i in range(number_committers):
    #insert the dummy variable columns, 
    #take the committer_name as column name
    pdf[committer_list[i][0]] = df[:,i+2]

print(pdf.shape)

(942, 13)


In [17]:
#turn list of string-tuples into list of single-strings:
X = [i[0] for i in committer_list]
#insert the string 'collab_bool' at the first spot of X:
X.insert(0,'collab_bool')
X

['collab_bool',
 'Thomas Koch',
 'GitHub',
 'SammysHP',
 'André Erdmann',
 'Aaditya Bagga',
 'Timofey Titovets',
 'Connor Prussin',
 'TK',
 'Kai-Heng Feng',
 'Maxime Gauduin',
 'Qiang Yu']

In [18]:
#perform multiple linear regression:

#load the sklearn modules:
from sklearn.linear_model import LinearRegression
mlr = LinearRegression()

#fit the model:
#Independent variables: all the columns with name of the different committers
#Dependent variable: the productivity_per_second column 
mlr.fit(pdf[X], pdf[['productivity_per_second']])

#Print intercept:
print(mlr.intercept_)

#Print all coefficients:
print(mlr.coef_)

[-3.79310801e+12]
[[-1.07474422e+00  3.79310801e+12  3.79310801e+12  3.79310801e+12
   3.79310801e+12  3.79310801e+12  3.79310801e+12  3.79310801e+12
   3.79310801e+12  3.79310801e+12  3.79310801e+12  3.79310801e+12]]


In [19]:
#try a prediction:
#what is the productivity_per_second for thomas_koch, who is not collaborating? :
koch_not_collab = np.array([[0,1,0,0,0,0,0,0,0,0,0,0]])

mlr.predict(koch_not_collab)

array([[0.30371094]])

In [20]:
#Perform Robust Regression:

from sklearn import linear_model
from sklearn import metrics


model = linear_model.RANSACRegressor(min_samples=10, residual_threshold=5, max_trials=100)
model.fit(pdf[X], pdf[['productivity_per_second']])

print("Coefficients = ", model.estimator_.coef_)

Coefficients =  [[-3.76887392e-02 -3.67486755e+12 -3.67486755e+12 -3.67486755e+12
  -3.67486755e+12 -3.67486755e+12 -3.67486755e+12 -3.67486755e+12
  -3.67486755e+12 -3.67486755e+12 -3.67486755e+12 -3.67486755e+12]]


In [21]:
model.predict(koch_not_collab)

array([[0.04199219]])

In [22]:
import statsmodels.formula.api as sm

model = sm.ols(formula='productivity_per_second ~ pdf[X]', data = pdf)
fit = model.fit()
print(fit.summary())

PatsyError: categorical data cannot be >1-dimensional