# The code below is proposed as an example solution for Microtask 1
## The program is based on the GrimoireLab Tutorial
### Developed and tested by Jacek Dąbrowski

In [29]:
# Importing modules and libraries
import subprocess
from datetime import datetime, timezone
from pprint import pprint

import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

In [4]:
# Choose the repository to analyse (perceval.git is used for the commit analysis)
# and create the Elasticsearch client used by the queries further down
git_repo = 'https://github.com/grimoirelab/perceval.git'
es = Elasticsearch('http://localhost:9200')

In [None]:
# Run p2o.py to extract the git data and populate the raw index ('git_raw') and the
# enriched index ('git') on the local Elasticsearch instance.
# check=True makes a non-zero exit status raise subprocess.CalledProcessError, so a
# failed indexing run is not silently ignored before the index is queried below.
subprocess.run(
    ['p2o.py', '--enrich', '--index', 'git_raw',
     '--index-enrich', 'git', '-e', 'http://localhost:9200',
     '--no_inc', '--debug', 'git', git_repo],
    check=True)

In [5]:
# Define the query against the enriched 'git' index.
# A terms aggregation groups the commits by author name (up to 10000 authors), and a
# nested 'min' metric on author_date gives the first commit of each of these authors.
s = Search(using=es, index='git')
by_authors = s.aggs.bucket('by_authors', 'terms', field='author_name', size=10000)
by_authors.metric('first_commit', 'min', field='author_date')
# sort() returns a modified copy of the search, so it has to be reassigned
s = s.sort("author_date")

In [6]:
# Execute the query; `result` holds the response, including the 'by_authors'
# aggregation that is read in the next step
result = s.execute()

In [7]:
# Select the part of the response we are interested in: one bucket per author, with
# the author name ('key'), the number of commits ('doc_count') and the minimum
# author_date ('first_commit')
buckets_result = result['aggregations']['by_authors']['buckets']
# Convert the query results to a list of plain records
buckets = []
for bucket in buckets_result:
    # The aggregated value is divided by 1000 to get the seconds expected by fromtimestamp()
    first_commit = bucket['first_commit']['value']/1000
    buckets.append(
        {
            # fromtimestamp(..., tz=timezone.utc) replaces utcfromtimestamp(), which is
            # deprecated since Python 3.12; tzinfo is dropped afterwards so the stored
            # value remains a naive UTC datetime, exactly as before
            'first_commit': datetime.fromtimestamp(first_commit, tz=timezone.utc).replace(tzinfo=None),
            'author': bucket['key'],
            'number_of_commits': bucket['doc_count'],
        }
    )

In [8]:
# Turn the list of records into a DataFrame (a two-dimensional table) with the
# most recent first commits at the top
authors = (
    pd.DataFrame.from_records(buckets)
    .sort_values(by='first_commit', ascending=False)
)

In [9]:
# Build a table with the authors and their number of commits (the date column is left out)
commit_columns = ['author', 'number_of_commits']
authors_commits = authors[commit_columns]
pprint(authors_commits)

                            author  number_of_commits
7           Miguel Ángel Fernández                  3
12                  Israel Herraiz                  1
16                           david                  1
11            David Pose Fernández                  1
10                     David Esler                  1
9                          valerio                  2
1                Valerio Cosentino                142
5                valerio cosentino                  6
15                   Stephan Barth                  1
14                 Luis Cañas Díaz                  1
6                             quan                  5
8                         camillem                  2
13  J. Manrique Lopez de la Fuente                  1
2                   Alberto Martín                 51
4       Jesus M. Gonzalez-Barahona                 18
3              Alvaro del Castillo                 45
0                  Santiago Dueñas                704


In [10]:
# Count the new committers per month: group the first-commit dates by (year, month)
# and count how many authors fall into each group
first_commits = authors['first_commit']
by_month = first_commits.groupby(
    [first_commits.dt.year, first_commits.dt.month]
).count()

In [11]:
# The first column denotes the year, the second the month, and the last one the
# number of authors whose first commit falls in that month
pprint(by_month)

first_commit  first_commit
2015          8               1
              12              2
2016          2               1
              3               2
              4               1
              9               1
2017          1               1
              9               2
              10              2
              11              1
              12              1
2018          1               1
              2               1
Name: first_commit, dtype: int64


In [39]:
# Write the results to CSV files: the first commit of every author (without the
# DataFrame index) and the number of new authors per month
authors[['first_commit', 'author']].to_csv('authors_first.csv', index=False)
by_month.to_csv('authors_per_month.csv')