In [1]:
from IPython.display import HTML

# Render an HTML snippet as this cell's output: the embedded script hides every
# `div.input` element once the document is ready, and the trailing link calls
# `code_toggle()` to show/hide them again.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

# GitHub scraping

Form for scraping GitHub data

In [2]:
# %load_ext autoreload
%reload_ext autoreload
%autoreload


from ipywidgets import Text, Password, IntSlider, Box, Layout, Label, Dropdown, Textarea, Button
from IPython.display import display, HTML, clear_output

from core.github_scrapper import scrap

# GitHub account names handed to `scrap` as the list of users to process.
userlist = [
    "paroj", "asmorkalov", "edgarriba", "mariarti91", "GoodRon", "Mooophy",
    "pezy", "Soyn", "sipa", "gavinandresen", "theuni", "luke-jr",
    "ddunbar", "redboltz", "frsyuki", "tanakh", "danmar", "orbitcowboy",
    "XhmikosR", "zblair", "ghewgill", "singku", "eriklax", "HeisSpiter",
    "JIghtuse", "rpavlik", "gavofyork", "chfast", "debris", "lballabio",
]

# Layout for the whole form: a flex column, stretched items, half the available width.
form_layout = Layout(display='flex', flex_flow='column', align_items='stretch', width='50%')

# Layout for a single form row: a flex row with space between its children.
form_item_layout = Layout(display='flex', flex_flow='row', justify_content='space-between')


# Credential inputs read by `scrap_github`.
username = Text()
# Use the dedicated password widget instead of a plain text box so the typed
# password is not displayed in clear text on screen.
password = Password()


# Text(value='Username')


# One labelled row per credential input.
scrap_form_items = [
    Box([Label(value=caption), control], layout=form_item_layout)
    for caption, control in (('Username:', username), ('Password:', password))
]

def scrap_github(b):
    """Button callback: run the GitHub scraper with the credentials from the form.

    Clears the cell output first. If either the username or the password
    field is empty, only a prompt is printed; otherwise `scrap` is called
    with `userlist` and both field values, bracketed by progress messages.

    Parameters
    ----------
    b : object
        The argument passed by the button's click handler; not used.
    """
    clear_output()
    if not (username.value and password.value):
        print('Please, enter data')
        return

    print('Getting files, please, wait...')
    scrap(userlist, username.value, password.value)
    print('Done')

# Stack the credential rows into one form.
scrap_form = Box(scrap_form_items, layout=form_layout)

# Button that triggers `scrap_github` when clicked.
scrap_button = Button(
    description='Scrap GitHub',
    tooltip='Click me',
    icon='check',
    button_style='success',
)
scrap_button.on_click(scrap_github)

# Show the form followed by its button.
display(scrap_form, scrap_button)

## Lexical features

In [3]:
## Syntactic features

## Cpp keywords

- **module**: cpp_keywords.py
- **input**: filenames (numpy array)
- **output**: keywords frequencies (numpy array)

### Ключевые слова языка C/C++

Могут быть найдены по ссылке в [официальной документации](http://ru.cppreference.com/w/cpp/keyword "cppreference.com")

Частоты ключевых слов языка C/C++:

| | float | class | while | namespace | ... |
-----| --- | -------| --- | --- | --- |
file_1| 0.353 | 0.000 | 0.000 | 0.707 | ... |
file_2| 0.401 | 0.213 | 0.000 | 0.708 | ... |
... | ... | ... | ... | ... | ... |
file_n | 0.303 | 0.000 | 0.678 | 0.807 | ... |


### widgets

In [4]:
from ipywidgets import interact

def f(x):
    """Return ``x`` unchanged; used below as the callback passed to ``interact``."""
    return x

# Call interact on `f` with an integer default for `x`.
interact(f, x=10);

# Same callback, this time with a boolean default for `x`.
interact(f, x=True);


# Decorator form of interact; the keyword defaults given here are x=True and y=1.0.
@interact(x=True, y=1.0)
def g(x, y):
    """Return the two received values as the tuple ``(x, y)``."""
    return (x, y)

### main functions

In [5]:
# %load_ext autoreload
%reload_ext autoreload
%autoreload

from ipywidgets import IntSlider, Box, Layout, Label, Dropdown, Button
from IPython.display import display, HTML, clear_output
from core.whose_cpp_code import classify_authors
from numpy import mean, std
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")   


def get_confidence(array):
    """Print the mean, standard deviation and a 95% confidence interval of ``array``.

    The interval is the normal-approximation confidence interval for the
    mean, ``m ± z * sigma / sqrt(n)`` with ``z = 1.96``. Multiplying sigma
    by the confidence level itself (``0.95 * sigma``) does not yield a
    confidence interval, so the standard error and the normal quantile are
    used instead.

    Parameters
    ----------
    array : sequence of float
        One-dimensional collection of scores (it must support ``len``).
        NOTE(review): for very small samples a Student-t quantile would give
        a wider, more accurate interval — confirm whether that is needed.

    Returns
    -------
    None
        The three values are printed, not returned.
    """
    z_score = 1.96  # two-sided standard-normal quantile for a 95% confidence level
    m = mean(array)  # general average
    sigma = std(array)  # standard deviation (numpy default, i.e. ddof=0)

    # Half-width of the interval: quantile times the standard error of the mean.
    half_width = z_score * sigma / len(array) ** 0.5

    print('mean accuracy: ', m)
    print('standart deviation: ', sigma)
    print('confidence interval: (', m - half_width, ';', m + half_width, ')')


# Layout for a whole form: a flex column, stretched items, half the available width.
form_layout = Layout(display='flex', flex_flow='column', align_items='stretch', width='50%')

# Layout for a single form row: a flex row with space between its children.
form_item_layout = Layout(display='flex', flex_flow='row', justify_content='space-between')


# loops = IntSlider(min=1, max=10)
# Dataset selector: maps a display label to the directory passed to `classify_authors`.
# NOTE(review): the first two entries are absolute paths under /media/marina/... —
# confirm they exist on the machine running this notebook, or make them configurable.
data = Dropdown(options={'students' : '/media/marina/hdd/diploma/data/c++/data/', 
                           'GoogleCodeJam' : '/media/marina/hdd/diploma/data/c++/data3/', 
                           'GitHub' : './data'})
# Classifier selector: maps a display label to the classifier name string passed to
# `classify_authors` (presumably scikit-learn ensemble class names — verify in core.whose_cpp_code).
classifier =  Dropdown(options={'RandomForest' : 'RandomForestClassifier', 
                                'GradientBoosting' : 'GradientBoostingClassifier',
                                'ExtraTrees' : 'ExtraTreesClassifier',
                                'AdaBoost' : 'AdaBoostClassifier'})

In [6]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

def make_metrics_bar(metrics, loops_num):
    """Plot the per-experiment metrics as a grouped bar chart.

    Parameters
    ----------
    metrics : mapping
        Must provide the keys 'f1_score', 'precision', 'recall' and
        'accuracy'; each value is used as the y data of one bar series.
    loops_num : int
        Number of experiments; the x axis is numbered 1..loops_num.

    Returns
    -------
    None
        The figure is shown through ``iplot``.
    """
    # (metrics key, legend name, bar colour), in drawing order.
    series = (
        ('f1_score', 'F1-score', 'rgb(136, 142, 150)'),
        ('precision', 'Precision', 'rgb(204,204,204)'),
        ('recall', 'Recall', 'rgb(144, 177, 229)'),
        ('accuracy', 'Accuracy', 'rgb(49,130,189)'),
    )
    bars = [
        go.Bar(
            x=list(range(1, loops_num + 1)),
            y=metrics[key],
            name=legend,
            marker=dict(color=colour),
        )
        for key, legend, colour in series
    ]

    grey = 'rgb(107, 107, 107)'

    def axis(title, **extra):
        # Both axes share the same title/tick font settings; build fresh dicts per axis.
        return dict(
            title=title,
            titlefont=dict(size=16, color=grey),
            tickfont=dict(size=14, color=grey),
            **extra
        )

    layout = go.Layout(
        xaxis=axis('Number of experiment', tickangle=-45),
        yaxis=axis('Value, %'),
        barmode='group',
        title='Classification metrics',
    )

    iplot(go.Figure(data=bars, layout=layout), filename='metrics-bar')
    

def make_pie(mean_accuracy):
    """Plot the mean accuracy against its complement as a pie chart.

    Parameters
    ----------
    mean_accuracy : float
        Value plotted as the second slice; ``1 - mean_accuracy`` is the first.

    Returns
    -------
    None
        The figure dictionary is shown through ``iplot``.
    """
    # Single pie trace with its `hole` attribute set to .4.
    trace = dict(
        values=[1 - mean_accuracy, mean_accuracy],
        labels=['Wrong predicted samples, %', 'True predictes samples, %'],
        type='pie',
        text='Accuracy',
        textposition='inside',
        hole=.4,
    )
    # Text annotation without an arrow, attached to the layout.
    caption = dict(font=dict(size=20), showarrow=False, text='Accuracy')
    layout = dict(title='Total mean accuracy', annotations=[caption])

    iplot(dict(data=[trace], layout=layout))

In [7]:
def classify(b):
    """Button callback: run one classification, log the report and display it.

    Calls ``classify_authors`` with the selected dataset and classifier,
    turns the returned report into a DataFrame, appends it to ``output.csv``
    and renders it as an HTML table in the cell output.

    The CSV header is written only when ``output.csv`` does not exist yet or
    is empty. Appending with the header on every click would interleave
    header rows with data rows and make the file hard to read back.

    Parameters
    ----------
    b : object
        The argument passed by the button's click handler; not used.
    """
    import os  # local import: only needed for the header check below

    clear_output()
    print('Please, wait...')
    report = classify_authors(data.value, classifier.value)
    df = pd.DataFrame(report)

    output_path = 'output.csv'
    write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
    df.to_csv(output_path, mode='a', header=write_header)

    display(HTML(df.to_html()))
    print('Done')

# One labelled row per dropdown of the single-run form.
table_form_items = [
    Box([Label(value=caption), control], layout=form_item_layout)
    for caption, control in (('Data:', data), ('Classifier:', classifier))
]
table_form = Box(table_form_items, layout=form_layout)

# Button that triggers `classify` when clicked.
classify_button = Button(
    description='Classify',
    tooltip='Click me',
    icon='check',
    button_style='success',
)
classify_button.on_click(classify)

# Show the form followed by its button.
display(table_form, classify_button)

In [8]:
import numpy as np
import time

# Slider selecting how many times the classification is repeated (1 to 10).
loops = IntSlider(min=1, max=10)

# One labelled row per control of the multi-run form.
form_items = [
    Box([Label(value=caption), control], layout=form_item_layout)
    for caption, control in (
        ('Loops:', loops),
        ('Data:', data),
        ('Classifier:', classifier),
    )
]


def classify_mul(b):
    """Button callback: repeat the classification and chart the averaged metrics.

    Runs ``classify_authors`` ``loops.value`` times with the selected dataset
    and classifier. For every run the mean of the report's 'accuracy',
    'precision', 'recall' and 'f1_score' columns is recorded. Afterwards the
    elapsed time is printed, the per-run means are drawn with
    ``make_metrics_bar``, the overall mean accuracy with ``make_pie``, and the
    per-run means plus the classifier name are written to ``results.csv``
    (overwriting any previous file).

    Parameters
    ----------
    b : object
        The argument passed by the button's click handler; not used.
    """
    clear_output()
    tracked = ('accuracy', 'precision', 'recall', 'f1_score')
    history = {name: [] for name in tracked}

    started = time.time()
    for iteration in range(1, loops.value + 1):
        print('Loop ', iteration, ': Please, wait...')
        frame = pd.DataFrame(classify_authors(data.value, classifier.value))
        for name in tracked:
            history[name].append(mean(frame[name].tolist()))

    elapsed = round(time.time() - started, 2)
    print('Run time in sec: ', elapsed)

    # Key order here determines the column order of results.csv.
    metrics = {
        name: history[name]
        for name in ('f1_score', 'precision', 'recall', 'accuracy')
    }
    make_metrics_bar(metrics, loops.value)
    make_pie(mean(history['accuracy']))

    # saving results to csv
    metrics['classifier'] = classifier.value
    pd.DataFrame(metrics).to_csv('results.csv', mode='w')

   

# Stack the rows into the multi-run form.
form = Box(form_items, layout=form_layout)

# Button that triggers `classify_mul` when clicked.
classify_mul_btn = Button(
    description='Classify',
    tooltip='Click me',
    icon='check',
    button_style='success',
)
classify_mul_btn.on_click(classify_mul)

## main module

In [9]:
# Show the multi-run form followed by its button; clicking the button calls `classify_mul`.
display(form, classify_mul_btn)

Loop  1 : Please, wait...
Loop  2 : Please, wait...
Loop  3 : Please, wait...
Loop  4 : Please, wait...
Loop  5 : Please, wait...
Loop  6 : Please, wait...
Loop  7 : Please, wait...
Loop  8 : Please, wait...
Loop  9 : Please, wait...
Loop  10 : Please, wait...
Run time in sec:  54.0
