## Getting sample matrices for datasets

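To save time and memory, compute the sample matrix ONCE and save it to a file (`*.txt`, or `*.npy` as in the cell below), then load it from disk in later runs instead of recomputing it.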

In [12]:
import re
import math
import time
import numpy as np

from core.whose_cpp_code import get_filenames
import warnings

warnings.filterwarnings("ignore") 

from core.lexical_features import get_lexical_features
from core.cpp_keywords import count_cppkeywords_tf

import os.path


# Patterns are written as raw strings so that regex escapes such as \s, \{ and
# \( are handed to the regex engine verbatim instead of being parsed as
# (invalid) Python string escapes, which newer interpreters warn about.
_INLINE_COMMENT_RE = re.compile(r'(.*)//')
_MULTILINE_COMMENT_RE = re.compile(r'(.*)/\*')
_MACRO_RE = re.compile(r'#define .*.\(.*\)')
_OPEN_BRACE_ALONE_RE = re.compile(r'^([\s\t\n]*)\{([\s\t\n]*)$')
_OPEN_BRACE_FIRST_RE = re.compile(r'^\{(.*?)')
_OPEN_BRACE_LAST_RE = re.compile(r'^(.*?)\{$')
_CLOSING_BRACE_ALONE_RE = re.compile(r'^([\s\t\n]*)\}([\s\t\n]*)$')
_CLOSING_BRACE_FIRST_RE = re.compile(r'^\}(.*?)')
_CLOSING_BRACE_LAST_RE = re.compile(r'^(.*?)\}$')


def _log_ratio(count, total):
    """Return ln(count / total), or 0 when count is zero (the log is undefined)."""
    return math.log(count / total) if count else 0


def get_features(filename, outfile):
    """Compute lexical/layout features of one source file and append them to outfile.

    The file is read line by line and per-line counters are accumulated
    (comments, whitespace, function-like macros, brace placement, non-blank
    lines). Most counters are then normalised by the file length in characters
    and log-transformed. One line containing the ``str()`` of the feature list
    is appended to ``outfile`` in this order:

        ln_comments, ln_inline_comments, ln_multiline_comments,
        ln_macros, ln_spaces, ln_tabs, ln_newlines, whitespace_ratio,
        lines_of_code, avg_line_length,
        ln_open_brace_alone, ln_open_brace_first, ln_open_brace_last,
        ln_closing_brace_alone, ln_closing_brace_first, ln_closing_brace_last

    Parameters
    ----------
    filename : str
        Path of the source file to analyse (opened in text mode).
    outfile : str
        Path of the text file the feature row is appended to. It is opened in
        append mode, so it is created even when nothing is written.

    Returns
    -------
    None
        The result is only written to ``outfile``. Nothing is written (and
        nothing is printed) when the input file contains no characters.
    """
    with open(filename, 'r') as in_file, open(outfile, 'a') as output_file:
        lines = in_file.readlines()
        file_length = sum(len(line) for line in lines)

        # An empty file has no features; this also guards the divisions below.
        if file_length == 0:
            return

        avg_line_length = file_length / len(lines)

        inline_comments = 0
        multiline_comments = 0
        newlines = 0
        spaces = 0
        tabs = 0
        macros = 0
        open_brace_alone = 0
        open_brace_first = 0
        open_brace_last = 0
        closing_brace_alone = 0
        closing_brace_first = 0
        closing_brace_last = 0
        lines_of_code = 0

        for line in lines:
            # comments: a line is counted once per kind, however many markers it has
            if _INLINE_COMMENT_RE.match(line):
                inline_comments += 1
            if _MULTILINE_COMMENT_RE.match(line):
                multiline_comments += 1

            # layout
            newlines += line.count('\n')
            spaces += line.count(' ')

            # Tabs on lines starting with 'char' or 'string' are not counted
            # (presumably to skip tabs inside literals -- TODO confirm intent).
            if not (line.startswith('char') or line.startswith('string')):
                tabs += line.count('\t')
            if _MACRO_RE.search(line):
                macros += 1

            # bracing style
            if _OPEN_BRACE_ALONE_RE.match(line):
                open_brace_alone += 1
            if _OPEN_BRACE_FIRST_RE.match(line):
                open_brace_first += 1
            if _OPEN_BRACE_LAST_RE.match(line):
                open_brace_last += 1

            if _CLOSING_BRACE_ALONE_RE.match(line):
                closing_brace_alone += 1
            if _CLOSING_BRACE_FIRST_RE.match(line):
                closing_brace_first += 1
            if _CLOSING_BRACE_LAST_RE.match(line):
                closing_brace_last += 1

            # Only spaces and newlines are stripped, so a line holding nothing
            # but tabs still counts as a line of code.
            if line.strip(' \n') != '':
                lines_of_code += 1

        comments = inline_comments + multiline_comments
        whitespace_ratio = (spaces + tabs + newlines) / file_length

        print('Got lexical_features for ', filename)

        lexical_features = [
            _log_ratio(comments, file_length),
            _log_ratio(inline_comments, file_length),
            _log_ratio(multiline_comments, file_length),
            _log_ratio(macros, file_length),
            _log_ratio(spaces, file_length),
            _log_ratio(tabs, file_length),
            _log_ratio(newlines, file_length),
            whitespace_ratio,
            lines_of_code,
            avg_line_length,
            _log_ratio(open_brace_alone, file_length),
            _log_ratio(open_brace_first, file_length),
            _log_ratio(open_brace_last, file_length),
            _log_ratio(closing_brace_alone, file_length),
            _log_ratio(closing_brace_first, file_length),
            _log_ratio(closing_brace_last, file_length),
        ]

        output_file.write("%s\n" % lexical_features)
            

# Build (or rather: assemble from cached parts) the sample matrix for the
# GoogleCodeJam dataset and store everything as *.npy files under `outpath`.
start_time = time.time()
path = '/media/marina/hdd/diploma/data/GoogleCodeJam/'
outpath = '/media/marina/hdd/diploma/whose_cpp_code/matricies/GoogleCodeJam/'

filenames, authors = get_filenames(path)
np.save(os.path.join(outpath, 'authors.npy'), authors)
# Fix: 'filenames.npy' used to receive `authors`, so the file list was never stored.
np.save(os.path.join(outpath, 'filenames.npy'), filenames)

# One-off feature extraction, kept for reference. Uncomment to regenerate the
# cached 'lexical_features.npy' / 'cpp_keywords_tf.npy' files loaded below.
# lexical_features = [get_lexical_features(filename) for filename in filenames]
# np.save(os.path.join(outpath, 'lexical_features.npy'), lexical_features)


# cpp_keywords_tf = count_cppkeywords_tf(filenames)
# np.save(os.path.join(outpath, 'cpp_keywords_tf.npy'), cpp_keywords_tf)


# Load the cached feature blocks and concatenate them column-wise
# (np.hstack), then persist the combined matrix.
lexical_features = np.load(os.path.join(outpath, 'lexical_features.npy'))
cpp_keywords_tf = np.load(os.path.join(outpath, 'cpp_keywords_tf.npy'))
matrix = np.hstack((lexical_features, cpp_keywords_tf))
np.save(os.path.join(outpath, 'matrix.npy'), matrix)

# Sanity check: reload the saved matrix and report its length (first dimension).
print(len(np.load(os.path.join(outpath, 'matrix.npy'))))

run_time = time.time() - start_time
print('Run time: ', run_time)

278
Run time:  0.021021127700805664


In [1]:
from numpy import mean, var, std

# accuracy
# Accuracy values to summarise.
x = [
    0.711672352444885, 0.710810315102161,
    0.718099115953193, 0.707362165731264,
    0.709077069806683, 0.70694948827996,
    0.713363412934228, 0.708653387623345,
    0.707824364476725, 0.705638091045816,
]

# Summary statistics; numpy's var/std are called with their default arguments.
m = mean(x)  # arithmetic mean
v = var(x)   # variance
s = std(x)   # standard deviation

# Report the mean, then the band one standard deviation either side of it
# (note: this is mean +/- std, not a formal confidence interval).
print(m)
print(m - s, m + s)

0.70994497634
0.706437556788 0.713452395891
