In [1]:
import os
import pandas as pd
from pathlib import Path
import numpy as np
# import logging.config
import yaml
# import utils
from pygount import SourceAnalysis
from tree_sitter import Language, Parser

##Open the config file
with open("config.yml", "r") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

# logging.config.fileConfig(fname="./logging.conf", disable_existing_loggers=False)
# logging = logging.getLogger("clone")

build_file_path = 'C:/Users/vikrant.sahu/Documents/Python_Scripts_Projects/GIT Analytics/code_parser/build/my-languages.so'
##Function to parse any file
def parse_file(path, file_name):

    file = ''.join([path, '/', file_name])  ##merging path with file name
    PARSER = Parser()

    if file_name.endswith(".py"):
        ##Define the parser
        language = "python"
        PARSER.set_language(Language(build_file_path, language))
        import parsers.python_parser as parser_wrapper
    elif file_name.endswith(".js"):
        ##Define the parser
        language = "javascript"
        PARSER.set_language(Language(build_file_path, language))
        import parsers.javascript_parser as parser_wrapper
    elif file_name.endswith(".java"):
        ##Define the parser
        language = "java"
        PARSER.set_language(Language(build_file_path, language))
        import parsers.java_parser as parser_wrapper

    else:
        file_parse_error = np.NaN
        class_count = pd.NA
        function_count = pd.NA
        temp_func_details = []
        return file_parse_error, class_count, function_count, temp_func_details

    # parse the whole file
    with open(file, encoding="utf8") as source_code:
        blob = source_code.read()
    tree = PARSER.parse(blob.encode())
    # get the class and function details
    try:
        temp_func_details = parser_wrapper.get_definition(tree, blob)
        temp_func_details = pd.DataFrame(temp_func_details)

        # Assign File name and Path
        temp_func_details["file_name"] = file_name
        temp_func_details["file_path"] = path

        # Get function body
        temp_func_details["function_body"] = ""
        for index, row in temp_func_details[temp_func_details.func_name.notnull()].iterrows():
            with open(file, encoding="utf8") as fd:
                temp_func_details.at[index, "function_body"] = [line for i, line in enumerate(fd) if
                                                 i in range(row.start_point[0] - 1, row.end_point[0])]

        # Count Classes and functions
        class_count = temp_func_details[temp_func_details.func_name.isnull()].shape[0]
        function_count = temp_func_details[temp_func_details.class_name.isnull()].shape[0]

        file_parse_error = np.NaN
    except Exception as e:
        logging.error(e)
        file_parse_error = e
        class_count = pd.NA
        function_count = pd.NA
        temp_func_details = []

    return file_parse_error, class_count, function_count, temp_func_details

###Fuction to parse the repo - called during installation
def pasrse_repo(repo_id, dir_path):
    func_class_details = pd.DataFrame()
    file_details = pd.DataFrame()

    for path, subdirs, files in os.walk(dir_path):

        path = path.replace("\\", "/")
        ##ignore if it's the .git folder as it doesn't contains any code for parsing
        if ".git" in path.split("/"): continue

        for name in files:
            file = ''.join([path, '/', name])  ##merging path with file name
            file_size = Path(file).stat().st_size  ##Bytes
            ##Get file extension
            fn, file_extension = os.path.splitext(file)
            ##get code and comments
            code_metrics = (SourceAnalysis.from_file(file, ""))

            file_parse_error, class_count, function_count, temp_func_details = parse_file(path=path
                                                                                          , file_name=name)
            func_class_details = func_class_details.append(temp_func_details)

            ##file details meta
            file_details = file_details.append(
                pd.DataFrame({"file_path": path, "file_name": name, "file_size": file_size
                                 , "file_extension": file_extension
                                 , "total_classes": class_count
                                 , "total_functions": function_count
                                 , "language": code_metrics.language
                                 , "lines_of_comments": [code_metrics.documentation_count]
                                 , "lines_of_code": [code_metrics.code_count]
                                 , "error": file_parse_error
                              }, index=[0]))

    logging.info(f'Repository parsing over for repo: {repo_id}')
    logging.info(f'file_details_size: {file_details.shape[0]}, func_class_details_size: {func_class_details.shape[0]}')

    ##assign repo_id
    func_class_details["repo_id"] = repo_id
    file_details["repo_id"] = repo_id

    ##Mongo
    db = utils.open_con(cfg)

    ##write to the DB
    utils.push_dataframe(db=db, collection="func_class_details", df=func_class_details)
    utils.push_dataframe(db=db, collection="file_details", df=file_details)


##Function to parse a list of files
def parse_files(repo_id, file_list):
    func_class_details = pd.DataFrame()
    file_details = pd.DataFrame()
    
    for file in file_list:
        path, name = os.path.split(file)  ##split it into path and file name
        file_size = Path(file).stat().st_size  ##Bytes
        ##Get file extension
        fn, file_extension = os.path.splitext(file)
        ##get code and comments
        code_metrics = (SourceAnalysis.from_file(file, ""))

        file_parse_error, class_count, function_count, temp_func_details = parse_file(path=path
                                                                                      , file_name=name)
        func_class_details = func_class_details.append(temp_func_details)

        ##file details meta
        file_details = file_details.append(
            pd.DataFrame({"file_path": path, "file_name": name, "file_size": file_size
                             , "file_extension": file_extension
                             , "total_classes": class_count
                             , "total_functions": function_count
                             , "language": code_metrics.language
                             , "lines_of_comments": [code_metrics.documentation_count]
                             , "lines_of_code": [code_metrics.code_count]
                             , "error": file_parse_error
                          }, index=[0]))

    logging.info(f'File parsing over for repo: {repo_id}')

    ##assign repo_id
    func_class_details["repo_id"] = repo_id
    file_details["repo_id"] = repo_id

    ##Mongo
    db = utils.open_con(cfg)

    ##Delete records
    for file in file_list:
        path, name = os.path.split(file)  ##split it into path and file name
        utils.delete_record(db=db, collection_name="func_class_details"
                            , query={"repo_id": repo_id, "file_path": path, "file_name": name})
        utils.delete_record(db=db, collection_name="file_details"
                            , query={"repo_id": repo_id, "file_path": path, "file_name": name})

    ##write to the DB
    utils.push_dataframe(db=db, collection="func_class_details", df=func_class_details)
    utils.push_dataframe(db=db, collection="file_details", df=file_details)


### Test parser

In [2]:
dir_path = "D:\other_use_cases\git_commit_analysis\parsing_sample_files"

In [3]:
func_class_details = pd.DataFrame()
file_details = pd.DataFrame()

for path, subdirs, files in os.walk(dir_path):

    path = path.replace("\\", "/")
    ##ignore if it's the .git folder as it doesn't contains any code for parsing
    if ".git" in path.split("/"): continue

    for name in files:
        file = ''.join([path, '/', name])  ##merging path with file name
        file_size = Path(file).stat().st_size  ##Bytes
        ##Get file extension
        fn, file_extension = os.path.splitext(file)
        ##get code and comments
        code_metrics = (SourceAnalysis.from_file(file, ""))

        file_parse_error, class_count, function_count, temp_func_details = parse_file(path=path
                                                                                      , file_name=name)
        func_class_details = func_class_details.append(temp_func_details)

        ##file details meta
        file_details = file_details.append(
            pd.DataFrame({"file_path": path, "file_name": name, "file_size": file_size
                             , "file_extension": file_extension
                             , "total_classes": class_count
                             , "total_functions": function_count
                             , "language": code_metrics.language
                             , "lines_of_comments": [code_metrics.documentation_count]
                             , "lines_of_code": [code_metrics.code_count]
                             , "error": file_parse_error
                          }, index=[0]))

In [4]:
func_class_details

Unnamed: 0,func_name,parameters,function,function_tokens,return_statement,argument_list,docstring,docstring_summary,start_point,end_point,loc,class_name,file_name,file_path,function_body
0,userName,(),"(function () {\nvar userName = ""Steve"";\n\nfun...","[(, function, (, ), {, var, userName, =, ""Stev...",,,,,"(18, 0)","(27, 5)",9,,javaScript.js,D:/other_use_cases/git_commit_analysis/parsing...,"[\n, (function () {\n, var userName = ""Steve"";..."
1,LeftPanelView,(props),class LeftPanelView extends React.Component {\...,"[class, LeftPanelView, extends, React, ., Comp...",,,,,"(29, 0)","(191, 1)",162,,javaScript.js,D:/other_use_cases/git_commit_analysis/parsing...,"[\n, class LeftPanelView extends React.Compone..."
2,getGamificationObject,"(weeklySummary, monthlySummary)",export const getGamificationObject = (weeklySu...,"[export, const, getGamificationObject, =, (, w...",,,,,"(195, 0)","(203, 1)",8,,javaScript.js,D:/other_use_cases/git_commit_analysis/parsing...,"[\n, export const getGamificationObject = (wee..."
3,analyzeMonthlySummary,"(monthlySummary, weeklySummary, gamificationOb...","const analyzeMonthlySummary = (monthlySummary,...","[const, analyzeMonthlySummary, =, (, monthlySu...",,,,,"(205, 0)","(311, 1)",106,,javaScript.js,D:/other_use_cases/git_commit_analysis/parsing...,"[\n, const analyzeMonthlySummary = (monthlySum..."
4,analyzeWeeklySummary,"(weeklySummary, gamificationObject)","const analyzeWeeklySummary = (weeklySummary, g...","[const, analyzeWeeklySummary, =, (, weeklySumm...",,,,,"(313, 0)","(362, 1)",49,,javaScript.js,D:/other_use_cases/git_commit_analysis/parsing...,"[\n, const analyzeWeeklySummary = (weeklySumma..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,get_definition,"(tree, blob: str)","def get_definition(tree, blob: str) -> List[Di...",,,,,,"(111, 4)","(112, 12)",1,LanguageParser_2,python_file.py,D:/other_use_cases/git_commit_analysis/parsing...,"[ @abstractmethod\n, def get_definition..."
16,get_class_metadata,"(class_node, blob)","def get_class_metadata(class_node, blob):\n ...",,,,,,"(116, 4)","(117, 12)",1,LanguageParser_2,python_file.py,D:/other_use_cases/git_commit_analysis/parsing...,"[ @abstractmethod\n, def get_class_meta..."
17,get_function_metadata,"(function_node, blob)","def get_function_metadata(function_node, blob)...",,,,,,"(121, 4)","(122, 12)",1,LanguageParser_2,python_file.py,D:/other_use_cases/git_commit_analysis/parsing...,"[ @abstractmethod\n, def get_function_m..."
18,get_context,"(tree, blob)","def get_context(tree, blob):\n raise No...",,,,,,"(126, 4)","(127, 33)",1,LanguageParser_2,python_file.py,D:/other_use_cases/git_commit_analysis/parsing...,"[ @abstractmethod\n, def get_context(tr..."


In [5]:
file_details

Unnamed: 0,file_path,file_name,file_size,file_extension,total_classes,total_functions,language,lines_of_comments,lines_of_code,error
0,D:/other_use_cases/git_commit_analysis/parsing...,javaScript.js,28265,.js,0,16,JavaScript+Genshi Text,7,460,
0,D:/other_use_cases/git_commit_analysis/parsing...,java_file.java,46779,.java,1,0,Java,96,832,
0,D:/other_use_cases/git_commit_analysis/parsing...,python_file.py,3952,.py,2,8,Python,6,95,


## testing

In [32]:
cfg.get("build_file_path")

'../build/my-languages.so'

In [37]:
language = "python"
build_file_path = 'build/my-languages.so' #cfg.get("build_file_path")
# 'C:/Users/vikrant.sahu/Documents/Python_Scripts_Projects/GIT Analytics/code_parser/build/my-languages.so'
PARSER = Parser()
PARSER.set_language(Language(build_file_path, language))

OSError: [WinError 126] The specified module could not be found

In [4]:
import requests

In [8]:
x = requests.post("http://localhost:5005/parse")

In [12]:
x.content

b'{"count":1}\n'