Connect to Google Drive

In [None]:
# Colab-only module used to attach Google Drive to the runtime's filesystem.
from google.colab import drive
# Mount Drive at /content/gdrive; the extraction cell reads the archive from "/content/gdrive/My Drive".
drive.mount("/content/gdrive")

**DOWNLOAD DATA**

From the link download "Repositories with more than 50 stars part 2" (56.8 GB)


https://jetbrains.team/p/ccrm/repositories/fl-dataset/files/docs/README.md#download 

We only use Java files:

- You only need to keep "dataset-open-50-more-2/dataset/v3/languages/Java", which is around 13 GB
- Extract the Java files onto the Colab runtime's local disk (reading them directly from Google Drive is slower)

In [None]:
!unzip "/content/gdrive/My Drive/Java.zip" -d "javadata"
dataDir = 'javadata/Java/.java'  

 ***IMPORT PACKAGES***

In [None]:
!pip3 install tree_sitter
!git clone https://github.com/tree-sitter/tree-sitter-java

In [4]:
from tree_sitter import Language, Parser
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

Build the tree-sitter Java grammar into a shared library that the Python bindings can load

In [6]:
# Compile the grammar sources from the tree-sitter-java checkout into a single
# shared library stored under the build directory.
Language.build_library('build/my-languages.so', ['tree-sitter-java'])

# Load the Java grammar from that library and attach it to a parser instance.
JAVA_LANGUAGE = Language('build/my-languages.so', 'java')
parser = Parser()
parser.set_language(JAVA_LANGUAGE)

# **Traversing AST tree of java files**


In [7]:
def traverse_tree(tree):
  """Yield every node reachable from ``tree.walk()`` in depth-first pre-order.

  The walk is iterative: it moves a single cursor through the tree with
  ``goto_first_child`` / ``goto_next_sibling`` / ``goto_parent`` and yields
  ``cursor.node`` at each position, so no recursion is involved.
  """
  walker = tree.walk()

  finished = False
  while not finished:
    yield walker.node

    # Prefer going down, then sideways.
    if walker.goto_first_child() or walker.goto_next_sibling():
      continue

    # Dead end: climb until an ancestor has a sibling that was not visited yet,
    # or until the cursor cannot climb any further (it is back at the root).
    while True:
      if not walker.goto_parent():
        finished = True
      if walker.goto_next_sibling() or finished:
        break

In [None]:
content = []  # one (number_of_lines, number_of_comment) tuple per parsed file
c = 0         # number of files parsed successfully
skipped = 0   # number of files that could not be read or decoded

for root, dirs, files in tqdm(os.walk(dataDir), position=0):
  for file in files:
    file_name = os.path.join(root, file)

    if not os.path.isfile(file_name):
      continue

    try:
      # Explicit UTF-8 matches the encoding handed to the parser below; the
      # context manager closes the handle even when decoding fails part-way.
      with open(file_name, mode='r', encoding='utf8') as source_file:
        lines = source_file.readlines()
    except (OSError, UnicodeDecodeError):
      # Unreadable or non-UTF-8 file: leave it out of the statistics but keep
      # count, instead of hiding every possible error behind a bare except.
      skipped += 1
      continue

    # The file is read once; the full text is rebuilt from the lines already in memory.
    number_of_lines = len(lines)
    ast = parser.parse(bytes(''.join(lines), "utf8"))
    number_of_comment = sum(1 for node in traverse_tree(ast) if node.type == "comment")

    content.append((number_of_lines, number_of_comment))
    c += 1

# One summary line replaces the per-file counter print; tqdm already shows progress.
print("Parsed " + str(c) + " files, skipped " + str(skipped) + " unreadable files")

# **Remove outliers**

Out of around 1.6 million files, only a few thousand have more than 1K comments or more than 5K lines, so we can ignore them as follows:


In [None]:
# Overall number of Java files that were collected.
file_count = len(content)
print("Total number of Java Files: " + str(file_count))

# Outlier removal: keep a file only when it has at most 5000 lines
# and fewer than 1000 comments.
content_filtered = list(
    filter(lambda entry: entry[0] <= 5000 and entry[1] < 1000, content)
)

# **GRAPHS PLOTTING**


Graph of how many files fall into each range of comment counts


In [None]:
# Comment count of every file that survived the outlier filter.
comment_count_per_file_list = [entry[1] for entry in content_filtered]

# Eleven edges from 0 to 1000 give ten bins, each 100 comments wide.
bins = list(range(0, 1001, 100))
plt.hist(comment_count_per_file_list, bins, histtype='bar', rwidth=0.7)
plt.title("Count of files given number of comments")
plt.xlabel('Number of comments')
plt.ylabel('Number of files')
plt.savefig("a.png")

### (1e6 = 10^6) ###

In [None]:

# Count straight from content_filtered rather than from comment_count_per_file_list:
# the scatter-plot cell rebinds that name to only the files that have comments, so
# re-running this cell afterwards would wrongly report zero files without comments.
nocom = sum(1 for entry in content_filtered if entry[1] == 0)
com = len(content_filtered) - nocom

bins = [nocom, com]
plt.bar(["number of files without any comment", "number of files with at least 1 comments"], bins, width=0.4)
plt.ylabel('Number of files')
plt.savefig('b.png')

In [None]:
def percent(part, whole):
  """Return ``part`` expressed as a percentage of ``whole``."""
  return float(part) * 100 / float(whole)

percents_arr = []  # comments per line, in percent, for every non-empty file
size_files = []    # line count of every file in content_filtered
for line in content_filtered:
  number_comment = line[1]
  size = line[0]

  size_files.append(size)
  # A file with zero lines has no meaningful ratio. Skip it explicitly instead of
  # relying on a bare except to swallow the ZeroDivisionError.
  if size > 0:
    percents_arr.append(percent(number_comment, size))

plt.xlabel('% of comments in file')
plt.ylabel('Count')
plt.title("")
# The bins stop at 100, so a file with more comments than lines lies outside the plotted range.
bins = [0,10,20,30,40,50,60,70,80,90,100]
plt.hist(percents_arr, bins, histtype='bar', rwidth=0.7)
plt.savefig('c.png')

In [None]:
from pyparsing import ParseExpression
import plotly.express as px
#! pip install  -U kaleido
# Only files with at least one comment are plotted.
# NOTE: this rebinds comment_count_per_file_list and size_files, which earlier
# cells also assign; from here on they describe commented files only.
# The former try/except is gone: indexing these tuples cannot fail, so the bare
# except could only have hidden genuine programming errors.
files_with_comments = [entry for entry in content_filtered if entry[1] > 0]
size_files = [entry[0] for entry in files_with_comments]
comment_count_per_file_list = [entry[1] for entry in files_with_comments]

fig = px.scatter(x=size_files,y=comment_count_per_file_list, width=1000, height=800, labels={'x':'number of line in file', 'y':'number of comment in the file'})
fig.show()

## **THIS APPROACH IS ONLY AN APPROXIMATION FOR COUNTING COMMENTS IN A GIVEN CHUNK**

- Create code chunks based on empty lines

- If a line contains a comment marker (`//` or `/*`), assume it is a comment:
  This may count some non-comment lines as comments (for example URLs such as http://), but it is a fast approximation


In [None]:
# Accumulators for the chunk-based pass: the per-file result tuples and the
# running count of processed files.
result, c = [], 0
def averageLen(lst):
    """Return the mean ``len()`` of the items in ``lst``, or 0 when ``lst`` is empty."""
    sizes = list(map(len, lst))
    if not sizes:
        return 0
    return float(sum(sizes)) / len(sizes)

def _chunk_stats(lines):
  """Split ``lines`` into blank-line-separated chunks and count the commented ones.

  A chunk is a maximal run of non-blank lines. A chunk counts as commented when
  any of its lines contains "//" or "/*" (a deliberately cheap approximation).

  Returns a ``(chunks_list, comment_count)`` tuple, where ``chunks_list`` is a
  list of chunks and each chunk is the list of its lines.
  """
  chunks_list = []
  comment_count = 0
  chunk = []
  has_comment = False
  # The trailing sentinel blank line closes the last chunk of a file that does
  # not end with an empty line; previously that chunk was silently dropped.
  for line in list(lines) + ['']:
    if line.split():
      chunk.append(line)
      if "//" in line or '/*' in line:
        has_comment = True
    elif chunk:
      # A blank line after content closes the chunk. Runs of consecutive blank
      # lines no longer add empty chunks that would distort the statistics.
      chunks_list.append(chunk)
      if has_comment:
        comment_count += 1
      chunk = []
      has_comment = False
  return chunks_list, comment_count


skipped = 0  # number of files that could not be read or decoded

for root, dirs, files in tqdm(os.walk(dataDir), position=0):
  for file in files:
    file_name = os.path.join(root, file)

    if not os.path.isfile(file_name):
      continue

    try:
      # The context manager closes every handle; the original never closed them.
      with open(file_name, mode='r', encoding='utf8') as source_file:
        lines = source_file.readlines()
    except (OSError, UnicodeDecodeError):
      # Unreadable or non-UTF-8 file: skip it, but keep count rather than
      # hiding every possible error behind a bare except.
      skipped += 1
      continue

    chunks_list, comment_count = _chunk_stats(lines)
    c += 1
    result.append((file_name , len(chunks_list), comment_count, averageLen(chunks_list)))

# One summary line replaces the per-file counter print; tqdm already shows progress.
print("Processed " + str(c) + " files, skipped " + str(skipped) + " unreadable files")

In [None]:
from pyparsing import ParseExpression
import plotly.express as px

# Drop the file name: keep (chunk count, commented-chunk count) for every file.
result_without_filename = [(entry[1], entry[2]) for entry in result]

# Split the pairs into the two scatter-plot axes.
x = [pair[0] for pair in result_without_filename]
y = [pair[1] for pair in result_without_filename]

fig = px.scatter(
    x=x,
    y=y,
    width=1000,
    height=800,
    labels={'x':'number of chunks in file', 'y':'number of commented chunks'},
)
fig.show()

In [None]:
def percent(part, whole):
  """Return ``part`` expressed as a percentage of ``whole``."""
  return float(part) * 100 / float(whole)

# Chunk count of every file, and the share of commented chunks for every file
# that has at least one chunk.
size_files = [pair[0] for pair in result_without_filename]
percents_arr = [
    percent(pair[1], pair[0])
    for pair in result_without_filename
    if pair[0] > 0
]

# Eleven edges from 0 to 100 give ten bins, each ten percentage points wide.
bins = list(range(0, 101, 10))
plt.hist(percents_arr, bins, histtype='bar', rwidth=0.7)
plt.title("")
plt.xlabel('Percentage of commented chunks in files')
plt.ylabel('Count')
plt.show()


In [None]:
# Fourth field of every result tuple: the average chunk length of that file.
lis = [entry[3] for entry in result]

# Eleven edges from 0 to 100 give ten bins, each ten units wide.
bins = list(range(0, 101, 10))
plt.hist(lis, bins, histtype='bar', rwidth=0.7)
plt.title("")
plt.xlabel('Average length of chunks in files')
plt.ylabel('Count')
plt.show()