In [3]:
import csv
import os
import glob
import pandas as pd

# BIG Notice!!!
All of the output files used in this notebook were obtained manually from the CCGrep tool, so the code below only illustrates how my project works.

# -----------------------------------------DATA CLEANING PART------------------------------------
#### This function cleans up the raw data and transforms it into a CSV file.
#### To call the function, replace "fin" with the path of the file that needs to be cleaned and "fout" with the path (including the desired filename) of the cleaned output.


In [1]:

def clean_up_csv(fin,fout):
    """Convert a raw result file into a CSV with a fixed header.

    The parsing below assumes every non-blank input row looks like
    ``<path>:<count>{<date>,<version>``.  The last two ``/``-separated
    components of ``<path>`` are written as Package and Filename, followed by
    the count, the date and the version.  Rows whose count is exactly ``0``
    are dropped; every other count (including 10, 20, 105, ...) is kept.

    Parameters
    ----------
    fin : str
        Path of the raw file to read.
    fout : str
        Path of the cleaned CSV to write.  It is opened in ``"w"`` mode, so an
        existing file at that path is overwritten.

    Returns
    -------
    str
        ``fout`` unchanged.

    Raises
    ------
    IndexError
        If a non-blank row does not have the layout described above.
    """
    with open(fin, "r") as f:
        # csv.reader yields [] for blank lines; skip those instead of crashing.
        records = [[raw[0].replace(":", ","), raw[1]]
                   for raw in csv.reader(f) if raw]

    with open(fout, "w", newline='') as f:
        print("Package,Filename,Count,Date,Version", file=f)
        for location, version in records:
            fields = location.split(',')
            count_and_date = fields[1].split('{')
            path_parts = fields[0].split('/')

            count = count_and_date[0]
            # Compare the whole count with "0".  A substring test such as
            # `"0" in count` would also discard 10, 20, 100, ...
            if count.strip() == "0":
                continue

            # For a path without any "/", index len-2 wraps around to the only
            # element, so Package and Filename are then the same string.
            package = path_parts[len(path_parts) - 2]
            filename = path_parts[len(path_parts) - 1]
            print(",".join([package, filename, count, count_and_date[1], version]),
                  file=f)
    return fout


#### To plot the graph, the axes must cover the full range of values found in every file.
#### To call the function, replace "dire" with the path of the folder in which all of the needed files are located, and replace "fname" with whatever you want to name the new combined file.

In [2]:
#This function is for combining all the dataset used in the graph so the axes contain all of the ranges we need 
def combine_csv(dire,fname):
    """Concatenate every ``*.csv`` file found in ``dire`` into one CSV file.

    The combined file is written inside ``dire`` under the name ``fname``.
    The process working directory is left untouched, so relative paths used
    before or after this call keep resolving the same way.  A combined file
    left over from a previous run is not read back in, so re-running the
    function does not duplicate rows.

    Parameters
    ----------
    dire : str
        Folder that contains the CSV files to combine.
    fname : str
        File name of the combined CSV to create inside ``dire``.

    Returns
    -------
    str
        ``dire + "/" + fname``.

    Raises
    ------
    FileNotFoundError
        If ``dire`` is not an existing directory.
    ValueError
        If ``dire`` contains no CSV file to combine.
    """
    if not os.path.isdir(dire):
        raise FileNotFoundError("No such directory: {!r}".format(dire))

    extension = 'csv'
    target = os.path.join(dire, fname)

    def _identity(path):
        # Normalised absolute form, used only to recognise the output file.
        return os.path.normcase(os.path.abspath(path))

    # glob.escape keeps characters such as "[" in the folder name literal.
    pattern = os.path.join(glob.escape(dire), '*.{}'.format(extension))
    # Sorted so the row order of the result does not depend on the file system.
    all_filenames = sorted(
        path for path in glob.glob(pattern)
        if _identity(path) != _identity(target)
    )
    if not all_filenames:
        raise ValueError("No .{} files to combine in {!r}".format(extension, dire))

    #combine all files in the list
    combined_csv = pd.concat([pd.read_csv(path) for path in all_filenames])
    #export to csv
    combined_csv.to_csv(target, index=False, encoding='utf-8-sig')
    return dire + "/" + fname
    

# ---------------------------------------GRAPH PLOTTING PART------------------------------------

In [3]:
import pandas as pd
import numpy as np
from math import pi
from datetime import datetime as dt
from bokeh.io import output_notebook,show
from bokeh.models import DatetimeTickFormatter,ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import jitter
from bokeh.layouts import column,gridplot


#### This is Graph 1: Non-idiomatic and Idiomatic Code Occurrences in Packages of a Project

#### To call the function, replace "cbad" with the path of the cleaned non-idiomatic output, "cgood" with the path of the cleaned idiomatic output, and "ctotal" with the path of the combined good and bad output.

In [4]:
def plot_graph1(cbad,cgood,ctotal):
    """Show non-idiomatic and idiomatic occurrences per package over time.

    Draws one jittered circle per row of each input file on a datetime x-axis
    and a categorical (package) y-axis, then renders the figure in the
    notebook.

    Parameters
    ----------
    cbad : str
        Path of the cleaned non-idiomatic CSV.  The ``Package``, ``Date``
        (``%Y-%m-%d``) and ``Count`` columns are read.
    cgood : str
        Path of the cleaned idiomatic CSV, with the same columns.
    ctotal : str
        Path of the combined CSV.  Only its ``Package`` column is used, to
        list every package on the y-axis.

    Returns
    -------
    None
    """
    output_notebook()

    def load_occurrences(path):
        # Dates must be real datetimes to land on the datetime x-axis.
        frame = pd.read_csv(path)
        frame['Date'] = pd.to_datetime(frame['Date'],format='%Y-%m-%d')
        # NOTE(review): Count is scaled by 10 but no glyph property below
        # reads it (marker size is the constant 10) - confirm whether it was
        # meant to drive the marker size.
        frame['Count'] = frame['Count']*10
        return frame

    bad = load_occurrences(cbad)
    good = load_occurrences(cgood)
    total = pd.read_csv(ctotal)

    # Every package seen in either file becomes one category on the y-axis.
    packs = total['Package'].unique().astype('str')

    p=figure(x_axis_type='datetime',y_range = packs,plot_width = 900,title="Non-idiomatic and Idiomatic Code Occurences in Packages of a Project",tools=['wheel_zoom','pan','reset'])

    # The data frames are passed directly as `source`; no separate
    # ColumnDataSource objects are needed.
    for frame, colour, label in ((bad, 'red', 'Non-idiomatic'),
                                 (good, 'green', 'Idiomatic')):
        # Jitter spreads points that share a package and date so they stay visible.
        p.circle(x='Date', y=jitter('Package', width=0.6, range=p.y_range),size =10, source=frame,color=colour,alpha=0.2,legend = label)

    p.yaxis.axis_label = "Packages"
    p.xaxis.axis_label = "Time"
    p.xaxis.formatter=DatetimeTickFormatter(days = ["%Y/%m/%d" ])

    p.legend.location = "top_right"
    # Clicking a legend entry toggles the matching series.
    p.legend.click_policy="hide"
    show(p)




### Before using this graph, please extract the files first
This is how to manually extract the output files into per-version files:

 If the project contains only good or bad idioms, go to case 1.
 If the project contains both good and bad idioms, go to case 2.
 
 Case 1:
 1. Create a new file.
 2. Insert the heading for the columns  --->> Package,Filename,Count,Date,Version
 3. Open the target output file that needs to be extracted.
 4. Copy every row of the first version shown in the target output file.
 5. Paste the rows into the newly created file.
 6. Repeat steps 1 to 5 for the last version as well.
 
 Case 2:
 1. Create two new files, one for the good output and one for the bad output.
 2. Insert the heading for the columns  --->> Package,Filename,Count,Date,Version in both files.
 3. Open both the good and the bad cleaned output.
 4. Compare the first version of both files. If they are the same version, copy the first version of both the good and the bad output and paste each into its new file.
 5. If they are not the same, take the older of the two versions and copy and paste its rows into the matching new file, depending on whether it is the good or the bad output.
 6. Repeat steps 1 to 5 for the last version, but in step 5 use the newer version instead of the older one.
 

#### This is Graph 2: Non-idiomatic and Idiomatic Code Occurrences in Each File of the First and Last Version
#### To call the function, replace "badf" with the path of the first non-idiomatic version file, "badl" with the path of the last non-idiomatic version file, "goodf" with the path of the first idiomatic version file, "goodl" with the path of the last idiomatic version file, and "comb" with the path of the file that combines the previous four files.

In [5]:
def plot_graph2(badf,badl,goodf,goodl,comb):
    """Show per-file occurrence counts for the first and the last version.

    Builds two bar charts placed side by side: the first compares the
    non-idiomatic and idiomatic counts per file in the first version, the
    second does the same for the last version.  The result is rendered in
    the notebook.

    Parameters
    ----------
    badf, badl : str
        Paths of the non-idiomatic CSVs for the first and the last version.
    goodf, goodl : str
        Paths of the idiomatic CSVs for the first and the last version.
    comb : str
        Path of the CSV combining the four files above.  Only its
        ``Filename`` column is used, to list every file on the x-axis.

    Returns
    -------
    None
    """
    output_notebook()

    #read from file
    badfirst = pd.read_csv(badf)
    badlast = pd.read_csv(badl)
    goodfirst = pd.read_csv(goodf)
    goodlast = pd.read_csv(goodl)
    total = pd.read_csv(comb)

    # Both charts get the same list of filenames as x-axis categories.
    filenames = total['Filename'].unique()

    def build_panel(title, bad, good):
        # One bar chart: red bars for `bad`, green bars for `good`.
        panel = figure(x_range = filenames,plot_width = 900,title=title,
                       tools=['wheel_zoom','pan','reset'])

        panel.yaxis.axis_label = "Count"
        panel.xaxis.axis_label = "Filename"
        # Tilt the tick labels so long file names do not overlap.
        panel.xaxis.major_label_orientation = np.pi/3
        panel.yaxis.major_label_orientation = np.pi/3

        # The data frames are passed directly as `source`; no separate
        # ColumnDataSource objects are needed.
        panel.vbar(x='Filename', top='Count', source = bad,width=0.9,color='red',alpha=0.2,legend ='Non-idiomatic')
        panel.vbar(x='Filename', top='Count', source = good,width=0.9,color='green',alpha=0.2,legend ='Idiomatic')

        # Bars grow from zero.
        panel.y_range.start = 0

        panel.legend.location = "top_right"
        panel.legend.click_policy="hide"
        return panel

    left = build_panel("Non-idiomatic and Idiomatic Code Occurences in Each Files of the First Version",
                       badfirst, goodfirst)
    right = build_panel("Non-idiomatic and Idiomatic Code Occurences in Each Files of the Last Version",
                        badlast, goodlast)

    p = gridplot([[left, right]])
    show(p)

In [6]:
#This is the example of how the functions are called

#Declaring functions
def clean(fout):
    """Demo stub: return the given output path unchanged."""
    cleaned_path = fout
    return cleaned_path
def combine(dir,name):
    """Demo stub: return ``dir`` and ``name`` joined by a single "/"."""
    return "/".join((dir, name))
def plot(good,bad,total):
    """Demo stub: print the three given values, one per line, in the order good, bad, total."""
    for value in (good, bad, total):
        print(value)
# Step 1: "clean" both inputs; the stub hands back the output path it is given.
aa, bb = clean(fout='good'), clean(fout='bad')
# Step 2: combine the two results; the stub returns "<dir>/<name>".
cc = combine(name='tata', dir='toto/lala')
# Step 3: pass the three paths to the plotting stub, which prints them.
plot(total=cc, bad=bb, good=aa)

good
bad
toto/lala/tata


In [7]:
# Clean the raw outputs.  Replace every placeholder string with a real path.
# Use a different path for `fout` than for `fin`: clean_up_csv opens `fout`
# for writing, so reusing the input path overwrites the raw file.
cleanbad = clean_up_csv(fin="your csv file directory",
                        fout="your csv file directory")
cleangood = clean_up_csv(fin="your csv file directory",
                         fout="your csv file directory")

# Combine the cleaned files.  The result is stored as `combined_path` rather
# than `combine`, so it does not replace the `combine` function defined in the
# example cell above.
combined_path = combine_csv(dire="your file director", fname="your file name")

# Plot graph 1 from the two cleaned files and the combined file.
plot_graph1(cbad=cleanbad, cgood=cleangood, ctotal=combined_path)



FileNotFoundError: [Errno 2] No such file or directory: 'your csv file directory'

In [None]:
# Replace each placeholder with the path of the matching per-version CSV file
# before running this cell.
plot_graph2(
    badf="your csv file directory",
    badl="your csv file directory",
    goodf="your csv file directory",
    goodl="your csv file directory",
    comb="your csv file directory",
)