In [3]:
import csv
import os
import glob
import pandas as pd

# -----------------------------------------DATA CLEANING PART------------------------------------
#### This function cleans up the raw data and converts it into a CSV file.
#### To call the function, replace "fin" with the path of the file that needs to be cleaned, and "fout" with the path (including the desired filename) where the cleaned output should be written.


In [4]:

def _is_zero_count(count):
    """Return True when ``count`` is the text of a numeric zero (e.g. "0", " 0")."""
    try:
        return float(count) == 0
    except ValueError:
        # Not numeric at all: do not treat it as zero, so the row is kept.
        return False


def clean_up_csv(fin,fout):
    """Rewrite a raw occurrence file as a CSV with a fixed header.

    Each input row is expected to have two columns: the first is parsed as
    ``<path>:<count>{<date>`` and the second is written out as the version.
    Every ':' in the first column is treated as a field separator, so the
    path part must not itself contain a colon.

    The output has the columns ``Package,Filename,Count,Date,Version`` where
    Package and Filename are the last two '/'-separated components of the path.
    Rows whose count is zero are dropped. Blank input lines are skipped.

    Parameters
    ----------
    fin : str
        Path of the raw file to read.
    fout : str
        Path of the cleaned CSV file to write (overwritten if it exists).

    Returns
    -------
    str
        ``fout``, so the result can be passed straight to the next step.
    """
    with open(fin, "r", newline='') as f:
        # csv.reader yields an empty list for a blank line; skip those.
        rows = [row for row in csv.reader(f) if row]

    with open(fout, "w", newline='') as f:
        # "\n" keeps the same line endings as the earlier print()-based output.
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["Package", "Filename", "Count", "Date", "Version"])
        for row in rows:
            fields = row[0].replace(":", ",").split(',')
            count_and_date = fields[1].split('{')
            path_parts = fields[0].split('/')

            count = count_and_date[0]
            # Compare the numeric value: a substring test for "0" would also
            # throw away legitimate counts such as 10, 20 or 105.
            if _is_zero_count(count):
                continue

            filename = path_parts[-1]
            # A bare filename has no parent directory; fall back to the
            # filename itself rather than failing.
            package = path_parts[-2] if len(path_parts) > 1 else filename
            writer.writerow([package, filename, count, count_and_date[1], row[1]])
    return fout


#### To plot the graph, the axes must cover the full range of values found in every file.
#### To call the function, replace "dire" with the path of the folder in which all of the needed files are located, and replace "fname" with whatever you want to name the new combined file.

In [5]:
#This function is for combining all the dataset used in the graph so the axes contain all of the ranges we need 
def combine_csv(dire,fname):
    """Concatenate every ``*.csv`` file in a folder into one CSV file.

    The combined file is written inside ``dire`` under the name ``fname``
    (UTF-8 with BOM, no index column). A previous combined file with the same
    name is not read back in, so calling the function again does not
    duplicate rows. The process working directory is left untouched.

    Parameters
    ----------
    dire : str
        Folder that holds the CSV files to combine.
    fname : str
        File name of the combined output.

    Returns
    -------
    str
        ``dire + "/" + fname``, the location of the combined file.

    Raises
    ------
    ValueError
        If ``dire`` contains no CSV file other than the output itself.
    """
    target = os.path.join(dire, fname)
    target_key = os.path.normcase(os.path.abspath(target))

    # glob.escape protects folder names that contain glob metacharacters.
    pattern = os.path.join(glob.escape(dire), '*.csv')
    # Sorted so the row order of the output does not depend on directory order.
    sources = sorted(
        path for path in glob.glob(pattern)
        if os.path.normcase(os.path.abspath(path)) != target_key
    )
    if not sources:
        raise ValueError("No CSV files to combine in {!r}".format(dire))

    #combine all files in the list
    combined_csv = pd.concat([pd.read_csv(path) for path in sources])
    #export to csv
    combined_csv.to_csv(target,index=False, encoding='utf-8-sig')
    return dire + "/" + fname
    

# ---------------------------------------GRAPH PLOTTING PART------------------------------------

In [6]:
import pandas as pd
import numpy as np
from math import pi
from datetime import datetime as dt
from bokeh.io import output_notebook,show
from bokeh.models import DatetimeTickFormatter,ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import jitter
from bokeh.layouts import column,gridplot


#### This is Graph 1: Good and Bad Idiom Occurrences in the Packages of a Project
#### To call the function, replace "cbad" with the path of the cleaned bad-idioms output, "cgood" with the path of the cleaned good-idioms output, and "ctotal" with the path of the combined good and bad output.

In [7]:
def plot_graph1(cbad,cgood,ctotal):
    """Show a jittered scatter of idiom occurrences per package over time.

    Non-idiomatic rows are drawn in red and idiomatic rows in green, with the
    'Date' column on the x axis and the 'Package' column on a categorical
    y axis. The figure is rendered inline in the notebook.

    Parameters
    ----------
    cbad : str
        Path of the cleaned CSV holding the non-idiomatic occurrences.
    cgood : str
        Path of the cleaned CSV holding the idiomatic occurrences.
    ctotal : str
        Path of the combined CSV; only its 'Package' column is read, to build
        the list of categories for the y axis.

    Returns
    -------
    None
    """
    def load_occurrences(path):
        # Parse dates so they can be placed on the datetime x axis.
        frame = pd.read_csv(path)
        frame['Date'] = pd.to_datetime(frame['Date'],format='%Y-%m-%d')
        # NOTE(review): Count is scaled by 10, but no glyph below reads the
        # Count column (marker size is the constant 10) - presumably meant
        # for sizing; confirm before removing.
        frame['Count'] = frame['Count']*10
        return frame

    output_notebook()

    bad = load_occurrences(cbad)
    good = load_occurrences(cgood)

    # Categories come from the combined file so both data sets share one axis.
    packs = pd.read_csv(ctotal)['Package'].unique().astype('str')

    # NOTE(review): `plot_width=` and `legend=` are the spellings this notebook
    # was written against; newer Bokeh releases use `width=` and
    # `legend_label=` - confirm the installed version before changing them.
    p=figure(x_axis_type='datetime',y_range = packs,plot_width = 900,title="Non-idiomatic and Idiomatic Code Occurences in Packages of a Project",tools=['wheel_zoom','pan','reset'])

    # The DataFrames are passed directly as `source`, so no separate
    # ColumnDataSource objects are needed.
    for frame, colour, label in ((bad, 'red', 'Non-idiomatic'), (good, 'green', 'Idiomatic')):
        p.circle(x='Date', y=jitter('Package', width=0.6, range=p.y_range),size =10, source=frame,color=colour,alpha=0.2,legend = label)

    p.yaxis.axis_label = "Packages"
    p.xaxis.axis_label = "Time"
    p.xaxis.formatter=DatetimeTickFormatter(days = ["%Y/%m/%d" ])

    p.legend.location = "top_right"
    p.legend.click_policy="hide"
    show(p)




### Before using this graph, please extract files first >> https://github.com/NAIST-SE/InternMUKU19/blob/master/tattiya/How%20to%20version.txt <<

#### This is Graph 2: Good and Bad Idiom Occurrences in Each File of the First and Last Version
#### To call the function, replace "badf" with the path of the first bad version file, "badl" with the path of the last bad version file, "goodf" with the path of the first good version file, "goodl" with the path of the last good version file, and "comb" with the path of the file that combines the previous four files.

In [8]:
def plot_graph2(badf,badl,goodf,goodl,comb):
    """Show side-by-side bar charts of idiom counts per file.

    The left chart uses the first-version files and the right chart the
    last-version files. In each chart the 'Count' column is drawn as a bar
    per 'Filename', red for non-idiomatic and green for idiomatic rows.
    Both charts are rendered inline in the notebook as a one-row grid.

    Parameters
    ----------
    badf, badl : str
        Paths of the non-idiomatic CSVs for the first and last version.
    goodf, goodl : str
        Paths of the idiomatic CSVs for the first and last version.
    comb : str
        Path of the combined CSV; only its 'Filename' column is read, to
        build the shared list of x-axis categories.

    Returns
    -------
    None
    """
    def version_figure(title, bad, good, files):
        # One chart: red bars for `bad`, green bars for `good`.
        # NOTE(review): `plot_width=` and `legend=` are the spellings this
        # notebook was written against; newer Bokeh releases use `width=` and
        # `legend_label=` - confirm the installed version before changing them.
        fig = figure(x_range = files,plot_width = 900,title=title,
                     tools=['wheel_zoom','pan','reset'])

        fig.yaxis.axis_label = "Count"
        fig.xaxis.axis_label = "Filename"
        # Tilt the tick labels on both axes by 60 degrees.
        fig.xaxis.major_label_orientation = np.pi/3
        fig.yaxis.major_label_orientation = np.pi/3

        fig.vbar(x='Filename', top='Count', source = bad,width=0.9,color='red',alpha=0.2,legend ='Non-idiomatic')
        fig.vbar(x='Filename', top='Count', source = good,width=0.9,color='green',alpha=0.2,legend ='Idiomatic')

        fig.y_range.start = 0

        fig.legend.location = "top_right"
        fig.legend.click_policy="hide"
        return fig

    output_notebook()

    #read from file
    badfirst = pd.read_csv(badf)
    badlast = pd.read_csv(badl)
    goodfirst = pd.read_csv(goodf)
    goodlast = pd.read_csv(goodl)

    # Categories come from the combined file so both charts share one x axis.
    file = pd.read_csv(comb)['Filename'].unique()

    left = version_figure("Non-idiomatic and Idiomatic Code Occurences in Each Files of the First Version",
                          badfirst, goodfirst, file)
    right = version_figure("Non-idiomatic and Idiomatic Code Occurences in Each Files of the Last Version",
                           badlast, goodlast, file)

    p = gridplot([[left, right]])
    show(p)

In [9]:
# Illustration of the order in which the pipeline functions are called

# Stand-in functions: each one only echoes its arguments
def clean(fout):
    """Hand back the output path unchanged."""
    return fout
def combine(dir,name):
    """Build the "<dir>/<name>" location of the combined file."""
    return "/".join((dir, name))
def plot(good,bad,total):
    """Print the three paths, one per line."""
    for item in (good, bad, total):
        print(item)
# Step 1: clean each data set
aa = clean(fout='good')
bb = clean(fout='bad')
# Step 2: combine the two cleaned files
cc = combine(dir='toto/lala',name='tata')
# Step 3: plot the graph from the three paths
plot(good = aa,bad =bb,total = cc)

good
bad
toto/lala/tata


In [12]:
# Folder that receives the cleaned files and the combined file
clean_dir = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/clean/cleandemo"

# Clean the raw bad/good outputs
cleanbad = clean_up_csv(fin = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/outputB/demo.csv",
                        fout = clean_dir + "/demobad.csv")
cleangood = clean_up_csv(fin ="C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/outputG/demo.csv",
                         fout = clean_dir + "/demoegood.csv")

# Combine the cleaned files. The result is not called `combine`, so the
# `combine` function from the example cell is not overwritten by a string.
combined_path = combine_csv(dire = clean_dir,fname = "combined_csv.csv")

# Plot graph 1
plot_graph1(cbad = cleanbad ,cgood = cleangood ,ctotal = combined_path)



In [11]:
# The demo passes the same file for all five inputs of graph 2
demo_all = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/versions/versiondemo/demoall.csv"
plot_graph2(badf=demo_all,
            badl=demo_all,
            goodf=demo_all,
            goodl=demo_all,
            comb=demo_all)

In [13]:
# Folder that receives the cleaned files and the combined file
clean_dir = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/clean/cleanipy"

# Clean the raw bad/good outputs
cleanbad = clean_up_csv(fin = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/outputB/ipydataBad.csv",
                        fout = clean_dir + "/ipybad.csv")
cleangood = clean_up_csv(fin ="C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/outputG/ipydataGood.csv",
                         fout = clean_dir + "/ipygood.csv")

# Combine the cleaned files. The result is not called `combine`, so the
# `combine` function from the example cell is not overwritten by a string.
combined_path = combine_csv(dire = clean_dir,fname = "combined_csv.csv")

# Plot graph 1
plot_graph1(cbad = cleanbad ,cgood = cleangood ,ctotal = combined_path)



In [14]:
# Folder that holds the first/last version files
version_dir = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/versions/versionipy"

# Combine the version files
combine2 = combine_csv(dire = version_dir,fname = "combinedcsv.csv")

# Plot graph 2, passing the path returned by combine_csv instead of retyping it
plot_graph2(badf=version_dir + "/ipybadf.csv",
            badl=version_dir + "/ipybadl.csv",
            goodf=version_dir + "/ipygoodf.csv",
            goodl=version_dir + "/ipygoodl.csv",
            comb=combine2)

In [15]:
# Folder that receives the cleaned files and the combined file
clean_dir = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/clean/cleandfhack"

# Clean the raw bad/good outputs
cleanbad = clean_up_csv(fin = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/outputB/dfhack.csv",
                        fout = clean_dir + "/dfhackbad.csv")
cleangood = clean_up_csv(fin ="C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/outputG/dfhack.csv",
                         fout = clean_dir + "/dfhackgood.csv")

# Combine the cleaned files. The result is not called `combine`, so the
# `combine` function from the example cell is not overwritten by a string.
combined_path = combine_csv(dire = clean_dir,fname = "combined_csv.csv")

# Plot graph 1
plot_graph1(cbad = cleanbad ,cgood = cleangood ,ctotal = combined_path)



In [16]:
# Folder that holds the first/last version files
version_dir = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/versions/versiondfhack"

# Combine the version files
combine2 = combine_csv(dire = version_dir,fname = "combinedcsv.csv")

# Plot graph 2, passing the path returned by combine_csv instead of retyping it
plot_graph2(badf=version_dir + "/dfhackbadf.csv",
            badl=version_dir + "/dfhackbadl.csv",
            goodf=version_dir + "/dfhackgoodf.csv",
            goodl=version_dir + "/dfhackgoodl.csv",
            comb=combine2)

In [17]:
# Folder that receives the cleaned files and the combined file
clean_dir = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/clean/cleanTShock"

# Clean the raw bad/good outputs
cleanbad = clean_up_csv(fin = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/outputB/TShock.csv",
                        fout = clean_dir + "/TShockbad.csv")
cleangood = clean_up_csv(fin ="C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/outputG/TShock.csv",
                         fout = clean_dir + "/TShockgood.csv")

# Combine the cleaned files. The result is not called `combine`, so the
# `combine` function from the example cell is not overwritten by a string.
combined_path = combine_csv(dire = clean_dir,fname = "combined_csv.csv")

# Plot graph 1
plot_graph1(cbad = cleanbad ,cgood = cleangood ,ctotal = combined_path)



In [18]:
# Folder that holds the first/last version files
version_dir = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/versions/versiontshock"

# Combine the version files
combine2 = combine_csv(dire = version_dir,fname = "combinedcsv.csv")

# Plot graph 2, passing the path returned by combine_csv instead of retyping it
plot_graph2(badf=version_dir + "/tshockbadf.csv",
            badl=version_dir + "/tshockbadl.csv",
            goodf=version_dir + "/tshockgoodf.csv",
            goodl=version_dir + "/tshockgoodl.csv",
            comb=combine2)

In [21]:
# Folder that receives the cleaned files and the combined file
clean_dir = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/clean/cleanbeaker"

# Clean the raw bad/good outputs
cleanbad = clean_up_csv(fin = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/outputB/beaker.csv",
                        fout = clean_dir + "/beakerbad.csv")
cleangood = clean_up_csv(fin ="C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/outputG/beaker.csv",
                         fout = clean_dir + "/beakergood.csv")

# Combine the cleaned files. The result is not called `combine`, so the
# `combine` function from the example cell is not overwritten by a string.
combined_path = combine_csv(dire = clean_dir,fname = "combined_csv.csv")

# Plot graph 1
plot_graph1(cbad = cleanbad ,cgood = cleangood ,ctotal = combined_path)



In [22]:
# Folder that holds the first/last version files
version_dir = "C:/Users/sutat.LAPTOP-EH92F1DT/Documents/InternMUKU19/tattiya/output/versions/versionbeaker"

# Combine the version files
combine2 = combine_csv(dire = version_dir,fname = "combinedcsv.csv")

# Plot graph 2, passing the path returned by combine_csv instead of retyping it
plot_graph2(badf=version_dir + "/beakerbadf.csv",
            badl=version_dir + "/beakerbadl.csv",
            goodf=version_dir + "/beakergoodf.csv",
            goodl=version_dir + "/beakergoodl.csv",
            comb=combine2)