This notebook contains the data analysis for RQ3

In [11]:
#Configuration

#Importing libraries
import json
import functools
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import pandas as pd
import random

# Fetch seaborn's current color palette (this reads the palette, it does not change it)
colors = sns.color_palette()

# Keys of the top-level sections that are read from the results JSON
smellyFiles="smellyFiles"
totalSmellsPresent="totalSmellsPresent"
smellsPerFile="smellsPerFile"

# Path of the results file to analyse (relative to the working directory)
currentFile = "merged_v2_run.json"

# Read the file into a JSON dict. The context manager closes the handle even
# if parsing fails, and the explicit encoding avoids depending on the
# platform's default locale encoding.
with open(currentFile, encoding="utf-8") as f:
    data = json.load(f)

# Order both summary lists by descending "times" so the largest counts come first
data[totalSmellsPresent].sort(key=lambda x: x["times"], reverse=True)
data[smellyFiles].sort(key=lambda x: x["times"], reverse=True)

# Names of the files in which rule DL3034 was reported at least once
filenames = [
    entry["filename"]
    for entry in data[smellsPerFile]
    if any(s["rule"] == "DL3034" for s in entry["smells"])
]

# Print a random spot-check sample of up to 10 names. random.sample raises
# ValueError when asked for more items than the population holds, so the
# sample size is capped at the number of matching files.
print(random.sample(filenames, min(10, len(filenames))))


['20803Dockerfile_23f9c4_parfum.Dockerfile', '22950Dockerfile_27cb0e_parfum.Dockerfile', '116701a7a0606d22bde3b409f6c156c27649bf5306cf2e_binnacle.Dockerfile', '30382Dockerfile_34b704_parfum.Dockerfile', '140027Dockerfile_f685fc_parfum.Dockerfile', '6866962952a5a193d4d9482e7439d14d3bf99d020cd35_binnacle.Dockerfile', '119261ab389e0e751fbbf7c91a55655339b12e207d362a_binnacle.Dockerfile', '164639ec37a1726e24bca1c965efe30e452ae06cdd997d_binnacle.Dockerfile', '140879Dockerfile_f803e2_parfum.Dockerfile', '93619Dockerfile_a4596f_parfum.Dockerfile']


In [2]:
# Overall number of detections: the "times" counts summed over every rule entry
total_amount_of_smells_detected = sum(entry["times"] for entry in data[totalSmellsPresent])
print("Total amount of smells: ", total_amount_of_smells_detected)

# Split the per-file entries by whether their "smells" list is non-empty
files_with_smells = [entry for entry in data[smellsPerFile] if len(entry["smells"]) > 0]
print("Files with smells: ", len(files_with_smells))

files_without_smells = [entry for entry in data[smellsPerFile] if len(entry["smells"]) == 0]
print("Files without smells: ", len(files_without_smells))

# Compare the size of the two partitions against the raw number of entries
real_total_files = len(data[smellsPerFile])
total_files = len(files_with_smells) + len(files_without_smells)

print("Total amount of files: ", real_total_files)

# Sanity check: both partitions together must account for every entry
print(
    "These numbers are consistent with the data"
    if total_files == real_total_files
    else "These numbers are inconsistent with the data"
)

Total amount of smells:  2293990
Files with smells:  205199
Files without smells:  118148
Total amount of files:  323347
These numbers are consistent with the data


In [3]:
# Per-rule detection table: rule id, number of detections, and that rule's
# share of all detections.
# NOTE: despite its label, 'Proportion (%)' holds the raw fraction
# times / total_amount_of_smells_detected; it is not multiplied by 100.
smell_proportion_df = pd.DataFrame({
    'Smell': [entry["rule"] for entry in data[totalSmellsPresent]],
    '# Detections': [entry["times"] for entry in data[totalSmellsPresent]],
    'Proportion (%)': [
        entry["times"] / total_amount_of_smells_detected
        for entry in data[totalSmellsPresent]
    ],
})

print("Smell proportions table:", smell_proportion_df, sep="\n")

Smell proportions table:
     Smell  # Detections  Proportion (%)
0   DL3008       1012020        0.441161
1   DL3018        271522        0.118362
2   DL9011        187811        0.081871
3   DL3015        162548        0.070858
4   DL3059        122396        0.053355
5   DL3013         71379        0.031116
6   DL9005         63992        0.027896
7   DL9001         53464        0.023306
8   DL3042         52424        0.022853
9   DL3033         48090        0.020963
10  DL9008         28122        0.012259
11  DL9002         24824        0.010821
12  DL9018         23300        0.010157
13  DL3019         21062        0.009181
14  DL9004         20172        0.008793
15  DL9009         17023        0.007421
16  DL3016         16830        0.007337
17  DL3041         16060        0.007001
18  DL3032         14710        0.006412
19  DL9020         10315        0.004497
20  DL9012         10012        0.004364
21  DL9006          9453        0.004121
22  DL3014          7053        

In [4]:
# Per-rule file table: rule id, number of files in which the rule was
# reported, and that count relative to the total number of files.
# NOTE: despite its label, 'Proportion (%)' holds the raw fraction
# times / real_total_files; it is not multiplied by 100.
file_proportion_df = pd.DataFrame({
    'Smell': [entry["rule"] for entry in data[smellyFiles]],
    '# Files': [entry["times"] for entry in data[smellyFiles]],
    'Proportion (%)': [
        entry["times"] / real_total_files
        for entry in data[smellyFiles]
    ],
})

print("File proportions table:", file_proportion_df, sep="\n")

File proportions table:
     Smell  # Files  Proportion (%)
0   DL3008   102805        0.317940
1   DL9011    88203        0.272781
2   DL3015    74133        0.229268
3   DL3018    45859        0.141826
4   DL3059    36745        0.113640
5   DL9005    35086        0.108509
6   DL3042    29883        0.092418
7   DL3013    22287        0.068926
8   DL9008    21590        0.066770
9   DL9018    16392        0.050695
10  DL3019    16047        0.049628
11  DL9004    11511        0.035600
12  DL3033    10108        0.031261
13  DL3016    10042        0.031056
14  DL9009     9858        0.030487
15  DL9002     8213        0.025400
16  DL9020     7336        0.022688
17  DL9001     7059        0.021831
18  DL3032     7033        0.021751
19  DL9012     6724        0.020795
20  DL9006     6441        0.019920
21  DL3014     4756        0.014709
22  DL9000     4274        0.013218
23  DL3060     3695        0.011427
24  DL3041     3321        0.010271
25  DL9010     1830        0.005660
26  

In [5]:
# Layer optimization proportion in relevant npm files:
# files flagged with DL9000 divided by the number of files with an npm install.
# NOTE(review): 25609 is a hard-coded count; its derivation is not shown in
# the visible cells — confirm it matches the current dataset.
total_files_with_npm_install = 25609

# '# Files' value of the first row whose rule is DL9000
total_detections_npm_in_files = file_proportion_df.loc[
    file_proportion_df['Smell'] == 'DL9000', '# Files'
].iloc[0]

proportion_npm_DL9000 = total_detections_npm_in_files / total_files_with_npm_install
print("Proportion DL9000: ", proportion_npm_DL9000)

Proportion DL9000:  0.1668944511695107


In [6]:
# Layer optimization proportion in relevant python files:
# files flagged with DL9020 divided by the number of files with a pip/pip3 install.
# NOTE(review): both counts are hard-coded and their derivation is not shown in
# the visible cells. If a file can contain both a pip and a pip3 install it is
# counted twice in the denominator — confirm the two counts are disjoint.
files_with_pip_install = 10516
files_with_pip3_install = 3159
total_files_pip_install = files_with_pip_install + files_with_pip3_install

# '# Files' value of the first row whose rule is DL9020, selected by column
# label so the lookup does not depend on the column order of the dataframe
total_detections_pip_in_files = file_proportion_df.loc[
    file_proportion_df['Smell'] == 'DL9020', '# Files'
].iloc[0]

proportion_pip_DL9020 = total_detections_pip_in_files / total_files_pip_install
# Backward-compatible alias: the original variable name said "npm" although
# this ratio is about pip installs; keep it so existing references still work.
proportion_npm_DL9020 = proportion_pip_DL9020
print("Proportion DL9020: ", proportion_pip_DL9020)

Proportion DL9020:  0.5364533820840951
