This notebook contains the data analysis for RQ3

In [1]:
#Configuration

#Importing libraries
import json
import functools
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import pandas as pd

# Current seaborn color palette (not referenced by the cells shown in this section)
colors = sns.color_palette()

# Keys of the top-level sections read from the results JSON
smellyFiles="smellyFiles"
totalSmellsPresent="totalSmellsPresent"
smellsPerFile="smellsPerFile"

# Results file to analyse, resolved relative to the kernel's working directory
currentFile = "sf_v1_run.json"

# Read and parse the results file into a dict. The context manager closes the
# handle as soon as parsing finishes instead of leaving it open for the rest of
# the kernel session; the explicit encoding keeps decoding independent of the
# platform's default locale.
with open(currentFile, encoding="utf-8") as f:
    data = json.load(f)

# Sort both per-rule lists in place, highest "times" first, so the tables
# built in later cells come out in descending order.
data[totalSmellsPresent].sort(key=lambda x: x["times"], reverse=True)
data[smellyFiles].sort(key=lambda x: x["times"], reverse=True)

In [2]:
# Total number of detections: the "times" counters summed over every rule
total_amount_of_smells_detected = sum(entry["times"] for entry in data[totalSmellsPresent])
print("Total amount of smells: ", total_amount_of_smells_detected)

# Split the per-file records into those with at least one smell ...
files_with_smells = [entry for entry in data[smellsPerFile] if len(entry["smells"]) > 0]
print("Files with smells: ", len(files_with_smells))

# ... and those whose "smells" list is empty
files_without_smells = [entry for entry in data[smellsPerFile] if len(entry["smells"]) == 0]
print("Files without smells: ", len(files_without_smells))

# Size of the two partitions combined vs. the number of records actually present
real_total_files = len(data[smellsPerFile])
total_files = len(files_with_smells) + len(files_without_smells)

print("Total amount of files: ", real_total_files)

# Sanity check: the two partitions together must account for every record
print(
    "These numbers are consistent with the data"
    if total_files == real_total_files
    else "These numbers are inconsistent with the data"
)

Total amount of smells:  57560
Files with smells:  7858
Files without smells:  17023
Total amount of files:  24881
These numbers are consistent with the data


In [3]:
# Per-rule detection table: rule id, number of detections, and that rule's
# share of all detections.
# NOTE: the share is times / total, i.e. a fraction -- it is not multiplied
# by 100 even though the column header says "(%)".
smell_proportion_df = pd.DataFrame({
    'Smell': [entry["rule"] for entry in data[totalSmellsPresent]],
    '# Detections': [entry["times"] for entry in data[totalSmellsPresent]],
    'Proportion (%)': [
        entry["times"] / total_amount_of_smells_detected
        for entry in data[totalSmellsPresent]
    ],
})

print("Smell proportions table:")
print(smell_proportion_df)

Smell proportions table:
     Smell  # Detections  Proportion (%)
0   DL3008         15078        0.261953
1   DL3018          5206        0.090445
2   DL9011          5082        0.088290
3   DL3009          5082        0.088290
4   DL3015          4993        0.086744
5   DL3059          4919        0.085459
6   DL9005          2367        0.041122
7   DL3042          2141        0.037196
8   DL9008          2018        0.035059
9   DL3013          1536        0.026685
10  DL9012           896        0.015566
11  DL9001           864        0.015010
12  DL3019           862        0.014976
13  DL3033           843        0.014646
14  DL3016           727        0.012630
15  DL9004           686        0.011918
16  DL3032           649        0.011275
17  DL9020           638        0.011084
18  DL9009           617        0.010719
19  DL9002           589        0.010233
20  DL9006           437        0.007592
21  DL3014           345        0.005994
22  DL9000           315        

In [4]:
# Per-rule file table: rule id, number of files the rule was detected in, and
# that count relative to the total number of analysed files.
# NOTE: the ratio is times / real_total_files, i.e. a fraction -- it is not
# multiplied by 100 even though the column header says "(%)".
file_proportion_df = pd.DataFrame({
    'Smell': [entry["rule"] for entry in data[smellyFiles]],
    '# Files': [entry["times"] for entry in data[smellyFiles]],
    'Proportion (%)': [
        entry["times"] / real_total_files
        for entry in data[smellyFiles]
    ],
})

print("File proportions table:")
print(file_proportion_df)

File proportions table:
     Smell  # Files  Proportion (%)
0   DL3008     3128        0.125718
1   DL9011     3003        0.120695
2   DL3015     2910        0.116957
3   DL3059     1848        0.074274
4   DL9005     1610        0.064708
5   DL9008     1588        0.063824
6   DL3042     1470        0.059081
7   DL3018     1128        0.045336
8   DL3013      744        0.029902
9   DL3019      621        0.024959
10  DL3016      460        0.018488
11  DL9020      448        0.018006
12  DL9004      437        0.017564
13  DL9009      398        0.015996
14  DL9000      302        0.012138
15  DL3032      297        0.011937
16  DL9012      288        0.011575
17  DL9001      284        0.011414
18  DL3014      284        0.011414
19  DL9006      282        0.011334
20  DL3033      271        0.010892
21  DL9002      253        0.010168
22  DL3060      174        0.006993
23  DL9018      136        0.005466
24  DL9014       50        0.002010
25  DL9003       41        0.001648
26  

In [5]:
# Share of the files containing an npm install that were flagged with DL9000.
# The denominator is hard-coded; how it was obtained is not shown in this
# notebook section -- TODO confirm its source.
total_files_with_npm_install = 1673

# '# Files' value of the first row whose rule id is DL9000
total_detections_npm_in_files = file_proportion_df.loc[
    file_proportion_df['Smell'].eq('DL9000'), '# Files'
].iloc[0]

proportion_npm_DL9000 = total_detections_npm_in_files / total_files_with_npm_install
print("Proportion DL9000: ", proportion_npm_DL9000)

Proportion DL9000:  0.18051404662283324


In [6]:
# Share of the files containing a pip/pip3 install that were flagged with DL9020.
# Both denominators are hard-coded; how they were obtained is not shown in this
# notebook section -- TODO confirm their source.
files_with_pip_install = 876
files_with_pip3_install = 210
total_files_pip_install = files_with_pip_install + files_with_pip3_install

# Select the '# Files' column by label rather than by position so the lookup
# keeps working if the table's column order ever changes.
total_detections_pip_in_files = file_proportion_df.loc[
    file_proportion_df['Smell'] == 'DL9020', '# Files'
].iloc[0]

proportion_pip_DL9020 = total_detections_pip_in_files / total_files_pip_install
# The original name said "npm" although this is the pip ratio; it is kept as an
# alias so any other cell that still refers to it continues to work.
proportion_npm_DL9020 = proportion_pip_DL9020
print("Proportion DL9020: ", proportion_pip_DL9020)

Proportion DL9020:  0.4125230202578269
