# Exploring the Data
1. Load the data
2. Clean
3. Graph
4. Tables with statistics

In [2]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

In [3]:

# Load the COCOMO data set, using the second CSV column (position 1) as the
# row index, and discard the 'Red.br' column in the same expression so the
# cell never mutates a frame in place.
df = (
    pd.read_csv('COCOMO-revised.csv', index_col=1)
    .drop(columns='Red.br')
)

In [4]:
# Normalise the column labels to snake_case: spaces become underscores and
# everything is lower-cased, so columns can be reached as attributes
# (e.g. df.act_effort).
df.columns = df.columns.str.replace(' ', '_').str.lower()


In [5]:
# Preview the last rows to confirm the load and the renamed columns.
df.tail()

Unnamed: 0_level_0,e,pemi,size_kloc,act_effort,categorical_output,shorter_convetion_for_output
Proj.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
92,1.1363,3.547812,65.0,1772.5,large,L
93,1.1363,3.547812,70.0,1645.9,large,L
94,1.1363,4.60686,50.0,1924.5,large,L
97,1.1675,3.997406,7.25,648.0,large,L
98,1.1363,3.547812,233.0,8211.0,large,L


In [6]:
# Render the numeric features of every project as a Plotly table.
# The header labels and the cell values are both derived from this single
# list, so a label can never end up above the wrong column of data.
numeric_cols = ["e", "pemi", "size_kloc", "act_effort"]

fig = go.Figure(
    data=[
        go.Table(
            header=dict(values=numeric_cols,
                        fill_color='paleturquoise',
                        align='left'),
            # One Series per displayed column, in the same order as the header.
            cells=dict(values=[df[col] for col in numeric_cols],
                       fill_color='lavender',
                       align='left'))
    ]
)
fig.show()

In [7]:
# Descriptive statistics of the actual effort, shown as a two-column table.
# describe() yields count, mean, std, min, 25%, 50%, 75% and max in one pass,
# which matches the row labels below one-to-one.
effort_stats = df.act_effort.describe()
stat_labels = ["Count", "Mean", "Std", "Min", "25%", "50%", "75%", "Max"]
# The count is shown as an integer; every other statistic is rounded to three
# decimals so the whole column is formatted consistently.
stat_values = [int(effort_stats["count"])] + [
    round(float(effort_stats[key]), 3)
    for key in ("mean", "std", "min", "25%", "50%", "75%", "max")
]

fig = go.Figure(
    data=[
        go.Table(
            header=dict(values=["Descriptive", "ACT_EFFORT"],
                        fill_color='#1f77b4',
                        align='left',
                        font=dict(size=16, weight=700, color="white"),
                        height=30),
            cells=dict(values=[stat_labels, stat_values],
                       # Alternating row colours for a striped look.
                       fill_color=[["#e6f2ff", "#f8f8f8"] * 6],
                       align='left',
                       font=dict(size=14, color="black"),
                       height=30))
    ]
)
fig.update_layout(width=500, height=600)
fig.show()

In [8]:
# Split the projects into three subsets according to the value of the
# categorical_output column ("small", "medium" or "large").
df_small, df_medium, df_large = (
    df[df["categorical_output"] == size_label]
    for size_label in ("small", "medium", "large")
)

In [9]:
# Build the seven columns of the per-subset statistics table. Every column has
# one entry per (subset, feature) pair, ordered subset-major: the four features
# of the small subset, then those of the medium subset, then the large subset.

# Subsets in display order, paired with the label shown in the "Dataset" column.
subset_frames = [
    ("Small subset", df_small),
    ("Medium subset", df_medium),
    ("Large subset", df_large),
]
# DataFrame column name -> label shown in the "Features" column.
feature_labels = {
    "e": "E",
    "pemi": "PEMI",
    "size_kloc": "Size (KLOC)",
    "act_effort": "ACT_EFFORT",
}

dataset_col = [name for name, _frame in subset_frames for _col in feature_labels]
features_col = [label for _name, _frame in subset_frames for label in feature_labels.values()]
no_projects_col = [frame.shape[0] for _name, frame in subset_frames for _col in feature_labels]


def _stat_column(stat_name):
    """Return one statistic for every (subset, feature) pair, rounded to 3 decimals.

    stat_name is the name of a pandas Series reduction method
    ("min", "max", "mean" or "std").
    """
    return [
        round(getattr(frame[col], stat_name)(), 3)
        for _name, frame in subset_frames
        for col in feature_labels
    ]


min_col = _stat_column("min")
max_col = _stat_column("max")
mean_col = _stat_column("mean")
std_col = _stat_column("std")


In [10]:
# Sanity check: show the subset label generated for each table row.
print(dataset_col)

['Small subset', 'Small subset', 'Small subset', 'Small subset', 'Medium subset', 'Medium subset', 'Medium subset', 'Medium subset', 'Large subset', 'Large subset', 'Large subset', 'Large subset']


In [11]:
# Per-subset statistics table: one row per (subset, feature) pair.

# Header labels, rendered bold through inline HTML tags.
summary_header = [
    f"<b>{title}</b>"
    for title in (
        "Dataset",
        "Features",
        "No. of projects",
        "Min.",
        "Max.",
        "Mean",
        "Std.",
    )
]

# Cell data, one list per table column, in the same order as the header.
summary_cells = [
    dataset_col,
    features_col,
    no_projects_col,
    min_col,
    max_col,
    mean_col,
    std_col,
]

summary_table = go.Table(
    columnwidth=[80, 80, 80, 50, 50, 50, 50],
    header={
        "values": summary_header,
        "fill_color": "#1f77b4",  # header background colour
        "align": "left",
        "font": {"color": "white", "size": 14},
        "height": 35,
        "line_color": "white",
    },
    cells={
        "values": summary_cells,
        # Nested list of twelve alternating colours -> striped rows.
        "fill_color": [["#e6f2ff", "#f8f8f8"] * 6],
        "align": "left",
        "font": {"color": "black", "size": 12},
        "height": 25,
        "line_color": "white",
    },
)

fig = go.Figure(data=[summary_table])

# Size chosen so the exported picture fits nicely in a Word document.
fig.update_layout(
    width=800,
    height=400,
    margin={"l": 20, "r": 20, "t": 20, "b": 20},
)

# Export a high-resolution PNG (scale=3) before showing the figure inline.
fig.write_image("high_res_table.png", width=800, height=400, scale=3)
fig.show()

In [12]:
# Columns of the feature-description table (one entry per feature).
# NOTE: this rebinds features_col, which the subset-statistics cell defined
# with twelve entries; re-run that cell before re-rendering the subset table.
features_col = [
    "E",
    "PEMI",
    "Size (KLOC)",
    "ACT_EFFORT",
]

# Every feature has the same original measurement type.
original_type_col = ["Ratio"] * 4

# Free-text description of each feature, in the same order as features_col.
description_col = [
    "Uses variables such as size, constants, scale factors, and effort multipliers to estimate the amount of work required to build a project (Rankovic et al., 2021).",
    "Takes into consideration the many aspects of the project that affect every stage of development (Rankovic et al., 2021). Project complexity, team experience with the technology, and the tools being used are a few examples.",
    "Evaluates software size per thousand of lines of code (Rankovic et al., 2021).",
    "Measures how much effort is actually required to finish a software development project (Rankovic et al., 2021).",
]

# One illustrative value per feature.
example_col = [
    1.0724,
    1.361,
    352,
    1200,
]

In [22]:
# Feature-description table: name, original type, description and an example
# value for each of the four features.
description_header = [
    "<b>Name of feature</b>",
    "<b>Original type</b>",
    "<b>Description</b>",
    "<b>Example value</b>",
]

description_table = go.Table(
    columnwidth=[120, 110, 200, 120],
    header={
        "values": description_header,
        "fill_color": '#1f77b4',
        "align": 'left',
        "font": {"color": "white", "size": 20},
        "height": 30,
    },
    cells={
        "values": [features_col, original_type_col, description_col, example_col],
        # Nested list of alternating colours -> striped rows.
        "fill_color": [["#e6f2ff", "#f8f8f8"] * 6],
        "align": 'left',
        "font": {"weight": 500, "size": 18, "color": "black"},
        "height": 40,
    },
)

fig = go.Figure(data=[description_table])
fig.update_layout(
    width=1300,
    height=800,
    margin={"l": 20, "r": 20, "t": 20, "b": 20},
)
# NOTE(review): the exported image is 600 px high while the on-screen layout is
# 800 px high - confirm the difference is intentional.
fig.write_image("high_res_description_table.png", width=1300, height=600, scale=2)
fig.show()