In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import sklearn as sk


In [22]:
# Load the raw workbook. header=3 tells pandas to take the column labels from
# the row at 0-based position 3; the rows above it are skipped.
ISBG = pd.read_excel(
    "./data/ISBSG-whole.xlsx",
    header=3,
)

In [23]:
# Columns carried forward from the raw sheet. "Summary Work Effort" is the
# column that the train/test split cell separates out as the target.
cols_needed = [
    'Max Team Size',
    'COSMIC Read',
    'COSMIC Write',
    'COSMIC Entry',
    'COSMIC Exit',
    'Functional Size',
    'Project Elapsed Time',
    'Development Platform',
    'Primary Programming Language',
    'Summary Work Effort',
]
ISBG_interest = ISBG.loc[:, cols_needed]

# Keep only rows where all four COSMIC counts are present; gaps in the other
# columns are left in place.
# NOTE(review): rows with a missing "Summary Work Effort" are NOT dropped
# here — confirm the target column has no gaps before fitting a model.
cosmic_required = ["COSMIC Read", "COSMIC Write", "COSMIC Exit", "COSMIC Entry"]
df_clean = ISBG_interest.dropna(subset=cosmic_required)

In [27]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictor columns.
y_clean = df_clean["Summary Work Effort"]
X_clean = df_clean.drop(columns=["Summary Work Effort"])

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y_clean,
    test_size=0.2,
    random_state=42,
)

# Column groups are decided from the training frame's dtypes and then applied
# unchanged to the test frame so both halves share the same layout.
num_cols = X_train.select_dtypes(include="number").columns
cat_cols = X_train.select_dtypes(exclude="number").columns

X_num_train, X_num_test = X_train[num_cols], X_test[num_cols]
X_cat_train, X_cat_test = X_train[cat_cols], X_test[cat_cols]

In [28]:
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder

# Numeric branch: fill gaps from the 5 nearest neighbours, apply log1p,
# then standardise each column.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", KNNImputer(n_neighbors=5)),
        ("log1p", FunctionTransformer(func=np.log1p, validate=True)),
        ("scaler", StandardScaler()),
    ]
)

# Non-numeric branch: one-hot encode. handle_unknown="ignore" means a category
# that was not seen during fit is encoded as all zeros instead of raising.
categorical_pipeline = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch. Output columns are laid out in
# transformer order: the "num" block first, then the "cat" block.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)

# Fit every step on the training rows only, then reuse the fitted steps on the
# test rows so no test-set statistics leak into preprocessing.
# The results are plain matrices (NumPy or SciPy sparse), not DataFrames.
X_train_prepared = preprocessor.fit_transform(
    pd.concat(
        [X_num_train, X_cat_train],
        axis=1,
    )
)
X_test_prepared = preprocessor.transform(
    pd.concat(
        [X_num_test, X_cat_test],
        axis=1,
    )
)


In [28]:

tech_features = ["Development Platform", "Primary Programming Language",
                 "Project Elapsed Time", "Max Team Size"]

functional_features = ["COSMIC Read", "COSMIC Write", "COSMIC Entry", "COSMIC Exit"]


def prepared_column_indices(fitted_preprocessor, numeric_columns, categorical_columns, features):
    """Map raw feature names to column positions in the preprocessor's output.

    The fitted ColumnTransformer returns a plain matrix (NumPy array or SciPy
    sparse matrix), so columns can only be selected by position. A numeric
    feature occupies one output column; a categorical feature occupies one
    output column per category learned by the one-hot encoder.

    Parameters
    ----------
    fitted_preprocessor : ColumnTransformer
        Already fitted, with a "num" transformer and a "cat" transformer whose
        pipeline contains a step named "onehot".
    numeric_columns, categorical_columns : sequence of str
        The column lists that were passed to the "num" and "cat" transformers.
    features : sequence of str
        Raw (pre-encoding) feature names to locate.

    Returns
    -------
    list of int
        Output column positions, grouped in the order of ``features``.

    Raises
    ------
    KeyError
        If a requested feature is in neither column list.
    ValueError
        If a transformer's output width does not match what the column lists
        imply (e.g. the imputer dropped an all-missing column), because the
        positional mapping would then be wrong.
    """
    numeric_columns = list(numeric_columns)
    categorical_columns = list(categorical_columns)

    # output_indices_ gives the slice of the output occupied by each transformer.
    num_slice = fitted_preprocessor.output_indices_["num"]
    if num_slice.stop - num_slice.start != len(numeric_columns):
        raise ValueError(
            "numeric block width does not match the numeric column list; "
            "cannot map feature names to output positions"
        )

    cat_positions = {}
    if categorical_columns:
        cat_slice = fitted_preprocessor.output_indices_["cat"]
        encoder = fitted_preprocessor.named_transformers_["cat"].named_steps["onehot"]
        offset = cat_slice.start
        # categories_ is ordered like the input columns; each entry's length is
        # the number of one-hot columns produced for that input column.
        for column, categories in zip(categorical_columns, encoder.categories_):
            width = len(categories)
            cat_positions[column] = list(range(offset, offset + width))
            offset += width
        if offset != cat_slice.stop:
            raise ValueError(
                "categorical block width does not match the encoder categories; "
                "cannot map feature names to output positions"
            )

    indices = []
    for feature in features:
        if feature in cat_positions:
            indices.extend(cat_positions[feature])
        elif feature in numeric_columns:
            indices.append(num_slice.start + numeric_columns.index(feature))
        else:
            raise KeyError(feature)
    return indices


# Select each feature group from the prepared matrices by column position.
# The prepared data are not DataFrames, so indexing them with the raw column
# names (as the previous version did) fails; positional slicing works for both
# dense and sparse outputs.
tech_idx = prepared_column_indices(preprocessor, num_cols, cat_cols, tech_features)
func_idx = prepared_column_indices(preprocessor, num_cols, cat_cols, functional_features)

X_tech_train = X_train_prepared[:, tech_idx]
X_tech_test = X_test_prepared[:, tech_idx]

X_func_train = X_train_prepared[:, func_idx]
X_func_test = X_test_prepared[:, func_idx]
