In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the sample CSV into the raw frame `data`.
data = pd.read_csv(filepath_or_buffer="./data/data-sample.csv")

In [None]:
# Columns carried forward: two identifier columns followed by the numeric
# features. Other cells slice the features positionally with `df.columns[2:]`,
# so the identifiers must stay in the first two positions.
to_keep = [
    'id',
    'id_internal',
    'employees',
    'founding_year',
    'indegree',
    'outdegree',
    'innoprob',
    'sustainability_intensity',
    'ai_intensity',
    '3d_printing_intensity'
]

# Year against which `founding_year` is converted into an age in years.
REFERENCE_YEAR = 2024

# Build `df` as a new frame (select -> assign -> drop) instead of adding and
# dropping columns in place on a selection of `data`: the in-place variant
# triggers pandas' SettingWithCopyWarning and makes the cell unsafe to re-run.
# Resulting column order is unchanged: `to_keep` without `founding_year`,
# with `age_years` appended last.
df = (
    data[to_keep]
    .assign(age_years=lambda frame: REFERENCE_YEAR - frame["founding_year"])
    .drop(columns="founding_year")
)

In [None]:
# Pairwise correlation of every column after the first two, drawn as an
# annotated heatmap.
feature_corr = df[df.columns[2:]].corr()
sns.heatmap(feature_corr, annot=True, cmap="YlGnBu")

In [None]:
# Step 1: these three columns get their own column median for missing values.
cols = ['indegree', 'outdegree', 'innoprob']
cols_median = df[cols].median()
df[cols] = df[cols].fillna(cols_median)

# Step 2: missing intensity values are replaced by 0.
cols_replace_with_zero = [
    'sustainability_intensity',
    'ai_intensity',
    '3d_printing_intensity',
]
df[cols_replace_with_zero] = df[cols_replace_with_zero].fillna(value=0)

# Step 3: any column after the first two that still has gaps falls back to
# its column median.
cols_select = df.columns[2:]
remaining_median = df[cols_select].median()
df[cols_select] = df[cols_select].fillna(remaining_median)

# Cell output: number of missing values left in each column.
df.isna().sum()

In [None]:
# One 100-bin histogram per column after the first two, each on its own
# figure and titled with the column name.
cols_select = df.columns[2:]
for col in cols_select:
    fig, ax = plt.subplots()
    sns.histplot(df[col], ax=ax, bins=100)
    ax.set(title=col)

In [None]:
# Apply log(1 + x) to every column in `cols_select` except `innoprob`.
# A list comprehension keeps the columns in their frame order; the earlier
# set difference produced an arbitrary order from run to run.
to_log_transform = [name for name in cols_select if name != "innoprob"]

# np.log1p evaluates log(1 + x) without the precision loss that forming
# `1 + x` first causes for values close to zero.
# NOTE: this overwrites the columns of `df`, so executing the cell a second
# time applies the transform twice.
df[to_log_transform] = np.log1p(df[to_log_transform])

In [None]:
# Draw the per-column histograms again (100 bins, one figure per column in
# `cols_select`) using the current contents of `df`.
for col in cols_select:
    fig, ax = plt.subplots(nrows=1, ncols=1)
    sns.histplot(data=df[col], bins=100, ax=ax)
    ax.set(title=col)

In [None]:
# Fixed seed so the isolation-forest scores, and any filtering based on them,
# are identical on every run.
RANDOM_STATE = 42

# Feature matrix: every column after the first two. `anomaly_score` is
# excluded explicitly because another cell writes that column onto `df`;
# without the filter, re-running this cell would feed the previous scores
# back in as a feature.
feature_cols = [name for name in df.columns[2:] if name != "anomaly_score"]
X = df[feature_cols].to_numpy()

# IsolationForest is imported by name at the top of the notebook: a bare
# `import sklearn` does not guarantee that `sklearn.ensemble` is loaded.
clf = IsolationForest(
    n_estimators=1000,
    random_state=RANDOM_STATE,
).fit(X)

# decision_function returns one score per row; lower means more abnormal and
# negative values are the ones the forest itself labels as outliers.
df_score = clf.decision_function(X)

# Score distribution, used to choose the cutoff by eye.
fig, ax = plt.subplots(figsize=(16, 9))
sns.histplot(df_score, ax=ax)
ax.set(title="IsolationForest decision_function scores", xlabel="score");


In [None]:

# Rows whose score is not strictly above this threshold are discarded.
cutoff_score = -0.15
df["anomaly_score"] = df_score

# Filter and remove the helper column in one non-inplace chain. `drop`
# returns a new frame, so `select` is independent of `df`; the previous
# `drop(..., inplace=True)` on a boolean-filtered slice triggers pandas'
# SettingWithCopyWarning.
select = (
    df.loc[df["anomaly_score"] > cutoff_score]
    .drop(columns=["anomaly_score"])
)

In [None]:
# Standardise every column of `select` after the first two with
# StandardScaler (fitted on these same rows).
scaled_cols = select.columns[2:]
encoder = StandardScaler()
select_encoded = encoder.fit_transform(select[scaled_cols])

# fit_transform returns a bare array, so the row labels are lost. Re-attach
# `select.index`: `select` is a filtered subset whose labels are no longer
# 0..n-1, and with a fresh RangeIndex any later label-aligned assignment from
# `select` (e.g. copying the id columns) would pair rows with the wrong ids.
df_encoded = pd.DataFrame(
    select_encoded,
    columns=scaled_cols,
    index=select.index,
)

In [None]:
# Attach the identifier columns by POSITION, not by index label.
# `df_encoded` holds one row per row of `select`, in the same order, but the
# two frames do not necessarily share index labels (`select` keeps the labels
# of the rows that survived filtering). A plain
# `df_encoded[...] = select[...]` aligns on labels and would therefore give
# rows the wrong ids, or NaN, whenever any row was filtered out.
assert len(df_encoded) == len(select), "df_encoded and select must have the same rows"
for id_col in ["id", "id_internal"]:
    df_encoded[id_col] = select[id_col].to_numpy()

# Final column order of the exported feature table: identifiers first.
df_encoded = df_encoded[['id', 'id_internal', 'employees', 'indegree', 'outdegree', 'innoprob',
                         'sustainability_intensity', 'ai_intensity',
                         '3d_printing_intensity', 'age_years', ]]


In [None]:
# Create the output directory if needed and write the feature table into it.
# The CSV path is derived from `result_path` so the directory that is created
# and the directory that is written to cannot drift apart.
result_path = Path("results")
result_path.mkdir(exist_ok=True, parents=True)
df_encoded.to_csv(result_path / "features.csv", index=False)