In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

# Load the dataset from the working directory (the file name suggests missing
# values were already removed upstream -- not verified here).
df = pd.read_csv("full_dataset_NO_missing_values.csv")

# Quick sanity summary: number of rows and the available column names.
print("Eilučių skaičius:", df.shape[0])
print("Stulpeliai:", df.columns.tolist())


In [None]:
def outliers_function(data, column, factor):
    """Return the index labels of IQR-based outliers in one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to inspect.
    column : str
        Name of the column; its values are converted to float first.
    factor : number
        ``3``   -> values strictly outside the 3*IQR fences.
        ``1.5`` -> values outside the 1.5*IQR fences but still within
        (inclusive) the 3*IQR fences.
        Any other value -> nothing is selected.

    Returns
    -------
    pandas.Index
        Index labels of the selected rows (empty for unsupported factors).
    """
    values = pd.to_numeric(data[column]).astype(float)
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    def fences(multiplier):
        # Lower / upper cut-offs located `multiplier` IQRs beyond the quartiles.
        return (first_quartile - multiplier * spread,
                third_quartile + multiplier * spread)

    inner_low, inner_high = fences(1.5)
    outer_low, outer_high = fences(3)

    if factor == 3:
        beyond_outer = (values < outer_low) | (values > outer_high)
        return values[beyond_outer].index

    if factor == 1.5:
        low_band = (values < inner_low) & (values >= outer_low)
        high_band = (values > inner_high) & (values <= outer_high)
        return values[low_band | high_band].index

    # Unsupported factor: same empty result as an empty float Series.
    return pd.Series(dtype=float).index


In [None]:
# Feature columns: everything except the class label and the derived 'outlier'
# column. 'outlier' is created further down in this very cell, so it has to be
# excluded here as well -- otherwise re-running the cell would treat it as a
# feature and change the detected outliers.
feature_columns = [c for c in df.columns if c not in ('label', 'outlier')]

conditional_outliers = set()
extreme_outliers = set()

# Collect, over all features, the row labels flagged as conditional
# (factor 1.5) and extreme (factor 3) outliers.
for col in feature_columns:
    cond_idx = outliers_function(df, col, 1.5)
    extr_idx = outliers_function(df, col, 3)
    conditional_outliers.update(cond_idx)
    extreme_outliers.update(extr_idx)

# 'outlier' column: starts as a copy of the class label, then rows flagged as
# conditional outliers get code 3 and rows flagged as extreme get code 4.
# Extreme is written last, so it wins when a row is flagged as both.
# NOTE(review): codes 3 and 4 assume the class labels never use these values
# -- confirm against the dataset.
df['outlier'] = df['label']
df.loc[df.index.isin(conditional_outliers), 'outlier'] = 3
df.loc[df.index.isin(extreme_outliers), 'outlier'] = 4

print("Išskirtys pagal kategorijas:")
print(df['outlier'].value_counts())


In [None]:
# Feature matrix: every column except the class label and the outlier code.
X = df.drop(columns=['label', 'outlier']).values
# Per-row class labels and outlier-group codes, as plain arrays.
y_class, y_group = (df[name].values for name in ('label', 'outlier'))


In [None]:
def stress_function(X_high, X_low):
    """Normalised stress between two point configurations.

    Computes ``sqrt(sum((D_high - D_low)**2) / sum(D_high**2))`` where
    ``D_high`` and ``D_low`` are the full pairwise-distance matrices of
    ``X_high`` and ``X_low`` respectively. Lower values mean the second
    configuration reproduces the distances of the first more closely.
    """
    dist_original = pairwise_distances(X_high)
    dist_embedded = pairwise_distances(X_low)

    residual = dist_original - dist_embedded
    squared_error = np.sum(residual**2)
    squared_scale = np.sum(dist_original**2)
    return np.sqrt(squared_error / squared_scale)

def continuity(X_high, X_low, k=10):
    """Continuity of an embedding (Venna & Kaski).

    For every point i, each of its k nearest neighbours in the original space
    that is NOT among its k nearest neighbours in the embedding is penalised
    by ``r_low(i, j) - k``, where ``r_low(i, j)`` is the rank of j among all
    other points ordered by distance to i in the embedding (1 = closest).
    The summed penalty is scaled so that 1.0 means every original
    neighbourhood is preserved.

    Parameters
    ----------
    X_high : array of shape (n, d_high)
        Points in the original space.
    X_low : array of shape (n, d_low)
        The same points, in the same order, in the embedding.
    k : int, default 10
        Neighbourhood size; must satisfy ``0 < k < n / 2``.

    Returns
    -------
    float
        Continuity score.

    Raises
    ------
    ValueError
        If ``k`` is outside ``(0, n / 2)``, where the normalisation term
        ``n * k * (2n - 3k - 1)`` is not valid.
    """
    n = X_high.shape[0]
    if not 0 < k < n / 2:
        raise ValueError(
            f"k must satisfy 0 < k < n / 2 (got k={k}, n={n})"
        )

    # k nearest neighbours of each point in the original space. kneighbors()
    # is called without a query; per the scikit-learn docs each point is then
    # left out of its own neighbour list.
    neigh_high = (
        NearestNeighbors(n_neighbors=k)
        .fit(X_high)
        .kneighbors(return_distance=False)
    )

    # Full distance ranking in the embedding. The k-nearest list alone is not
    # enough: the penalty needs the actual rank of points that fell OUTSIDE
    # the low-dimensional neighbourhood.
    D_low = pairwise_distances(X_low)
    np.fill_diagonal(D_low, -np.inf)  # force each point to rank 0 w.r.t. itself
    order_low = np.argsort(D_low, axis=1, kind="stable")

    rows = np.arange(n)[:, None]
    ranks_low = np.empty((n, n), dtype=np.int64)
    ranks_low[rows, order_low] = np.arange(n)  # ranks_low[i, j] = rank of j around i

    # rank <= k means j is still a low-dimensional neighbour -> no penalty.
    excess = ranks_low[rows, neigh_high] - k
    cont_sum = excess[excess > 0].sum()

    denom = n * k * (2 * n - 3 * k - 1)
    return 1 - (2 / denom) * cont_sum


In [None]:
# t-SNE hyper-parameter values explored by the grid search below.
param_grid = dict(
    perplexity=[5, 20, 30, 50],
    learning_rate=[50, 100, 200, 400],
    early_exaggeration=[4, 12, 24],
)


In [None]:
# One record per hyper-parameter combination: the settings plus both
# quality metrics of the resulting embedding.
results = []

for perplexity in param_grid["perplexity"]:
    for lr in param_grid["learning_rate"]:
        for exaggeration in param_grid["early_exaggeration"]:
            print(f"\nTestuojama: perplexity={perplexity}, lr={lr}, exaggeration={exaggeration}")

            # The iteration count is left at the library default (1000) rather
            # than passed explicitly: scikit-learn renamed the keyword from
            # `n_iter` to `max_iter` in 1.5 and later removed `n_iter`, so
            # naming either one breaks on some installed version.
            tsne = TSNE(
                n_components=2,
                perplexity=perplexity,
                learning_rate=lr,
                early_exaggeration=exaggeration,
                metric="euclidean",
                init="pca",
                random_state=67
            )

            X_embedded = tsne.fit_transform(X)

            # Quality of this embedding with respect to the original features.
            stress_val = stress_function(X, X_embedded)
            continuity_val = continuity(X, X_embedded, k=10)

            results.append({
                "perplexity": perplexity,
                "learning_rate": lr,
                "early_exaggeration": exaggeration,
                "stress": stress_val,
                "continuity": continuity_val
            })

            # One scatter plot per combination, coloured by class label.
            plt.figure(figsize=(5, 5))
            plt.scatter(
                X_embedded[:, 0], X_embedded[:, 1],
                c=y_class, cmap="viridis", s=10, alpha=0.7
            )
            plt.title(
                f"perplexity={perplexity}, lr={lr}, exaggeration={exaggeration}\n"
                f"continuity={continuity_val:.3f}, stress={stress_val:.3f}"
            )
            plt.xlabel("t-SNE dim 1")
            plt.ylabel("t-SNE dim 2")
            plt.grid(True, linestyle='--', alpha=0.3)
            plt.tight_layout()
            plt.show()


In [None]:

# Final embedding with the chosen hyper-parameters.
# The iteration count is left at the library default (1000) rather than passed
# explicitly: scikit-learn renamed the keyword from `n_iter` to `max_iter` in
# 1.5 and later removed `n_iter`, so naming either one breaks on some
# installed version.
tsne_best = TSNE(
    n_components=2,
    perplexity=30,
    learning_rate=200,
    early_exaggeration=12,
    metric="euclidean",
    init="pca",
    random_state=67
)

X_tsne = tsne_best.fit_transform(X)


# Colour per (class label, outlier kind): one hue family per class, with a
# separate shade for normal, conditional and extreme points.
color_map = {
    (class_id, kind): shade
    for class_id, shades in (
        (0, ('lightsalmon', 'tomato', 'red')),
        (1, ('cornflowerblue', 'royalblue', 'navy')),
        (2, ('lime', 'limegreen', 'forestgreen')),
    )
    for kind, shade in zip(('normal', 'conditional', 'extreme'), shades)
}


# Marker shape per outlier kind.
markers = dict(normal='o', conditional='s', extreme='x')


plt.figure(figsize=(10, 10))

for i in [0, 1, 2]:
    in_class = y_class == i

    # (kind, code expected in y_group, legend suffix, alpha), drawn in this
    # order. Normal points are those whose group code still equals their
    # class label; 3 marks conditional and 4 marks extreme outliers.
    layers = [
        ('normal', i, "normalūs", 0.4),
        ('conditional', 3, "sąlyginės", 0.7),
        ('extreme', 4, "ekstremalios", 1.0),
    ]

    for kind, group_code, legend_suffix, opacity in layers:
        mask = in_class & (y_group == group_code)
        plt.scatter(
            X_tsne[mask, 0],
            X_tsne[mask, 1],
            color=color_map[(i, kind)],
            marker=markers[kind],
            label=f"{i} – {legend_suffix}",
            alpha=opacity,
        )


# Hyper-parameters in the title are read from the estimator itself so the
# caption cannot drift from the settings actually used for the embedding.
plt.title(
    f"t-SNE vizualizacija: klasės ir išskirtys\n"
    f"(perplexity={tsne_best.perplexity}, lr={tsne_best.learning_rate}, "
    f"exaggeration={tsne_best.early_exaggeration})\n"
)
plt.xlabel("t-SNE 1 dimensija")
plt.ylabel("t-SNE 2 dimensija")
plt.legend(loc="best", fontsize=9)
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()
