## DBSCAN

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from joblib import dump, load
from sklearn.model_selection import train_test_split

In [6]:
# Load the spectra table and order the rows chronologically.
df = pd.read_csv(
    '/home/idies/workspace/Temporary/jaclar15/scratch/nicer/spectra2.csv'
).sort_values(by='TIME')

# Rebase TIME so the earliest row sits at 0, then use it as the row index.
df = df.assign(TIME=df['TIME'] - df['TIME'].iloc[0]).set_index('TIME')

# The remaining column labels are read from the CSV header as strings; turn
# them into numbers (smallest integer dtype that fits) so they can be used as
# a numeric axis. Presumably these are PI channel numbers -- TODO confirm.
df.columns = pd.to_numeric(df.columns, downcast='integer')

In [None]:
# Hold out a fixed fraction of the rows for testing; the remainder is used
# for training. The seed pins the split so reruns select the same rows.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
train_df, test_df = train_test_split(
    df,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Neighbourhood radii (DBSCAN ``eps``) to compare; one subplot is drawn per
# value. A wider sweep to try: [0.1, 0.5, 1, 2, 4].
eps_values = [0.5]

# Fit and plot DBSCAN clustering for each eps value.
# squeeze=False makes subplots() return a 2-D array even for a single panel
# (by default a lone Axes is returned and cannot be indexed); taking row 0
# gives a 1-D array of Axes in every case.
fig, axs = plt.subplots(1, len(eps_values), figsize=(25, 5), squeeze=False)
axs = axs[0]
for i, eps in enumerate(eps_values):
    dbscan = DBSCAN(eps=eps, n_jobs=-1)
    # Persist the unfitted estimator. A path is passed instead of an open()
    # handle so joblib opens and closes the file itself (no leaked handle).
    # NOTE: the file name does not include eps, so with several eps values
    # only the last one survives on disk.
    dump(dbscan, 'untrained_dbscan_cluster-job.joblib')
    y_pred = dbscan.fit_predict(train_df)
    # Persist the cluster labels of the training rows (not the fitted model).
    dump(y_pred, 'trained_dbscan_cluster-job.joblib')
    # y_pred holds exactly one label per row of train_df, so the per-cluster
    # statistics must be computed on train_df; grouping df or test_df by it
    # fails because their lengths differ. DBSCAN has no predict() for unseen
    # rows, so the held-out set cannot be labelled here.
    clusters = train_df.groupby(y_pred)
    cluster_std = clusters.agg('std')
    # Transpose so each cluster becomes one curve across the columns, with
    # the per-cluster standard deviation drawn as error bars.
    clusters.agg('mean').T.plot(ax=axs[i], yerr=cluster_std.T)
    axs[i].set_yscale('log')
    axs[i].set_xscale('log')
    axs[i].set_xlabel('log PI')
    axs[i].set_ylabel('log mean counts')
    axs[i].set_title(f'eps: {eps}')

fig.savefig('dbscan-comparison-job.jpeg')