In [9]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import matplotlib.pyplot as plt

In [10]:

def read_swf_file(file_path):
    col_names = ['job_number', 'submit_time', 'wait_time', 'run_time', 'num_procs',
                 'avg_cpu_time', 'used_memory', 'req_procs', 'req_time', 'req_memory',
                 'status', 'user_id', 'group_id', 'exec_id', 'queue_id',
                 'partition_id', 'orig_site', 'last_run_site']
    df = pd.read_csv(file_path, comment=';', header=None, names=col_names, delim_whitespace=True)
    return df

In [11]:
filename = 'LANL-CM5-1994-4.1-cln'

file_path = "../raw_data/" + filename + ".swf"

In [12]:
dataset_features = ['wait_time', 'req_procs', 'req_memory', 'req_time', 
                    'active_jobs_count', 'total_used_procs', 'mean_used_procs', 'total_used_memory', 'mean_used_memory',
                    'queued_jobs_count', 'mean_wait_time', 'max_wait_time', 'min_wait_time', 'mean_req_procs', 'max_req_procs', 'min_req_procs',
                    'mean_req_time', 'max_req_time', 'min_req_time', 'mean_req_memory', 'max_req_memory', 'min_req_memory',
                    'completed_jobs_count', 'mean_wait_time_completed', 'max_wait_time_completed', 'min_wait_time_completed',
                    'mean_run_time_completed', 'max_run_time_completed', 'min_run_time_completed', 'mean_procs_completed',
                    'max_procs_completed', 'min_procs_completed', 'mean_memory_completed', 'max_memory_completed', 'min_memory_completed']

In [13]:
def extract_features(df, recent_interval=3600):
    def get_queued_jobs_features(row, df, recent_interval=3600):
        submit_time = row['submit_time']
        
        # Filter out the active and queued jobs
        all_active_jobs = df[(df['submit_time'] <= submit_time) & (submit_time < df['submit_time'] + df['wait_time'] + df['run_time'])]
        active_jobs = all_active_jobs[all_active_jobs['submit_time'] + all_active_jobs['wait_time'] <= submit_time]
        queued_jobs = all_active_jobs[all_active_jobs['submit_time'] + all_active_jobs['wait_time'] > submit_time]
        
        # Calculate features for active jobs
        row['active_jobs_count'] = len(active_jobs)
        row['total_used_procs'] = active_jobs['num_procs'].sum()
        row['mean_used_procs'] = active_jobs['num_procs'].mean()
        row['total_used_memory'] = active_jobs['used_memory'].sum()
        row['mean_used_memory'] = active_jobs['used_memory'].mean()

        # Calculate features for queued jobs
        row['queued_jobs_count'] = len(queued_jobs)
        row['mean_wait_time'] = queued_jobs['wait_time'].mean()
        row['max_wait_time'] = queued_jobs['wait_time'].max()
        row['min_wait_time'] = queued_jobs['wait_time'].min()
        row['mean_req_procs'] = queued_jobs['req_procs'].mean()
        row['max_req_procs'] = queued_jobs['req_procs'].max()
        row['min_req_procs'] = queued_jobs['req_procs'].min()
        row['mean_req_time'] = queued_jobs['req_time'].mean()
        row['max_req_time'] = queued_jobs['req_time'].max()
        row['min_req_time'] = queued_jobs['req_time'].min()
        row['mean_req_memory'] = queued_jobs['req_memory'].mean()
        row['max_req_memory'] = queued_jobs['req_memory'].max()
        row['min_req_memory'] = queued_jobs['req_memory'].min()

        # Calculate features for recently completed jobs
        recent_jobs = df[(df['submit_time'] + df['wait_time'] + df['run_time'] >= submit_time - recent_interval) &
                         (df['submit_time'] + df['wait_time'] + df['run_time'] <= submit_time)]
        row['completed_jobs_count'] = len(recent_jobs)
        row['mean_wait_time_completed'] = recent_jobs['wait_time'].mean()
        row['max_wait_time_completed'] = recent_jobs['wait_time'].max()
        row['min_wait_time_completed'] = recent_jobs['wait_time'].min()
        row['mean_run_time_completed'] = recent_jobs['run_time'].mean()
        row['max_run_time_completed'] = recent_jobs['run_time'].max()
        row['min_run_time_completed'] = recent_jobs['run_time'].min()
        row['mean_procs_completed'] = recent_jobs['num_procs'].mean()
        row['max_procs_completed'] = recent_jobs['num_procs'].max()
        row['min_procs_completed'] = recent_jobs['num_procs'].min()
        row['mean_memory_completed'] = recent_jobs['used_memory'].mean()
        row['max_memory_completed'] = recent_jobs['used_memory'].max()
        row['min_memory_completed'] = recent_jobs['used_memory'].min()

        if pd.isna(row['req_procs']):
            print('hello1')
            row['req_procs'] = queued_jobs['req_procs'].median()
        
        if pd.isna(row['req_memory']):
            print('hello1')
            row['req_mem'] = queued_jobs['req_memory'].median()
        
        if pd.isna(row['req_time']):
            print('hello3')
            row['req_time'] = queued_jobs['req_time'].median()
        
        return row

    df = df.progress_apply(lambda row: get_queued_jobs_features(row, df), axis=1)
    
    return df

In [14]:
# # Смотри самую популярную очередь
# df['queue_id'].value_counts()

In [15]:
df = read_swf_file(file_path)

# Смотрим только одну самую популярную очередь (one partition in Slurm)
# Но какую очередь проверять для конкретного файла, надо смотреть по данным
df = df[df['queue_id'] == 1] 


df.replace(-1, np.nan, inplace=True)
df_with_features = extract_features(df)
dataset = df_with_features[dataset_features]

df_with_features.to_csv('../prepared_data/full_' + filename + '.csv')
dataset.to_csv('../prepared_data/for_using_' + filename + '.csv')

  0%|          | 112/85329 [00:06<1:23:18, 17.05it/s]


KeyboardInterrupt: 

In [26]:
feature_df = pd.read_csv('../prepared_data/for_using_' + filename + '.csv')[:20000]

In [33]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Assuming feature_df is the DataFrame with all the extracted features
X = feature_df.drop(columns=['wait_time'])  # Replace 'target_column' with the name of the column containing target values
y = feature_df['wait_time']

# Split the data without shuffling
train_size = int(0.7 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# X_train, X_test, y_train, y_test = 

# Train the CatBoost model
model = CatBoostRegressor(iterations=5000, learning_rate=0.005, depth=3, loss_function='MAE')
model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=100, plot=True)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the performance
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean squared error: {mae}")


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 569.9961738	test: 844.0431485	best: 844.0431485 (0)	total: 11ms	remaining: 55.2s
100:	learn: 567.1811916	test: 839.9722987	best: 839.9722987 (100)	total: 994ms	remaining: 48.2s
200:	learn: 563.3283319	test: 834.6865106	best: 834.6865106 (200)	total: 1.82s	remaining: 43.6s
300:	learn: 550.9246142	test: 816.1234684	best: 816.1234684 (300)	total: 2.66s	remaining: 41.5s
400:	learn: 512.2836958	test: 751.1454350	best: 751.1454350 (400)	total: 3.47s	remaining: 39.8s
500:	learn: 481.8500407	test: 699.2724857	best: 699.2724857 (500)	total: 4.3s	remaining: 38.7s
600:	learn: 437.4904170	test: 624.8926707	best: 624.8926707 (600)	total: 5.14s	remaining: 37.6s
700:	learn: 404.1685677	test: 570.5092295	best: 570.5092295 (700)	total: 5.98s	remaining: 36.7s
800:	learn: 388.7886695	test: 543.9973672	best: 543.9973672 (800)	total: 6.8s	remaining: 35.6s
900:	learn: 360.7006416	test: 499.2207554	best: 499.2207554 (900)	total: 7.64s	remaining: 34.8s
1000:	learn: 344.8871656	test: 478.5373591	best

KeyboardInterrupt: 

In [None]:
4159 * 4159

17297281

In [None]:
feature_df[44299:44300]

Unnamed: 0,wait_time,req_procs,req_memory,req_time,active_jobs_count,total_used_procs,mean_used_procs,total_used_memory,mean_used_memory,queued_jobs_count,...,min_wait_time_completed,mean_run_time_completed,max_run_time_completed,min_run_time_completed,mean_procs_completed,max_procs_completed,min_procs_completed,mean_memory_completed,max_memory_completed,min_memory_completed
44299,2.0,64.0,4800.0,2700.0,9.0,1344.0,149.333333,57536.0,6392.888889,11.0,...,0.0,2715.495338,197453.0,0.0,104.291663,1024.0,32.0,4371.287509,29076.0,0.0
