In [2]:
!pip install pyspark
!pip install findspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=72e06ca0a0b7280fea53f5a6452d503e24e006fdcea44d61a20004acdd1db1e4
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [3]:
import findspark
import numpy as np
from pyspark.sql.types import *
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from math import sqrt
findspark.init()
import random
import math
import time
import matplotlib.pyplot as plt
import requests
import zipfile
import io
import os
import time
import pandas as pd
from io import StringIO
from IPython.display import clear_output

In [4]:
# Configurar Spark
conf = SparkConf().setAppName("FinOps").setMaster("local[*]")
sc = SparkContext(conf=conf)

# Crear SparkSession
spark = SparkSession.builder.appName("FinOps").getOrCreate()

# Funciones Aux

In [5]:
#funcion auxiliar
def convertir_float(x):
    array = []
    for y in x:
        try:
            array.append(float(y))
        except ValueError:
            array.append(y)
    if array:
        array[-1] = int(array[-1])
    return array


In [6]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def get_y_hat(features, w):
    # Assuming features is an array of features, w is a matrix of shape (1, 11)
    return np.dot(features, w.T)  # Ensure w.T if w is (1, 11) and features is compatible

def grad_cost(label, features, y_hat):
    # Calculate gradient of cost function w.r.t. weights
    return (y_hat - label) * features


def get_derivatives(row, weights):
    features = row[:-1]
    y = row[-1]
    y_hat = get_y_hat(features, weights)
    dJ_dw = (y_hat - y) * np.append(features, 1)
    return dJ_dw

def update_ws(weights, dw, learning_rate):
    return weights - learning_rate * dw


def fcost(y, y_hat):
    #print ("cost:",y,y_hat)
    # compute loss/cost for one element "y_hat" and one label "y"
    epsilon=0.00000001
    if y == 1:
        return -np.log(y_hat if y_hat > 0. else epsilon)
    else:
        return -np.log (1-y_hat if 1-y_hat >0. else epsilon)

# Funciones

In [7]:
def RDD_df(rdd,schema):
    """
    Muestra las primeras filas del DataFrame.

    :param df: El DataFrame a visualizar
    """

    # Convertir el RDD en DataFrame
    df = spark.createDataFrame(rdd, schema=schema)

    return df

In [8]:
def readFile(file_path):
    """
    Lee un archivo CSV y devuelve un DataFrame de PySpark.

    :param file_path: Ruta al archivo CSV
    :return: DataFrame de PySpark
    """
    # Leer el archivo CSV como un RDD de texto
    # Leer el archivo CSV como un RDD de texto
    rdd = sc.textFile(file_path)

    # Extraer el encabezado (primera fila)
    header = rdd.first()

    # Filtrar para excluir el encabezado y conservar solo los datos
    data_rdd = rdd.filter(lambda line: line != header).map(lambda x: x.split(",")).map(convertir_float).map(lambda x: (x[0:11],x[-1]))

    #rdd = sc.textFile(file_path)
    return data_rdd

In [9]:
def normalize(rdd):
    # Convert RDD to DataFrame with the correct structure
    df = rdd.map(lambda x: Row(features=Vectors.dense(x[0]), label=x[1])).toDF(["features", "label"])

    # Use MinMaxScaler for normalization
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(df)
    scaledData = scalerModel.transform(df)

    # Convert the DataFrame back to an RDD
    normalized_rdd = scaledData.select("scaledFeatures", "label").rdd.map(lambda row: (row.scaledFeatures.toArray().tolist(), row.label))

    return normalized_rdd

In [10]:
def train(RDD_Xy, iterations, learning_rate, lambda_reg):
    # Initialize weight vector
    w = np.random.randn(1, 11)

    for i in range(iterations):
                # Compute y_hat using map operation
        rdd_y_hat = RDD_Xy.map(lambda x: (x[0], x[1], get_y_hat(x[0], w)))  # x[0] is features, x[1] is label

        # Calculate cost and regularization
        reg_term = lambda_reg * np.sum(w[:-1] ** 2)  # Regularization term

        rdd_fcost = rdd_y_hat.map(lambda x: fcost(x[1], x[2]))  # Cost function value for each sample
        J = rdd_fcost.reduce(lambda x, y: x + y)  # Sum up cost function across RDD
        J += reg_term  # Add regularization term to total cost
        J = J[0]

        # Update weights
        grad = rdd_y_hat.map(lambda x: grad_cost(x[1], x[0], x[2]))  # x[1] is label, x[0] is features, x[2] is y_hat
        grad_sum = grad.reduce(lambda x, y: x + y)  # Sum up gradients across RDD

        w = w - learning_rate * grad_sum - reg_term  # Update weight vector
        print(f"Iteration {i}  Cost:  {J}")

    return w

In [11]:
def predict(w,x):
    threshold = 0.5
    y_hat = get_y_hat(x,w)
    return 1 if y_hat > threshold else 0

def accuracy(RDD_Xy,w):
    correct_answer= 0

    def count_correct(Xy):
        x,y = Xy
        return 1 if predict(w,x) == y else 0
    correct_answers = RDD_Xy.map(count_correct).reduce(lambda x,y: x+y)
    return correct_answers*100/RDD_Xy.count()

# Ejecucion lectura datos


In [12]:
# Medir el tiempo de inicio
start_time = time.time()

In [None]:
col_names = [
    'pkSeqID', 'stime', 'flgs', 'proto', 'saddr', 'sport', 'daddr', 'dport',
    'pkts', 'bytes', 'state', 'ltime', 'seq', 'dur', 'mean', 'stddev',
    'smac', 'dmac', 'sum', 'min', 'max', 'soui', 'doui', 'sco', 'dco',
    'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'srate', 'drate',
    'attack', 'category', 'subcategory'
]

# Definir los tipos de datos correspondientes a cada columna
col_types = {
    'pkSeqID': int, 'stime': float, 'flgs': str, 'proto': str,
    'saddr': str, 'sport': float, 'daddr': str, 'dport': float, 'pkts': int, 'bytes': int,
    'state': str,'ltime': float, 'seq': int, 'dur': float, 'mean': float, 'stddev': float, 'smac': str,
    'dmac': str, 'sum': float, 'min': float, 'max': float, 'soui': float, 'doui': float,
    'sco': float, 'dco': str, 'spkts': str, 'dpkts': str, 'sbytes': str, 'dbytes': str,
    'rate': str, 'srate': str, 'drate': str, 'attack': str, 'category': str, 'subcategory': str
}

# Definir las URLs de los archivos CSV
url_base = 'https://raw.githubusercontent.com/Meusz/FinOps/main/data/data_'
urls = [url_base + str(i) + '.csv' for i in range(1, 19)]

# Inicializar un DataFrame vacío
df_combinado = pd.DataFrame(columns=col_names)
# Convertir tipos de columnas según el diccionario col_types
df_combinado = df_combinado.astype(col_types)


# Descargar y combinar los archivos CSV en un DataFrame

for url in urls:
    clear_output()
    print(f"Ultimo URL leido:{url}")

    df = pd.read_csv(url,names=col_names,header=0)
    # Convertir 'sport' y 'dport' a tipo numérico, ignorando los errores
    df['sport'] = pd.to_numeric(df['sport'], errors='coerce')
    df['dport'] = pd.to_numeric(df['dport'], errors='coerce')

    # Llenar NaN en las columnas con un valor predeterminado, por ejemplo 0
    df['pkts'].fillna(0, inplace=True)
    df['bytes'].fillna(0, inplace=True)
    df['seq'].fillna(0, inplace=True)

    # Convertir las columnas a tipo int después de manejar NaN
    df['pkts'] = df['pkts'].astype(int)
    df['bytes'] = df['bytes'].astype(int)
    df['seq'] = df['seq'].astype(int)

    df=df.astype(col_types)
    # Combinar los DataFrames
    df_combinado = pd.concat([df_combinado, df])
    del df


# Mostrar el DataFrame combinado
clear_output()

df_combinado.drop(df_combinado[df_combinado['category'] == 'nan'].index, inplace=True)

#["flgs", "proto", "pkts", "bytes", "dur", "mean", "stddev", "sum", "min", "max", "rate", "category"]

df_combinado.loc[df_combinado["proto"] == "tcp", "proto"] = 0
df_combinado.loc[df_combinado["proto"] == "udp", "proto"] = 1
df_combinado.loc[df_combinado["proto"] == "icmp", "proto"] = 2
df_combinado.loc[df_combinado["proto"] == "arp", "proto"] = 3
df_combinado.loc[df_combinado["proto"] == "ipv6-icmp", "proto"] = 4
df_combinado.loc[df_combinado["proto"] == "igmp", "proto"] = 4
df_combinado.loc[df_combinado["proto"] == "rarp", "proto"] = 4



df_combinado.loc[df_combinado["category"] == "Reconnaissance", "category"] = 0
df_combinado.loc[df_combinado["category"] == "DoS", "category"] = 1
df_combinado.loc[df_combinado["category"] == "Normal", "category"] = 2
df_combinado.loc[df_combinado["category"] == "Theft", "category"] = 3
df_combinado.loc[df_combinado["category"] == "Reconnai", "category"] = 4

df_combinado['category'] = df_combinado['category'].astype(int)
df_combinado['proto'] = df_combinado['proto'].astype(int)




df_combinado = df_combinado.dropna(subset=["flgs", "proto", "pkts", "bytes", "dur", "mean", "stddev", "sum", "min", "max", "rate", "category"])
#df_combinado.drop(df_combinado[df_combinado['daddr'] == 'nan'].index, inplace=True)

df_combinado

Ultimo URL leido:https://raw.githubusercontent.com/Meusz/FinOps/main/data/data_14.csv


In [None]:
df_combinado['proto'] = df_combinado['proto'].astype(int)

In [None]:
# Se eliminan las columnas innecesarias del DataFrame
df_combinado=df_combinado.drop(columns = ['pkSeqID', 'stime', 'flgs', 'ltime', 'seq', 'smac',  'dmac', 'soui', 'doui', 'sco', 'dco', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'srate', 'drate', 'attack', 'subcategory'])

# Selecciona las columnas de tipo 'object' en el DataFrame  y devuelve sus nombres
print(df_combinado.select_dtypes(include=['object']).columns)

# Calcula la cantidad de valores NaN por columna en el DataFrame
print(df_combinado.isna().sum())


# Elimina las filas donde la columna 'sport' tiene valores NaN en el DataFrame

df_combinado = df_combinado.dropna(subset=['sport','proto'])

# Elimina las filas duplicadas
df_combinado.drop_duplicates(inplace = True)

# Elimina las columnas especificadas del DataFrame
df_combinado = df_combinado.drop(columns = ['saddr', 'daddr',  'state', 'sport', 'dport'])


# Guardar el DataFrame df_combinado en un archivo CSV
df_combinado.to_csv('botnet.csv', index=False)
print(df_combinado.head())
del df_combinado

In [None]:
# Extraer el archivo CSV del ZIP y cargarlo en un DataFrame
path = 'botnet.csv'
nIter = 5
learningRate = 0.1
lambda_reg = 0.1

In [None]:
# Medir el tiempo de finalización
end_time = time.time()
# Calcular y mostrar el tiempo de ejecución
execution_time = end_time - start_time
print(f'Tiempo de ejecución: {execution_time:.2f} segundos, {execution_time/60:.2f}  minutos')

# Ejecucion entrenamiento

In [None]:
# Medir el tiempo de inicio
start_time = time.time()

In [None]:
# Convertir el DataFrame de Spark a un RDD
data = readFile(path)
print(data.take(3))

In [None]:
# Normalize the numeric RDD
data_normalized =normalize(data)
print(data_normalized.take(3))

In [None]:
# Entrenar el modelo con RDDs
ws = train(data_normalized, nIter, learningRate, lambda_reg)

Iteration 0  Cost:  339395.72780475754
Iteration 1  Cost:  18242505.18245464
Iteration 2  Cost:  -10858065.28211775
Iteration 3  Cost:  29135653.70775554
Iteration 4  Cost:  -37223945.96115874


In [None]:
# Calcular la precisión
acc = accuracy(data_normalized, ws)

print("Accuracy:", acc)

Accuracy: 9.472300880006914


In [None]:
# Medir el tiempo de finalización
end_time = time.time()
# Calcular y mostrar el tiempo de ejecución
execution_time = end_time - start_time
print(f'Tiempo de ejecución: {execution_time:.2f} segundos, {execution_time/60:.2f}  minutos')

# Cross-Validation

## Funcion

In [None]:
def transforma(RDD_Xy):
    import random
    #RDD_Xy_con_indice = RDD_Xy.zipWithIndex()
    #RDD_Xy_con_clave = RDD_Xy_con_indice.map(lambda x: (x[0],(x[1]%num_block_cv)))
    """Si zipWithIndex no esta permitido, quiza podemos utilizar randint(0, num_bloques-1) para
    generar claves, aun que la proporcion de tamaño de tran y test en este caso no es determinado"""
    RDD_Xy_con_indice = RDD_Xy.map(lambda x: (x,random.randint(0, num_block_cv-1)))
    RDD_Xy_con_clave = RDD_Xy_con_indice.map(lambda x: (x[0],(x[1]%num_block_cv)))

    return RDD_Xy_con_clave

In [None]:
def get_block_data(RDD_Xy, clave):
    train = RDD_Xy.flatMap(lambda x: [x] if x[1] != clave else []).map(lambda x: x[0])
    test = RDD_Xy.flatMap(lambda x: [x] if x[1] == clave else []).map(lambda x: x[0])

    return train, test

## Ejecucion

In [None]:
# Medir el tiempo de inicio
start_time = time.time()

In [None]:
#definir cuantas bloques quiere dividir para cross-validation
num_block_cv = 5
avg_acc = 0

In [None]:
#Ya tenemos data_normalized
data_cv = transforma(data_normalized)

In [None]:
for i in range(num_block_cv):
    print(f"Cross-Validation con clave:{i}")
    train_data,test = get_block_data(data_cv,i)
    ws = train(train_data,nIter,learningRate,lambda_reg)
    acc = accuracy(data_normalized,ws)
    avg_acc += avg_acc
    print("acc:" , acc)
print("averge acc:" , avg_acc)

Cross-Validation con clave:0
Iteration 0  Cost:  152712.30279158414
Iteration 1  Cost:  13309409.136283476
Iteration 2  Cost:  -7875536.918648434
Iteration 3  Cost:  23433739.896141876
Iteration 4  Cost:  -28217543.288577415
acc: 9.472300880006914
Cross-Validation con clave:1
Iteration 0  Cost:  2487044.061113406
Iteration 1  Cost:  15187045.153861243
Iteration 2  Cost:  -9215297.435354795
Iteration 3  Cost:  23319653.780725002
Iteration 4  Cost:  -29529428.13384171
acc: 9.472300880006914
Cross-Validation con clave:2
Iteration 0  Cost:  2507412.7840857347
Iteration 1  Cost:  13339787.433115056
Iteration 2  Cost:  -6021336.06666621
Iteration 3  Cost:  23335591.976482373
Iteration 4  Cost:  -29217334.086343374
acc: 9.472300880006914
Cross-Validation con clave:3
Iteration 0  Cost:  2473424.5946685043
Iteration 1  Cost:  13942505.406318696
Iteration 2  Cost:  -8984413.951583195
Iteration 3  Cost:  23291020.56964372
Iteration 4  Cost:  -29585499.603707425
acc: 9.472300880006914
Cross-Valida

In [None]:
# Medir el tiempo de finalización
end_time = time.time()
# Calcular y mostrar el tiempo de ejecución
execution_time = end_time - start_time
print(f'Tiempo de ejecución: {execution_time:.2f} segundos, {execution_time/60:.2f}  minutos')

Tiempo de ejecución: 6614.83 segundos, 110.25  minutos


# Graficos para el informe

## funcion

In [None]:
def train_visualizacion(RDD_Xy, iterations, learning_rate, lambda_reg):
    # Initialize weight vector
    m = RDD_Xy.count()  # Number of samples in RDD
    num_columns = len(RDD_Xy.take(1)[0][0])  # Number of columns in features + 1 (for bias)
    w = np.random.randn(1, 11)
    costos_entrenamiento = []  # List to store training costs (objective function + regularization)
    tiempos = []              # List to store training times
    for i in range(iterations):
        start_time = time.time()  # Start time of iteration
        # Compute y_hat using map operation
        rdd_y_hat = RDD_Xy.map(lambda x: (x[0], x[1], get_y_hat(x[0], w)))  # x[0] is features, x[1] is label

        # Calculate cost and regularization
        reg_term = lambda_reg * np.sum(w[:-1] ** 2)  # Regularization term

        rdd_fcost = rdd_y_hat.map(lambda x: fcost(x[1], x[2]))  # Cost function value for each sample
        J = rdd_fcost.reduce(lambda x, y: x + y)  # Sum up cost function across RDD
        J += reg_term  # Add regularization term to total cost
        J = J[0]
        # Append current iteration's cost to list
        costos_entrenamiento.append(J)

        # Update weights
        grad = rdd_y_hat.map(lambda x: grad_cost(x[1], x[0], x[2]))  # x[1] is label, x[0] is features, x[2] is y_hat
        grad_sum = grad.reduce(lambda x, y: x + y)  # Sum up gradients across RDD

        w = w - learning_rate * grad_sum - reg_term  # Update weight vector
        print(f"Iteration {i}  Cost:  {J}")


        end_time = time.time()  # End time of iteration
        tiempo = end_time - start_time  # Duration of iteration
        tiempos.append(tiempo)  # Append duration to list
    return w, tiempos, costos_entrenamiento

## Ejecucion

In [None]:
# Medir el tiempo de inicio
start_time = time.time()

In [None]:
#Test

ws,tiempos, costos_entrenamiento = train_visualizacion(data_normalized,nIter, learningRate,lambda_reg)
acc = accuracy(data_normalized,ws)

print("acc:" , acc)

In [None]:

plt.figure(figsize=(10, 5))

plt.subplot(1, 2, 1)
plt.plot(costos_entrenamiento, label='Costo por Iteración')
plt.xlabel('Iteración')
plt.ylabel('Costo de Entrenamiento')
plt.title('Costo de Entrenamiento por Iteración')
plt.legend()


plt.subplot(1, 2, 2)
plt.plot(tiempos, label='Tiempo por Iteración')
plt.xlabel('Iteración')
plt.ylabel('Tiempo (s)')
plt.title('Tiempo de Ejecución por Iteración')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:

avg_acc = 0
precisiones = []
num_registros_train = []
num_registros_test = []

for i in range(num_block_cv):
    print(f"Cross-Validation con clave:{i}")
    train_data, test = get_block_data(data_cv, i)
    num_registros_train.append(train_data.count())
    num_registros_test.append(test.count())
    ws = train(train_data, nIter, learningRate, lambda_reg)
    acc = accuracy(test, ws)
    precisiones.append(acc)
    avg_acc += acc
    print("Número de registros en train:", num_registros_train[-1])
    print("Número de registros en test:", num_registros_test[-1])
    print("acc:", acc)

avg_acc /= num_block_cv
print("Average accuracy:", avg_acc)


In [None]:
num_registros_train

In [None]:
num_registros_test

In [None]:
import matplotlib.pyplot as plt
import numpy as np

fig, ax1 = plt.subplots(figsize=(12, 6))

color = 'tab:red'
ax1.set_xlabel('Bloque de CV')
ax1.set_ylabel('Precisión', color=color)
ax1.plot(precisiones, label='Precisión por bloque de CV', color=color, marker='o')
ax1.tick_params(axis='y', labelcolor=color)
ax1.set_xticks(np.arange(num_block_cv))

ax2 = ax1.twinx()
color = 'tab:blue'
ax2.set_ylabel('Número de registros', color=color)
ax2.plot(num_registros_train, label='Registros en Train', color='blue', marker='x', linestyle='--')
ax2.plot(num_registros_test, label='Registros en Test', color='cyan', marker='x', linestyle='--')
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()
fig.legend(loc="upper right", bbox_to_anchor=(1,1), bbox_transform=ax1.transAxes)
plt.title('Precisión y Número de Registros por Bloque de CV')
plt.show()


In [None]:
num_workers_list = [1, 2, 3, 4,5,6,7,8,9]
tiempos_por_worker = []
speedup_por_worker = []

for num_workers in num_workers_list:
    costos_entrenamiento = []
    sc.stop()
    sc = SparkContext(f"local[{num_workers}]", f"Test_{num_workers}_workers")

    init_time = time.time()
    #data = readFile(path)
    #data = normalize(data)
    #ws,tiempos, costos_entrenamiento = train_visualizacion(data_normalized,nIter, learningRate,lambda_reg)
    #acc = accuracy(data_normalized, ws)
    fin_time = time.time()
    plt.xlabel("Iteración")
    plt.ylabel("Costo")
    plt.title(f"Costo de entrenamiento con {num_workers}")
    plt.legend(num_workers_list, loc="best")
    plt.plot(costos_entrenamiento, color=f"C{num_workers}")
    plt.grid(True)
    plt.show()
    tiempo_total = fin_time - init_time
    tiempos_por_worker.append(tiempo_total)

    print(f"Workers: {num_workers}, Acc: {acc}, Tiempo: {tiempo_total}")

    sc.stop()

tiempo_con_1_worker = tiempos_por_worker[0]
speedup_por_worker = [tiempo_con_1_worker / tiempo for tiempo in tiempos_por_worker]

plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.plot(num_workers_list, tiempos_por_worker, marker='o', linestyle='-', color='b')
plt.xlabel('Número de Workers')
plt.ylabel('Tiempo de ejecución (s)')
plt.title('Curva de Rendimiento')

plt.subplot(1, 2, 2)
plt.plot(num_workers_list, speedup_por_worker, marker='o', linestyle='-', color='r')
plt.xlabel('Número de Workers')
plt.ylabel('Speedup')
plt.title('Curva de Speedup')

plt.tight_layout()
plt.show()

In [None]:
# Medir el tiempo de finalización
end_time = time.time()
# Calcular y mostrar el tiempo de ejecución
execution_time = end_time - start_time
print(f'Tiempo de ejecución: {execution_time:.2f} segundos, {execution_time/60:.2f}  minutos')

# Analizar componentes

In [None]:
import psutil
import subprocess

# Obtener información del procesador
cpu_info = os.popen("cat /proc/cpuinfo | grep 'model name' | uniq").read().strip()
print(f'Modelo de procesador: {cpu_info}')

# Número de procesadores físicos
num_processors = psutil.cpu_count(logical=False)
print(f'Número de procesadores físicos: {num_processors}')

# Número de vCores
num_vcores = psutil.cpu_count(logical=True)
print(f'Número de vCores (procesadores lógicos): {num_vcores}')

# Capacidad de memoria
mem = psutil.virtual_memory()
total_memory_gb = mem.total / (1024 ** 3)  # Convertir bytes a GB
available_memory_gb = mem.available / (1024 ** 3)  # Convertir bytes a GB
print(f'Capacidad total de memoria RAM: {total_memory_gb:.2f} GB')
print(f'Memoria RAM disponible: {available_memory_gb:.2f} GB')

# Información del disco duro
disk_usage = psutil.disk_usage('/')
total_disk_gb = disk_usage.total / (1024 ** 3)  # Convertir bytes a GB
used_disk_gb = disk_usage.used / (1024 ** 3)    # Convertir bytes a GB
free_disk_gb = disk_usage.free / (1024 ** 3)    # Convertir bytes a GB
print(f'Capacidad total del disco duro: {total_disk_gb:.2f} GB')
print(f'Espacio utilizado del disco duro: {used_disk_gb:.2f} GB')
print(f'Espacio libre del disco duro: {free_disk_gb:.2f} GB')

# Tipo de disco duro
disk_info = os.popen("lsblk -o NAME,ROTA,TYPE,SIZE | grep '^sda'").read().strip()
print(f'Tipo de disco duro: {disk_info}')

# Información del nodo
node_info = os.uname()
print(f'Información del nodo: {node_info}')

# Información detallada del sistema
print(f'Información detallada del sistema:')
print(f'Sistema: {node_info.sysname}')
print(f'Nombre del nodo: {node_info.nodename}')
print(f'Release: {node_info.release}')
print(f'Versión: {node_info.version}')
print(f'Máquina: {node_info.machine}')

# Obtener información de la GPU
try:
    gpu_info = subprocess.check_output("nvidia-smi --query-gpu=name --format=csv,noheader", shell=True).decode('utf-8').strip()
    print(f'Modelo de GPU: {gpu_info}')
except subprocess.CalledProcessError:
    print('No se detectó GPU o NVIDIA-SMI no está instalado.')
