In [5]:
import pandas as pd

def sample_instances_per_class(dataframe, class_column, num_instances=50):
    """
    Samples a specified number of instances for each class in a dataset.

    :param dataframe: pandas DataFrame, the input dataset.
    :param class_column: str, the column name containing class labels.
    :param num_instances: int, the number of instances to sample per class.
    :return: pandas DataFrame containing the sampled instances.
    """
    sampled_data = dataframe.groupby(class_column).apply(
        lambda group: group.sample(n=num_instances, random_state=42, replace=len(group) < num_instances)
    ).reset_index(drop=True)
    
    return sampled_data

# Example usage:
# Load your dataset
data = pd.read_csv("balanced_random_smoteenn.csv")

# Column name containing class labels
class_column = "app"  # Replace with the name of your class column

# Sample 50 instances per class
sampled_data = sample_instances_per_class(data, class_column, num_instances=65)

# Save the sampled data to a new CSV file if needed
sampled_data.to_csv("test_dataset.csv", index=False)

print(f"Sampled dataset created with {len(sampled_data)} rows.")


Sampled dataset created with 910 rows.


  sampled_data = dataframe.groupby(class_column).apply(


In [6]:
import pandas as pd

df=pd.read_csv('Processed_wild.csv')

valid_classes = [0,1,2,3,4,5,6,7,8,9,10,11,12,13]

df_filtered = df[df['app'].isin(valid_classes)]

df_sampled = df_filtered.sample(n=100, random_state=42) 

df_700 = pd.read_csv("test_dataset.csv") 

df_combined = pd.concat([df_sampled, df_700], ignore_index=True)

df_shuffled = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

df_shuffled.to_csv("test_dataset.csv", index=False)
