In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Read the input CSV into the working frame used by the cells below
train_csv_path = 'NATOPS_sid_TRAIN.csv'
df = pd.read_csv(train_csv_path)




In [None]:
# Standardizing
# Features are every column except the identifier/label columns. 'cluster' is
# excluded too when it exists: this cell writes that column into df below, so
# on a re-run the previous labels would otherwise be scaled and clustered as
# if they were a feature, making the cell non-idempotent.
feature_frame = (
    df.drop(['isTest', 'sid', 'class'], axis=1)
      .drop(columns='cluster', errors='ignore')
)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)

# Task 1: Finding Clusters
# Fixed random_state keeps the centroid initialisation repeatable.
kmeans = KMeans(n_clusters=5, random_state=42)
df['cluster'] = kmeans.fit_predict(scaled_features)





In [None]:
# Task 2: Generating Atomic Units Data
# For each sid, the fraction of its rows assigned to every cluster
clusters_per_sid = df.groupby('sid')['cluster']
cluster_share = clusters_per_sid.value_counts(normalize=True)
# One row per sid, one column per cluster id; absent combinations become 0
ratios = cluster_share.unstack(fill_value=0)



In [None]:
# Merging with the original data
# One row per distinct (isTest, sid, class) combination, then attach the
# per-sid cluster ratios by matching on 'sid'
sid_metadata = df.loc[:, ['isTest', 'sid', 'class']].drop_duplicates()
final_df = pd.merge(sid_metadata, ratios, on='sid')





In [None]:
# Task 3: Final Output
# Write the merged table to disk without the row index
final_df.to_csv('final_output.csv', index=False)

# Show the first five rows as a quick check
preview = final_df.head(5)
print(preview)

   isTest  sid  class         0         1         2         3         4
0       0    1      3  0.411765  0.215686  0.137255  0.019608  0.215686
1       0    2      2  0.647059  0.000000  0.000000  0.352941  0.000000
2       0    3      2  0.666667  0.000000  0.000000  0.333333  0.000000
3       0    4      3  0.372549  0.274510  0.137255  0.000000  0.215686
4       0    5      2  0.607843  0.000000  0.000000  0.392157  0.000000


**With 10-fold cross-validation**

In [None]:
import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler


In [None]:
# Loading the dataset
df = pd.read_csv('NATOPS_sid_TRAIN.csv')

# Feature table: every column except the identifier/label columns
feature_frame = df.drop(['isTest', 'sid', 'class'], axis=1)
raw_features = feature_frame.to_numpy()

n_clusters = 5

# Reference clustering on all rows. It is used ONLY to give cluster ids a
# shared meaning across folds: KMeans numbers its clusters arbitrarily on
# every fit, so "cluster 0" of one fold need not be "cluster 0" of another,
# and ratios computed from the raw per-fold ids would mix unrelated clusters.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)
reference_labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(scaled_features)

#KFold
# NOTE(review): KFold splits individual rows, so rows sharing a sid can land
# in different folds. If each sid should stay within one fold, GroupKFold
# would be needed instead — confirm the intent.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold frames of (sid, aligned cluster id) for the held-out rows
cluster_assignments = []

for train_index, test_index in kf.split(raw_features):
    # Fit the scaler on the training rows only, so the held-out rows never
    # influence the preprocessing statistics
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(raw_features[train_index])
    X_test = fold_scaler.transform(raw_features[test_index])
    test_sids = df.iloc[test_index]['sid']

    # Applying KMeans
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X_train)

    # Align this fold's cluster ids with the reference ids: count how often
    # each (fold id, reference id) pair occurs on the training rows, then
    # take the one-to-one matching with the largest total overlap
    overlap = np.zeros((n_clusters, n_clusters), dtype=int)
    np.add.at(overlap, (kmeans.labels_, reference_labels[train_index]), 1)
    fold_ids, reference_ids = linear_sum_assignment(overlap, maximize=True)
    relabel = np.empty(n_clusters, dtype=int)
    relabel[fold_ids] = reference_ids

    # Assigning the clusters to test data, expressed in reference ids
    test_clusters = relabel[kmeans.predict(X_test)]

    # Store cluster assignments with corresponding sids
    cluster_assignments.append(pd.DataFrame({'sid': test_sids, 'cluster': test_clusters}))

# Concatenate all test fold cluster assignments (each row is held out exactly once)
all_test_assignments = pd.concat(cluster_assignments)

# Calculate the ratio of each cluster for each 'sid'
ratios = all_test_assignments.groupby('sid')['cluster'].value_counts(normalize=True).unstack(fill_value=0)

# Merging with the original data
final_df = df[['isTest', 'sid', 'class']].drop_duplicates().merge(ratios, on='sid')





In [None]:
# Saving to CSV
# Write the cross-validated table to disk without the row index
final_df.to_csv('final_output_with_cross_validation.csv', index=False)

# Show the first five rows as a quick check
cv_preview = final_df.head(5)
print(cv_preview)



   isTest  sid  class         0         1         2         3         4
0       0    1      3  0.274510  0.235294  0.137255  0.117647  0.235294
1       0    2      2  0.450980  0.352941  0.078431  0.078431  0.039216
2       0    3      2  0.372549  0.372549  0.019608  0.117647  0.117647
3       0    4      3  0.156863  0.176471  0.098039  0.235294  0.333333
4       0    5      2  0.392157  0.313725  0.019608  0.196078  0.078431
