**<h2> Project </h2>**

In [1]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there (such as the score CSV read later in this notebook) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

In [3]:
# Location of the score table on the mounted Drive.
file_path = '/content/drive/My Drive/data_ai/game_data/data_score.csv'
# Judging by the preview further down, each row holds an ID, a `force` value
# and one score column per song (song1 .. song100).
df = pd.read_csv(file_path)

**<h2> Preprocessing </h2>**

In [4]:
# Order the rows from highest to lowest `force` and renumber them 0..n-1,
# so that neighbouring row labels belong to rows with neighbouring `force`.
df = df.sort_values(by='force', ascending=False).reset_index(drop=True)

In [5]:
# Preview the first five rows, i.e. the rows with the highest `force`.
df.head()

Unnamed: 0,ID,force,song1,song2,song3,song4,song5,song6,song7,song8,...,song91,song92,song93,song94,song95,song96,song97,song98,song99,song100
0,SV-2505-3189,21.297,9966614,10000000,9965229,9928247,9970389,9975388,9938271,9934014,...,9939531,9985875,9934733,9966666,9944386,9912729,9952326,9938206,9946492,9979088
1,SV-6555-7968,21.247,9935192,10000000,9979137,9962235,9974619,9935630,9963348,9978714,...,9933026,9738468,9884653,9784580,9781073,9967366,9960000,9965241,9965392,9918001
2,SV-6546-7094,21.234,9978397,9997039,9977399,9977341,9955583,9924271,9934413,9985100,...,9933026,9933428,9917251,9841269,9976459,9930476,9993333,9991310,9902196,9965675
3,SV-2165-8973,21.231,9980361,9979277,9954798,9960347,9945008,9969708,9965277,9980842,...,9953955,9888254,9979939,9969765,9981167,9938989,9953333,9980882,9966897,9973302
4,SV-3164-3276,21.229,9929300,9997039,9970445,9975453,9976734,9918591,9978780,9980842,...,9943491,9902520,9989969,9920634,9976459,9951759,9953333,9960027,9968402,9925629


In [6]:
# Window size for the zero-imputation step: rows whose index distance is
# strictly below this value are treated as "close" to one another.
close_boundary = 10
# Names of all score columns, i.e. every column whose name begins with 'song'.
song_columns = list(filter(lambda name: name.startswith('song'), df.columns))

In [9]:
# Replace every zero score with the integer-truncated mean of the non-zero
# scores found in the same column among nearby rows (index distance strictly
# less than `close_boundary`). Rows are expected to be labelled 0..n-1.
# Cells with no non-zero neighbour in the window are left at 0.
n_rows = len(df)
for col in song_columns:
    # Work on a private NumPy copy: much faster than scalar `df.at` access
    # and independent of pandas' view/copy semantics.
    scores = df[col].to_numpy(copy=True)

    # Only zero cells need work. They are visited in ascending row order on
    # purpose: a value imputed for an earlier row is visible as a neighbour
    # when a later row is processed, exactly as in a top-to-bottom scan.
    for idx in np.flatnonzero(scores == 0):
        window_start = max(idx - close_boundary + 1, 0)
        window_stop = min(idx + close_boundary, n_rows)
        window = scores[window_start:window_stop]

        # The cell itself is zero, so this filter also removes it.
        neighbors = window[window != 0]
        if neighbors.size:
            scores[idx] = int(np.mean(neighbors))

    df[col] = scores

# Replace each zero score with the mean of the non-zero scores of neighbouring rows (rows with similar force)

In [10]:
# Standardise every row across its song columns (per-row z-score), then cap
# values below -3 at -3. Only the lower tail is clipped.
song_scores = df[song_columns]
row_means = song_scores.mean(axis=1)
row_stds = song_scores.std(axis=1)

# A row whose scores are all identical has a standard deviation of 0 (NaN when
# there is a single song column). Dividing by it would produce NaN/inf, which
# the clustering step cannot handle. Use 1 instead so such rows become zeros.
safe_stds = row_stds.where(row_stds > 0, 1.0)

df[song_columns] = (
    song_scores
    .sub(row_means, axis=0)
    .div(safe_stds, axis=0)
    .clip(lower=-3)
)

# Standardization: per-row z-score across the song columns, clipped below at -3

In [11]:
# Preview the first five rows; the song columns now hold standardized scores.
df.head()

Unnamed: 0,ID,force,song1,song2,song3,song4,song5,song6,song7,song8,...,song91,song92,song93,song94,song95,song96,song97,song98,song99,song100
0,SV-2505-3189,21.297,0.645292,1.79376,0.597648,-0.674522,0.77515,0.947115,-0.3297,-0.476139,...,-0.286356,1.307864,-0.451406,0.64708,-0.119345,-1.208337,0.153788,-0.331936,-0.0469,1.074394
1,SV-6555-7968,21.247,-0.254818,0.998972,0.595352,0.268362,0.507945,-0.246344,0.289894,0.587168,...,-0.296722,-3.0,-1.232556,-3.0,-3.0,0.367627,0.225123,0.326517,0.329438,-0.587398
2,SV-6546-7094,21.234,0.32498,0.943985,0.291842,0.289916,-0.432556,-1.472267,-1.135503,0.547552,...,-1.181558,-1.16821,-1.705365,-3.0,0.260629,-1.266231,0.820928,0.753755,-2.205264,-0.097452
3,SV-2165-8973,21.231,0.53905,0.501562,-0.345014,-0.153109,-0.683589,0.17063,0.017389,0.555685,...,-0.374168,-2.646355,0.524456,0.172601,0.566925,-0.891749,-0.395679,0.557068,0.073415,0.294924
4,SV-3164-3276,21.229,-1.10973,1.184064,0.283532,0.453114,0.496491,-1.472361,0.565773,0.635597,...,-0.629191,-2.016561,0.944658,-1.40318,0.487179,-0.349218,-0.295919,-0.069245,0.214351,-1.234038


**<h2> Clustering of users </h2>**

In [12]:
# Feature matrix: one row per user, one column per standardized song score.
X = df[song_columns]

# K-means starts from random centroids. Without a fixed `random_state` the
# cluster labels (and every table derived from them) change on each run.
# `n_init` is set explicitly because its default differs between
# scikit-learn versions.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)

# `assign` attaches the labels positionally and overwrites any existing
# 'cluster' column, so re-running this cell does not create duplicates.
df = df.assign(cluster=clusters)

In [13]:
# Preview the first five rows; the last column is the assigned cluster label.
df.head()

Unnamed: 0,ID,force,song1,song2,song3,song4,song5,song6,song7,song8,...,song92,song93,song94,song95,song96,song97,song98,song99,song100,cluster
0,SV-2505-3189,21.297,0.645292,1.79376,0.597648,-0.674522,0.77515,0.947115,-0.3297,-0.476139,...,1.307864,-0.451406,0.64708,-0.119345,-1.208337,0.153788,-0.331936,-0.0469,1.074394,3
1,SV-6555-7968,21.247,-0.254818,0.998972,0.595352,0.268362,0.507945,-0.246344,0.289894,0.587168,...,-3.0,-1.232556,-3.0,-3.0,0.367627,0.225123,0.326517,0.329438,-0.587398,3
2,SV-6546-7094,21.234,0.32498,0.943985,0.291842,0.289916,-0.432556,-1.472267,-1.135503,0.547552,...,-1.16821,-1.705365,-3.0,0.260629,-1.266231,0.820928,0.753755,-2.205264,-0.097452,1
3,SV-2165-8973,21.231,0.53905,0.501562,-0.345014,-0.153109,-0.683589,0.17063,0.017389,0.555685,...,-2.646355,0.524456,0.172601,0.566925,-0.891749,-0.395679,0.557068,0.073415,0.294924,3
4,SV-3164-3276,21.229,-1.10973,1.184064,0.283532,0.453114,0.496491,-1.472361,0.565773,0.635597,...,-2.016561,0.944658,-1.40318,0.487179,-0.349218,-0.295919,-0.069245,0.214351,-1.234038,4


**<h2> Song difficulty table </h2>**

In [14]:
# Average standardized score of each song within each cluster:
# one row per cluster label, one column per song.
cluster_scores = df[song_columns].groupby(df['cluster']).mean()
# head() shows at most the first five cluster rows.
cluster_scores.head()

Unnamed: 0_level_0,song1,song2,song3,song4,song5,song6,song7,song8,song9,song10,...,song91,song92,song93,song94,song95,song96,song97,song98,song99,song100
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.406081,-0.005305,0.70002,-0.096297,0.13351,0.301145,0.387465,0.517837,0.631783,0.024994,...,-0.36115,-0.457373,-0.322509,-0.386707,-0.292199,-0.233528,-0.350136,-0.332911,-0.282737,-0.426954
1,0.480166,0.570124,0.659561,0.26931,0.255125,-0.026327,0.301643,0.656802,0.646542,0.057596,...,-1.044421,-1.047524,-0.751798,-1.586855,-0.367402,-0.204482,-0.359666,-0.170435,-0.42966,-0.578286
2,0.235773,-1.343425,0.947116,-0.300796,-0.897976,0.134015,0.387627,0.717395,0.633,-0.352967,...,-0.905871,-0.605774,-0.522582,-0.206368,-0.0181,-0.002443,-0.101579,-0.035834,-0.164028,-0.255345
3,0.091659,0.233145,0.609836,-0.010087,-0.301199,0.077942,0.18318,0.621165,0.71226,-0.224084,...,-0.661858,-0.680323,-0.280393,-0.352541,-0.16243,-0.058957,-0.080075,-0.033901,-0.09452,-0.445845
4,-0.086073,-0.635394,0.612341,-0.209334,-0.806089,-0.142576,0.038707,0.600736,0.509618,-0.431146,...,-0.675865,-1.781344,0.093394,-2.331165,-0.315694,0.347021,0.17435,0.375936,0.280724,-0.419124


In [15]:
# Persist the per-cluster mean score table to the current working directory.
cluster_scores.to_csv('cluster_scores.csv')

**<h2> Song type table </h2>**

In [16]:
# Rank the songs inside each cluster row by mean standardized score.
# Ranks ascend with the score: rank 1 is the cluster's lowest-scoring song,
# the largest rank its highest-scoring one.
song_ranks = cluster_scores.rank(axis='columns')
song_ranks

Unnamed: 0_level_0,song1,song2,song3,song4,song5,song6,song7,song8,song9,song10,...,song91,song92,song93,song94,song95,song96,song97,song98,song99,song100
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,86.0,51.0,94.0,46.0,67.0,77.0,84.0,89.0,91.0,56.0,...,16.0,8.0,22.0,14.0,25.0,31.0,17.0,19.0,27.0,11.0
1,70.0,75.0,82.0,62.0,60.0,46.0,63.0,81.0,80.0,52.0,...,6.0,5.0,15.0,1.0,31.0,40.0,32.0,43.0,27.0,21.0
2,64.0,3.0,89.0,31.0,14.0,60.0,71.0,80.0,75.0,29.0,...,13.0,21.0,25.0,37.0,50.0,52.0,43.0,48.0,38.0,35.0
3,56.0,64.0,79.0,53.0,30.0,54.0,61.0,80.0,88.0,35.0,...,13.0,12.0,32.0,28.0,38.0,46.0,45.0,50.0,41.0,22.0
4,37.0,17.0,76.0,31.0,13.0,33.0,46.0,75.0,72.0,22.0,...,16.0,3.0,47.0,2.0,28.0,66.0,55.0,68.0,63.0,23.0
5,66.0,25.0,92.0,37.0,32.0,64.0,75.0,87.0,88.0,47.0,...,19.0,23.0,38.0,39.0,49.0,44.0,51.0,33.0,24.0,34.0


In [17]:
# For every song, pick the cluster label in which that song attains its
# highest within-cluster rank (the first such cluster if several tie).
max_rank_clusters = song_ranks.idxmax(axis='index')
max_rank_clusters

Unnamed: 0,0
song1,0
song2,1
song3,0
song4,1
song5,0
...,...
song96,4
song97,4
song98,4
song99,4


In [18]:
# Persist the song -> cluster mapping to the current working directory.
max_rank_clusters.to_csv('song_clusters.csv')