In [1]:
import pandas as pd

# --- Load configuration ---
# Path of the tab-separated triplets file; a relative path, so it is resolved
# against the notebook's current working directory.
TRIPLETS_PATH = 'train_triplets.txt'

# Upper bound on the number of rows read, so only a subset of the file is
# loaded. Set to None to read the whole file.
MAX_ROWS = 1_000_000

# header=None below makes pandas treat the first line as data, so the column
# names are supplied explicitly.
column_names = ['user_id', 'song_id', 'play_count']

# Read at most MAX_ROWS rows from the start of the tab-separated file.
df = pd.read_csv(TRIPLETS_PATH,
                 sep='\t',
                 header=None,
                 names=column_names,
                 nrows=MAX_ROWS)

# Horizontal rule printed between the sections of the report below.
_rule = "\n------------------------------------\n"

print("Successfully loaded the data!")
print(_rule)

# Preview: the first five rows of the loaded frame.
print("First 5 rows of the data:")
print(df.head(5))
print(_rule)

# Structural summary (dtypes, non-null counts); DataFrame.info() writes its
# report to stdout itself, so it is not wrapped in print().
print("Data summary:")
df.info()
print(_rule)

# Number of distinct users and songs present in the loaded subset.
unique_users, unique_songs = (
    df[column].nunique() for column in ('user_id', 'song_id')
)

print(f"Number of unique users: {unique_users}")
print(f"Number of unique songs: {unique_songs}")
print(_rule)

# --- Preparing (Cleaning) the Data ---

# Filtering thresholds, named so they can be tuned in one place.
# Both are measured in ROWS of the triplets table, not in summed play counts.
MIN_ROWS_PER_SONG = 50   # keep songs that appear in at least this many rows
MIN_ROWS_PER_USER = 20   # keep users that still have at least this many rows

# 1. Count the rows recorded for each song.
#    .count() tallies the non-null `play_count` entries per song; it does NOT
#    add up the play_count values themselves.
#    NOTE(review): if every (user, song) pair occurs at most once in the file,
#    this is the number of distinct listeners per song - confirm.
song_play_counts = df.groupby('song_id')['play_count'].count()

# 2. Keep only the songs that reach the per-song threshold.
popular_songs = song_play_counts[song_play_counts >= MIN_ROWS_PER_SONG].index
df_popular = df[df['song_id'].isin(popular_songs)]

# 3. Count, per user, the rows that remain after the song filter.
user_play_counts = df_popular.groupby('user_id')['play_count'].count()

# 4. Keep only the users that reach the per-user threshold.
active_users = user_play_counts[user_play_counts >= MIN_ROWS_PER_USER].index
df_final = df_popular[df_popular['user_id'].isin(active_users)]

# The two filters are applied once, in sequence: removing users in step 4 can
# leave some songs with fewer than MIN_ROWS_PER_SONG rows in df_final.

print("Original data shape:", df.shape)
print("Shape after filtering for popular songs and active users:", df_final.shape)

Successfully loaded the data!

------------------------------------

First 5 rows of the data:
                                    user_id             song_id  play_count
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995           1
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9           1
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B           2
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFNSP12AF72A0E22           1
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFOVM12A58A7D494           1

------------------------------------

Data summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   user_id     1000000 non-null  object
 1   song_id     1000000 non-null  object
 2   play_count  1000000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 22.9+ MB

-----------------------

In [2]:
import sys
!{sys.executable} -m pip install --upgrade pip setuptools wheel

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Collecting setuptools
  Using cached setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ----------- ---------------------------- 0.5/1.8 MB 2.8 MB/s eta 0:00:01
   ----------- ---------------------------- 0.5/1.8 MB 2.8 MB/s eta 0:00:01
   ----------------- ---------------------- 0.8/1.8 MB 1.5 MB/s eta 0:00:01
   ----------------------------- ---------- 1.3/1.8 MB 1.4 MB/s eta 0:00:01
   ----------------------------------- ---- 1.6/1.8 MB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 1.3 MB/s eta 0:00:00
Using cached setuptools-80.9.0-py3-none-any.whl (1.2 MB)
Installing collected packages: setuptools, pip

  Attempting uninstall: setuptools

    Found existing installation: setuptools 72.1.0

   

In [3]:
import sys
!{sys.executable} -m pip install scikit-surprise

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'


  error: subprocess-exited-with-error
  
  Getting requirements to build wheel did not run successfully.
  exit code: 1
  
  [45 lines of output]
  
  Error compiling Cython file:
  ------------------------------------------------------------
  ...
          self.avg_cltr_i = avg_cltr_i
          self.avg_cocltr = avg_cocltr
  
          return self
  
      def compute_averages(self, np.ndarray[np.int_t] cltr_u,
                                               ^
  ------------------------------------------------------------
  
  surprise\prediction_algorithms\co_clustering.pyx:157:45: Invalid type.
  Compiling surprise/similarities.pyx because it changed.
  Compiling surprise/prediction_algorithms/matrix_factorization.pyx because it changed.
  Compiling surprise/prediction_algorithms/optimize_baselines.pyx because it changed.
  Compiling surprise/prediction_algorithms/slope_one.pyx because it changed.
  Compiling surprise/prediction_algorithms/co_clustering.pyx because it changed.
  [1/