In [13]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [28]:
# Read the data
df = pd.read_csv(r'C:\Users\krish\OneDrive\Desktop\Music Recomendation\notebooks\data\music.csv')
df = df.iloc[:100001]

In [49]:
df['ID'] = df.reset_index().index + 1

In [29]:
columns_to_drop = ['duration_ms', 'energy', 'liveness', 'popularity',
                   'explicit', 'speechiness', 'acousticness', 'instrumentalness', 
                   'loudness', 'tempo', 'danceability']
df.drop(columns=columns_to_drop, inplace=True)

In [30]:
df['row_id'] = range(1, len(df) + 1)

In [31]:
ordinal_columns = ['year', 'key', 'mode']

In [32]:
ordinal_transformer = OrdinalEncoder()
scaler = StandardScaler()

numerical_transformer = Pipeline(steps=[
    ('scaler', scaler)
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', ordinal_transformer, ordinal_columns)
    ])

In [34]:
X = df.drop(['row_id', 'name'], axis=1)  # Remove row_id and name from the input features
y = df['name']
X_transformed = preprocessor.fit_transform(X)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

In [36]:
model = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
model.fit(X_train)

In [37]:
def find_song_index(song_name, data):
    try:
        index = data[data['name'] == song_name].index[0]
        return index
    except IndexError:
        print(f"Song '{song_name}' not found in the dataset.")
        return None

In [38]:
def recommend_songs(song_name, model, X_train, data):
    song_index = find_song_index(song_name, data)
    if song_index is None:
        return None
    
    # Get the feature vector of the song
    song_vector = X_train[song_index]
    
    # Reshape the vector to be compatible with the model's input shape
    song_vector = song_vector.reshape(1, -1)
    
    # Find the nearest neighbors
    distances, indices = model.kneighbors(song_vector)
    
    # Get the song names of the neighbors
    recommended_songs = data.iloc[indices[0]]['name'].values
    
    return recommended_songs

In [39]:
song_name = 'Danny Boy'
recommended_songs = recommend_songs(song_name, model, X_train, df)
if recommended_songs is not None:
    print(f"Recommended songs for '{song_name}': {recommended_songs}")

Recommended songs for 'Danny Boy': ['Goodbye Girl' 'The Girl From Ipanema' 'Ohio'
 'Melodía de Amor - Remasterizado' 'Danny Boy']


In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import os
import pickle

# Create 'artifacts' directory if it doesn't exist
if not os.path.exists('artifacts'):
    os.makedirs('artifacts')

# Read the data
df = pd.read_csv(r'C:\Users\krish\OneDrive\Desktop\Music Recomendation\notebooks\data\music.csv')
df = df.iloc[:100001]

# Add an ID column
df['ID'] = df.reset_index().index + 1

# Drop specified columns
columns_to_drop = ['duration_ms', 'energy', 'liveness', 'popularity',
                   'explicit', 'speechiness', 'acousticness', 'instrumentalness', 
                   'loudness', 'tempo', 'danceability', 'mode','valence','id']
df.drop(columns=columns_to_drop, inplace=True)

ordinal_columns = ['year', 'key']
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

for col in numerical_columns:
    df[col].fillna(df[col].median(), inplace=True)

ordinal_transformer = OrdinalEncoder()
scaler = StandardScaler()

numerical_transformer = Pipeline(steps=[
    ('scaler', scaler)
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', ordinal_transformer, ordinal_columns)
    ])

X = df.drop('name', axis=1)
y = df['name']
X = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
model.fit(X_train)

# Save raw data as CSV
df.to_csv('artifacts/raw_data.csv', index=False)

# Save train and test data as CSV
train_data = pd.DataFrame(X_train, columns=numerical_columns + ordinal_columns)
train_data['name'] = y_train.values
train_data.to_csv('artifacts/train_data.csv', index=False)

test_data = pd.DataFrame(X_test, columns=numerical_columns + ordinal_columns)
test_data['name'] = y_test.values
test_data.to_csv('artifacts/test_data.csv', index=False)

# Save model
with open('artifacts/model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save preprocessor
with open('artifacts/preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

def find_song_index(song_name, data):
    try:
        index = data[data['name'] == song_name].index[0]
        return index
    except IndexError:
        print(f"Song '{song_name}' not found in the dataset.")
        return None

def recommend_songs(song_name, model, X_train, data):
    song_index = find_song_index(song_name, data)
    if song_index is None:
        return None
    
    # Get the feature vector of the song
    song_vector = X_train[song_index]
    
    # Reshape the vector to be compatible with the model's input shape
    song_vector = song_vector.reshape(1, -1)
    
    # Find the nearest neighbors
    distances, indices = model.kneighbors(song_vector)
    
    # Get the song names of the neighbors
    recommended_songs = data.iloc[indices[0]]['name'].values
    
    return recommended_songs

if __name__ == "__main__":
    song_name = 'Danny Boy'
    recommended_songs = recommend_songs(song_name, model, X_train, df)
    if recommended_songs is not None:
        print(f"Recommended songs for '{song_name}': {recommended_songs}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

Recommended songs for 'Danny Boy': ['Danny Boy' 'Lag Chali Bade Saba'
 'Marry The Man Today / Guys And Dolls (Reprise) (from "Guys & Dolls")'
 'Ohio' "'O surdato 'nnammurato - Musical base Version"]
