<a href="https://colab.research.google.com/github/Jeremy-Min-Yang/MusicSuggestion/blob/main/music_suggestion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**GRABBING THE DATASET**

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so later cells can read the CSV files
# stored under "My Drive". `google.colab` is only importable inside a Colab
# runtime. When the drive is already mounted the call just reports that (see
# the captured output below) instead of remounting.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Location of the raw tracks CSV on the mounted Google Drive.
# NOTE: `file_path` is a shared global; a later cell rebinds it to the
# sentence-classification CSV, so do not rely on it after this cell.
file_path = '/content/drive/My Drive/Colab Notebooks/data/dataset.csv'
# Raw, unmodified tracks table; the preprocessing cell derives
# `preprocessed_dataset` from it.
music_data = pd.read_csv(file_path)


**DATA PREPROCESSING**

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Columns kept for the recommender: identifiers/labels plus the audio features
# that the filtering step compares against.
selected_columns = [
    'track_id', 'track_name', 'artists', 'track_genre',
    'danceability', 'energy', 'valence', 'tempo',
    'acousticness', 'popularity'
]

# Drop rows with missing data and take an explicit copy, so the column
# assignment below writes to an independent frame (no chained-assignment
# warning, and `music_data` is never modified).
preprocessed_dataset = music_data[selected_columns].dropna().copy()

# All numeric feature columns present in the selection.
numerical_features = ['danceability', 'energy', 'valence', 'tempo', 'acousticness', 'popularity']

# `tempo` is deliberately left in its original units (BPM) and NOT min-max
# scaled: `map_inputs_to_features` produces tempo targets of 100/120/140 and
# `filter_songs` keeps rows within +/-20 of that target. With tempo squeezed
# into [0, 1] that window could never match, so every request fell through to
# the "popular tracks" fallback.
features_to_scale = [name for name in numerical_features if name != 'tempo']

# Rescale the remaining numeric features to [0, 1].
scaler = MinMaxScaler()
preprocessed_dataset[features_to_scale] = scaler.fit_transform(preprocessed_dataset[features_to_scale])

# Remove rows that are exact duplicates across all selected columns.
preprocessed_dataset = preprocessed_dataset.drop_duplicates()

# Save preprocessed dataset to a new CSV file
output_path = 'preprocessed_spotify_dataset.csv'  # Change to your desired output path
preprocessed_dataset.to_csv(output_path, index=False)

print(f"Preprocessed dataset saved to {output_path}")


Preprocessed dataset saved to preprocessed_spotify_dataset.csv


**TRAINING USER INPUT PROCESSING MODEL**

In [None]:
# Labelled phrases used to train the sentence classifier. The training cell
# reads the 'Phrase' (text) and 'Category' (label) columns from this table.
# NOTE: this rebinds the global `file_path` that previously pointed at the
# tracks dataset.
file_path = '/content/drive/My Drive/Colab Notebooks/data/Categorized_Sentences.csv'
input_data = pd.read_csv(file_path)

Use different classification models/vectorization techniques (TF-IDF/CountVectorizer)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Drop rows with missing values and lowercase the phrases in one chained
# expression. `.assign` returns a new DataFrame, so nothing is written into a
# possible view of the original frame (the previous in-place column assignment
# raised SettingWithCopyWarning) and re-running the cell is harmless.
input_data = input_data.dropna().assign(
    Phrase=lambda frame: frame['Phrase'].str.lower()
)

# Split into training and testing sets (80/20, fixed seed for reproducibility)
X = input_data['Phrase']
y = input_data['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text: the vocabulary is learned from the training split only
# and then re-used, unchanged, on the test split.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train the model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_vec, y_train)

# Evaluate the model on the held-out split
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred))

# Save the model and vectorizer; the inference cell reloads exactly these two
# files, so they must always be saved as a matching pair.
joblib.dump(clf, 'phrase_classifier.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_data['Phrase'] = input_data['Phrase'].str.lower()


              precision    recall  f1-score   support

    activity       0.69      0.60      0.64        15
    location       0.71      0.79      0.75        19
        mood       0.90      0.90      0.90        20
     weather       0.92      0.92      0.92        26

    accuracy                           0.82        80
   macro avg       0.81      0.80      0.80        80
weighted avg       0.82      0.82      0.82        80



['vectorizer.pkl']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Logistic-regression baseline fitted on the same count-vector features as the
# random forest; `fit` returns the estimator, so construction and training are
# chained into a single expression.
log_reg = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_vec, y_train)

# Per-class precision/recall/F1 on the held-out split
y_pred = log_reg.predict(X_test_vec)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    activity       0.80      0.80      0.80        15
    location       0.83      0.79      0.81        19
        mood       0.85      0.85      0.85        20
     weather       0.89      0.92      0.91        26

    accuracy                           0.85        80
   macro avg       0.84      0.84      0.84        80
weighted avg       0.85      0.85      0.85        80



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier

# TF-IDF features (vocabulary capped at 5000 terms), learned on the training
# phrases and applied as-is to the test phrases.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Two-hidden-layer perceptron (100 and 50 units); `fit` returns the estimator,
# so it is built and trained in one expression.
mlp_model = MLPClassifier(
    random_state=42,
    hidden_layer_sizes=(100, 50),
    max_iter=500,
).fit(X_train_tfidf, y_train)

# Per-class precision/recall/F1 on the held-out split
y_pred = mlp_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    activity       0.65      0.73      0.69        15
    location       0.78      0.74      0.76        19
        mood       0.95      0.90      0.92        20
     weather       0.96      0.96      0.96        26

    accuracy                           0.85        80
   macro avg       0.83      0.83      0.83        80
weighted avg       0.86      0.85      0.85        80



**PROCESSING USER INPUT INTO ENVIRONMENT/EMOTION**

In [None]:
import joblib

# Load the classifier and the CountVectorizer that the training cell saved with
# joblib.dump. These files hold the RandomForest model and its matching
# vectorizer, not the LogisticRegression or MLP variants evaluated afterwards.
# `vectorizer` rebinds the global of the same name from the training cell.
# NOTE: joblib.load unpickles the file, so only load files produced by this
# notebook or another trusted source.
phrase_classifier = joblib.load('phrase_classifier.pkl')
vectorizer = joblib.load('vectorizer.pkl')

def classify_sentences(sentences):
    """Predict a category label for each sentence.

    Uses the module-level `vectorizer` to turn the sentences into features and
    the module-level `phrase_classifier` to predict one label per sentence.

    Args:
        sentences: Iterable of sentence strings. A single bare string is
            treated as one sentence rather than being rejected.

    Returns:
        Array of predicted labels, one per input sentence, in input order.
        An empty input yields an empty array without calling the model.
    """
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        # Materialise generators so emptiness can be checked reliably.
        sentences = list(sentences)

    if not sentences:
        # The classifier rejects a zero-row feature matrix, so answer directly.
        import numpy as np
        return np.array([], dtype=object)

    # Vectorize sentences for classification
    sentences_vec = vectorizer.transform(sentences)
    return phrase_classifier.predict(sentences_vec)

def process_user_input_with_model(user_input):
    """Split free text into sentences and group them by predicted category.

    Each sentence is classified with `classify_sentences`. Sentences predicted
    as "mood" are collected into the mood text; sentences predicted as
    "weather", "activity" or "location" are collected under that key. Any
    other prediction is ignored.

    Sentence segmentation uses the module-level spaCy pipeline `nlp` when one
    is defined. The only visible definition of `nlp` in this notebook is
    commented out, so on a fresh kernel the name does not exist; in that case
    a punctuation-based split is used (with a warning) instead of raising
    NameError.

    Args:
        user_input: Free-text description typed by the user. `None`, empty or
            whitespace-only input is accepted.

    Returns:
        Tuple `(environment, mood_text)`, where `environment` is a dict with
        the keys "weather", "activity" and "location", each mapped to the
        space-joined sentences of that category ("" when there are none), and
        `mood_text` is the space-joined "mood" sentences.
    """
    import re
    import warnings

    environment = {"weather": [], "activity": [], "location": []}
    mood = []

    text = (user_input or "").strip()
    if text:
        nlp_pipeline = globals().get("nlp")
        if nlp_pipeline is not None:
            # Use SpaCy for sentence segmentation
            sentences = [sentence.text for sentence in nlp_pipeline(text).sents]
        else:
            warnings.warn(
                "spaCy pipeline `nlp` is not defined; "
                "falling back to punctuation-based sentence splitting."
            )
            sentences = re.split(r"(?<=[.!?])\s+", text)

        # Blank fragments carry no information and only add stray spaces.
        sentences = [sentence for sentence in sentences if sentence.strip()]

        # Classify each sentence (skipped when nothing is left to classify)
        predictions = classify_sentences(sentences) if sentences else []

        # Group sentences by predicted category
        for sentence, prediction in zip(sentences, predictions):
            if prediction == "mood":
                mood.append(sentence)
            elif prediction in environment:
                environment[prediction].append(sentence)

    # Combine sentences into strings
    mood_text = " ".join(mood)
    for key in environment:
        environment[key] = " ".join(environment[key])

    return environment, mood_text


**EXTRACT ENTITIES AND KEYWORDS**

**WEATHER KEYWORDS/ACTIVITY KEYWORDS ARE HARDCODED LISTS AND NARROW DOWN THE SCOPE**

ALTERNATIVE: TRAIN OUR OWN MODEL

In [None]:
# import spacy
# from nltk.sentiment.vader import SentimentIntensityAnalyzer


# # Load SpaCy and sentiment analysis tools
# nlp = spacy.load("en_core_web_sm")
# sid = SentimentIntensityAnalyzer()

# # List of weather keywords and activities for keyword matching
# weather_keywords = ["snowing", "raining", "sunny", "cloudy", "stormy", "windy"]
# activity_keywords = ["walking", "running", "sitting", "driving", "working", "relaxing"]

# def extract_entities_and_keywords(user_input):
#     # Process input with SpaCy
#     doc = nlp(user_input)

#     # Extract entities
#     location = [ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"]]  # Location
#     time = [ent.text for ent in doc.ents if ent.label_ == "TIME"]             # Time
#     date = [ent.text for ent in doc.ents if ent.label_ == "DATE"]             # Date

#     # Extract keywords
#     weather = [token.text for token in doc if token.text.lower() in weather_keywords]
#     activity = [token.text for token in doc if token.text.lower() in activity_keywords]

#     return location, time, date, weather, activity

# def analyze_sentiment(user_input):
#     # Use NLTK's VADER sentiment analyzer
#     sentiment_scores = sid.polarity_scores(user_input)
#     if sentiment_scores["compound"] > 0.2:
#         mood = "positive"
#     elif sentiment_scores["compound"] < -0.2:
#         mood = "negative"
#     else:
#         mood = "neutral"
#     return mood

# def process_user_input(user_input):
#     # Extract environment entities and keywords
#     location, time, date, weather, activity = extract_entities_and_keywords(user_input)

#     # Analyze sentiment for mood
#     mood = analyze_sentiment(user_input)

#     # Compile environment description
#     environment = {
#         "location": location,
#         "time": time,
#         "date": date,
#         "weather": weather,
#         "activity": activity
#     }

#     return environment, mood

# # Process the input
# environment, mood = process_user_input(user_input)

# # Print results
# print("Environment:", environment)
# print("Mood:", mood)


In [None]:
# from spacy import displacy

# def visualize_entities(user_input):
#     doc = nlp(user_input)
#     displacy.render(doc, style="ent", jupyter=True)  # Visualize entities

# def visualize_dependencies(user_input):
#     doc = nlp(user_input)
#     displacy.render(doc, style="dep", jupyter=True)  # Visualize dependencies


# # Call visualization functions
# visualize_entities(user_input)  # View named entities
# visualize_dependencies(user_input)  # View dependency tree



NameError: name 'user_input' is not defined

In [None]:
def map_inputs_to_features(environment, mood):
    """Translate the parsed environment and mood into target song features.

    Args:
        environment: Mapping with optional "weather" and "activity" entries.
            Each entry may be a string (the keyword is searched as a
            substring) or a list of strings (the keyword must equal one
            element). Matching is case-insensitive; missing or empty entries
            simply match nothing.
        mood: Mood label: "positive", "negative" or "neutral"
            (case-insensitive, surrounding whitespace ignored). Any other
            value leaves the neutral defaults in place.

    Returns:
        Dict with the keys "valence", "energy", "tempo", "acousticness" and
        "genre".

    NOTE(review): `process_user_input_with_model` returns the mood as the
    joined "mood" sentences (free text), not one of the labels above, so when
    fed from it the mood branches never fire and valence/energy stay at 0.5.
    A sentiment step between the two is needed for mood to have any effect.
    """
    environment = environment or {}

    def _mentions(field, keyword):
        # Case-insensitive keyword test that works for both str and list values.
        value = environment.get(field) or ""
        if isinstance(value, str):
            return keyword in value.lower()
        return keyword in {str(item).lower() for item in value}

    # Initialize default feature values
    features = {
        "valence": 0.5,  # Neutral positivity
        "energy": 0.5,   # Neutral energy
        "tempo": 120,    # Moderate tempo
        "acousticness": 0.5,  # Balanced acousticness
        "genre": "pop",  # Default genre
    }

    # Map mood to valence and energy ("neutral" keeps the defaults above)
    mood_label = mood.strip().lower() if isinstance(mood, str) else ""
    if mood_label == "positive":
        features["valence"] = 0.8
        features["energy"] = 0.6
    elif mood_label == "negative":
        features["valence"] = 0.3
        features["energy"] = 0.4

    # Map environment details; snow takes precedence over rain
    if _mentions("weather", "snowing"):
        features["acousticness"] = 0.8
        features["genre"] = "acoustic"
    elif _mentions("weather", "raining"):
        features["acousticness"] = 0.7
        features["genre"] = "ambient"

    # Activity overrides the mood-derived energy; walking takes precedence
    if _mentions("activity", "walking"):
        features["tempo"] = 100
        features["energy"] = 0.5
    elif _mentions("activity", "running"):
        features["tempo"] = 140
        features["energy"] = 0.8

    return features

In [None]:
# user_features = map_inputs_to_features(environment, mood)

# print("Mapped Features:", user_features)

NameError: name 'environment' is not defined

In [None]:
def filter_songs(preprocessed_dataset, user_features):
    """Return the rows of the dataset that match the requested features.

    A row is kept when all of the following hold (bounds are inclusive):
      * "valence" is within 0.2 of user_features["valence"]
      * "energy" is within 0.2 of user_features["energy"]
      * "tempo" is within 20 of user_features["tempo"]
      * "track_genre" equals user_features["genre"]

    Other entries of `user_features` (e.g. "acousticness") are not used.

    NOTE(review): the tempo comparison is only meaningful when the dataset's
    "tempo" column and user_features["tempo"] are in the same units; confirm
    against the preprocessing step.
    """
    target_valence = user_features["valence"]
    target_energy = user_features["energy"]
    target_tempo = user_features["tempo"]

    valence_ok = preprocessed_dataset["valence"].between(target_valence - 0.2, target_valence + 0.2)
    energy_ok = preprocessed_dataset["energy"].between(target_energy - 0.2, target_energy + 0.2)
    tempo_ok = preprocessed_dataset["tempo"].between(target_tempo - 20, target_tempo + 20)
    genre_ok = preprocessed_dataset["track_genre"] == user_features["genre"]

    return preprocessed_dataset[valence_ok & energy_ok & tempo_ok & genre_ok]


In [None]:
def recommend_songs(user_input, preprocessed_dataset, top_n=10):
    """Recommend songs for a free-text description of mood and surroundings.

    Pipeline: `process_user_input_with_model` parses the text,
    `map_inputs_to_features` turns the result into target features (which are
    printed), and `filter_songs` selects matching rows. When nothing matches,
    the most popular tracks are returned instead.

    Args:
        user_input: Free-text description typed by the user.
        preprocessed_dataset: DataFrame of tracks with at least the columns
            used by `filter_songs` plus "popularity".
        top_n: Maximum number of rows to return (defaults to 10, the value
            that was previously hard-coded).

    Returns:
        DataFrame with at most `top_n` rows of `preprocessed_dataset`.

    NOTE(review): `mood` here is the joined "mood" sentences, while
    `map_inputs_to_features` compares it against the labels
    "positive"/"negative"/"neutral"; as written the mood does not influence
    the result.
    """
    # Process user input using the updated function
    environment, mood = process_user_input_with_model(user_input)

    # Map user inputs to feature values
    user_features = map_inputs_to_features(environment, mood)
    print("Mapped Features for Filtering:", user_features)

    # Filter songs from the dataset
    recommendations = filter_songs(preprocessed_dataset, user_features).head(top_n)

    # Handle case where no songs are found
    if recommendations.empty:
        print("No exact matches found. Recommending popular tracks instead.")
        by_popularity = preprocessed_dataset.sort_values(by="popularity", ascending=False)

        # The same track can occupy several rows of the dataset, which filled
        # the fallback list with repeats of one song. Keep only the most
        # popular row per (track_name, artists) pair.
        dedupe_columns = [
            column for column in ("track_name", "artists")
            if column in by_popularity.columns
        ]
        if dedupe_columns:
            by_popularity = by_popularity.drop_duplicates(subset=dedupe_columns)

        recommendations = by_popularity.head(top_n)

    return recommendations


In [None]:
# Example user input: one sentence about mood followed by one about weather
# and location.
user_input = "I am feeling stressed after a long day. It's snowing and freezing outside in Boston."

# Recommend songs. `recommend_songs` prints the mapped feature targets and, if
# the filter matches nothing, a notice that it is falling back to the most
# popular tracks.
recommended_songs = recommend_songs(user_input, preprocessed_dataset)

# Display results, restricted to the human-readable columns
print("Recommended Songs:")
print(recommended_songs[["track_name", "artists", "track_genre", "popularity"]])


Mapped Features for Filtering: {'valence': 0.5, 'energy': 0.5, 'tempo': 120, 'acousticness': 0.8, 'genre': 'acoustic'}
No exact matches found. Recommending popular tracks instead.
Recommended Songs:
                                  track_name                  artists  \
81051              Unholy (feat. Kim Petras)     Sam Smith;Kim Petras   
20001              Unholy (feat. Kim Petras)     Sam Smith;Kim Petras   
51664  Quevedo: Bzrp Music Sessions, Vol. 52         Bizarrap;Quevedo   
68303                             La Bachata            Manuel Turizo   
81210                        I'm Good (Blue)  David Guetta;Bebe Rexha   
67356                             La Bachata            Manuel Turizo   
88410                             La Bachata            Manuel Turizo   
30003                        I'm Good (Blue)  David Guetta;Bebe Rexha   
89411                             La Bachata            Manuel Turizo   
20008                        I'm Good (Blue)  David Guetta;Bebe Rexha  