In [1]:
pip install tensorflow




In [2]:
pip install gensim scikit-learn pandas numpy



In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from keras.regularizers import l2
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, SimpleRNN, GlobalAveragePooling1D, Flatten, Concatenate
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix,f1_score, precision_score, recall_score
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle


**Load the Dataset**

In [4]:
# Load the datasets
#train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/train_half.csv')
#test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

#train_df=pd.read_csv("/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/train.csv")
#test_df=pd.read_csv("/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test.csv")


# ***Predict genre based on song lyrics***

Preprocessing

In [5]:
def preprocess(train_df, test_df):
    # Fill missing 'Lyrics' with empty strings
    train_df['Lyrics'] = train_df['Lyrics'].fillna("")
    test_df['Lyrics'] = test_df['Lyrics'].fillna("")

    # Drop rows where 'Lyrics' or 'Genre' is missing in the training set, and 'Lyrics' is missing in the test set
    train_df.dropna(subset=["Lyrics", "Genre"], inplace=True)
    test_df.dropna(subset=["Lyrics"], inplace=True)

    # Convert 'Lyrics' to lowercase
    train_df['Lyrics'] = train_df['Lyrics'].astype(str).str.lower()
    test_df['Lyrics'] = test_df['Lyrics'].astype(str).str.lower()

    return train_df, test_df

In [6]:
def vectorization(train_df,test_df):
  tfidf_vectorizer = TfidfVectorizer(max_features=5000)
  train_vec = tfidf_vectorizer.fit_transform(train_df['Lyrics'])
  test_vec = tfidf_vectorizer.transform(test_df['Lyrics'])
  return train_vec,test_vec

In [7]:
def enc(train_df,test_df):
  label_encoder=LabelEncoder()
  train_encoded_labels=label_encoder.fit_transform(train_df["Genre"])
  test_encoded_labels=label_encoder.transform(test_df["Genre"])
  return train_encoded_labels,test_encoded_labels

In [8]:
def encoding(train_df,test_df):
  label_encoder = LabelEncoder()
  train_encoded_labels = label_encoder.fit_transform(train_df["Genre"])
  train_labels = to_categorical(train_encoded_labels)

   #num_classes = np.max(train_labels) + 1
  test_encoded_labels=label_encoder.transform(test_df['Genre'])
  test_labels = to_categorical(test_encoded_labels)
  return train_labels,test_labels,label_encoder

In [9]:
def tokenizing(train_df,test_df):
  # Tokenize and pad the sequences
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(train_df['Lyrics'])
  max_length = max(max(len(s.split()) for s in train_df['Lyrics']), max(len(s.split()) for s in test_df['Lyrics']))
  vocab_size = len(tokenizer.word_index) + 1

  train_sequences = tokenizer.texts_to_sequences(train_df['Lyrics'])
  train_data = pad_sequences(train_sequences, maxlen=max_length)
  test_sequences = tokenizer.texts_to_sequences(test_df['Lyrics'])
  test_data = pad_sequences(test_sequences, maxlen=max_length)
  return train_data,test_data,max_length,vocab_size

In [10]:
def document_vector(word2vec_model, doc):
    doc = [word for word in doc.split() if word in word2vec_model.key_to_index]
    if not doc:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model[doc], axis=0)

**SVM**

In [11]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess(train_df,test_df)
train_encoded_labels,test_encoded_labels=enc(train_df,test_df)
train_vec,test_vec=vectorization(train_df,test_df)

svm_classifier = LinearSVC(C=1.0, random_state=42)

#lr= LogisticRegression(max_iter=1000)
svm_classifier.fit(train_vec,train_encoded_labels)

y_pred = svm_classifier.predict(test_vec)
accuracy = accuracy_score(test_encoded_labels, y_pred)
print("Accuracy ",accuracy)
# Evaluate the model
cm = confusion_matrix(test_encoded_labels, y_pred)
print("Confusion matrix",cm)
print("Validation Set Performance:")
print(classification_report(test_encoded_labels, y_pred))


#Save the model
model_file="/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/svm.pkl"
with open(model_file,'wb') as f:
  pickle.dump(svm_classifier,f)

print("Model saved")

Accuracy  0.38461538461538464
Confusion matrix [[0 4 0]
 [0 3 1]
 [0 3 2]]
Validation Set Performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.30      0.75      0.43         4
           2       0.67      0.40      0.50         5

    accuracy                           0.38        13
   macro avg       0.32      0.38      0.31        13
weighted avg       0.35      0.38      0.32        13

Model saved


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**RNN**

In [31]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess(train_df,test_df)
train_labels,test_labels,label_encoder=encoding(train_df,test_df)
train_data,test_data,max_length,vocab_size=tokenizing(train_df,test_df)

num_classes = train_labels.shape[1]
rnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=max_length),
    SimpleRNN(64, return_sequences=False),
    Dense(num_classes, activation='softmax')
])

rnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
rnn_model.summary()
#train the model
rnn=rnn_model.fit(train_data, train_labels, epochs=5,batch_size=32,validation_split=0.2)
#find the F1 score
y_pred_prob=rnn_model.predict(test_data)
y_pred=np.argmax(y_pred_prob,axis=1)
y_true=np.argmax(test_labels,axis=1)
f1=f1_score(y_true,y_pred,average='weighted')
print(f"F1 score:{f1}")
# Calculate Precision and Recall
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')

#check the accuracy
lo, acc = rnn_model.evaluate(test_data, test_labels)
print(f'Test accuracy: {acc}')
print(f'Loss:{lo}')
#Save the model
rnn_model.save("/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/RNN_new_model.h5")
print("RNN Model saved")

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_15 (Embedding)    (None, 868, 50)           382250    
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 64)                7360      
                                                                 
 dense_20 (Dense)            (None, 3)                 195       
                                                                 
Total params: 389805 (1.49 MB)
Trainable params: 389805 (1.49 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
F1 score:0.34294871794871795
Precision: 0.32867132867132864
Recall: 0.46153846153846156
Test accuracy: 0.4615384638309479
Loss:1.234043002128601


  _warn_prf(average, modifier, msg_start, len(result))
  saving_api.save_model(


RNN Model saved


**LSTM**

In [30]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess(train_df,test_df)
train_labels,test_labels,label_encoder=encoding(train_df,test_df)
train_data,test_data,max_length,vocab_size=tokenizing(train_df,test_df)

lstm_model = Sequential([
    Embedding(input_dim=vocab_size,output_dim=100, input_length=max_length),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2,recurrent_dropout=0.2, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01)),
    Dense(len(label_encoder.classes_), activation='softmax',kernel_regularizer=l2(0.01))
])
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()
#train the model
lstm=lstm_model.fit(train_data,train_labels, epochs=5, batch_size=32, validation_split=0.2, verbose=2)
#find the F1 score
y_pred_prob=lstm_model.predict(test_data)
y_pred=np.argmax(y_pred_prob,axis=1)
y_true=np.argmax(test_labels,axis=1)
f1=f1_score(y_true,y_pred,average='weighted')
print(f"F1 score:{f1}")

# Calculate Precision and Recall
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')
#check the accuracy
test_loss, test_accuracy = lstm_model.evaluate(test_data, test_labels)
print(f'Test accuracy: {test_accuracy}')
print(f'Loss:{test_loss}')
#Save the model
lstm_model.save("/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/LSTM_model.h5")
print("LSTM Model saved")



Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 868, 100)          764500    
                                                                 
 spatial_dropout1d_1 (Spati  (None, 868, 100)          0         
 alDropout1D)                                                    
                                                                 
 lstm_5 (LSTM)               (None, 100)               80400     
                                                                 
 dense_19 (Dense)            (None, 3)                 303       
                                                                 
Total params: 845203 (3.22 MB)
Trainable params: 845203 (3.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
19/19 - 64s - loss: 3.0994 - accuracy: 0.4726 - val_loss: 2.1256 - val_acc

  _warn_prf(average, modifier, msg_start, len(result))


Test accuracy: 0.38461539149284363
Loss:1.5718786716461182


  saving_api.save_model(


LSTM Model saved


**CNN**

In [32]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess(train_df,test_df)
train_labels,test_labels,label_encoder=encoding(train_df,test_df)
train_data,test_data,max_length,vocab_size=tokenizing(train_df,test_df)

cnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_length),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(units=10, activation='relu'),
    Dense(units=len(label_encoder.classes_), activation='softmax')
])
cnn_model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
cnn_model.summary()
#train the model
cnn= cnn_model.fit(train_data,train_labels, epochs=5, batch_size=32, validation_split=0.2, verbose=2)
#find the F1 score
y_pred_prob=cnn_model.predict(test_data)
y_pred=np.argmax(y_pred_prob,axis=1)
y_true=np.argmax(test_labels,axis=1)
f1=f1_score(y_true,y_pred,average='weighted')
print(f"F1 score:{f1}")

# Calculate Precision and Recall
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')

#check the accuracy
test_loss, test_accuracy = cnn_model.evaluate(test_data, test_labels)
print(f'Test accuracy: {test_accuracy}')
print(f'Loss:{test_loss}')
#save the model
cnn_model.save("/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/CNN_model.h5")
print("CNN Model saved")

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    (None, 868, 100)          764500    
                                                                 
 conv1d_3 (Conv1D)           (None, 864, 128)          64128     
                                                                 
 global_max_pooling1d_3 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_21 (Dense)            (None, 10)                1290      
                                                                 
 dense_22 (Dense)            (None, 3)                 33        
                                                                 
Total params: 829951 (3.17 MB)
Trainable params: 829951 (3.17 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

  _warn_prf(average, modifier, msg_start, len(result))
  saving_api.save_model(


CNN Model saved


**Embedding on the fly**

In [15]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess(train_df,test_df)
train_labels,test_labels,label_encoder=encoding(train_df,test_df)
train_data,test_data,max_length,vocab_size=tokenizing(train_df,test_df)

max_features = 5000
maxlen = 400
embedding_dims = 50

emb_model = Sequential()
embedding_size=8
emb_model.add(Embedding(max_features,embedding_size,input_length=max_length))
emb_model.add(Flatten())
emb_model.add(Dense(200, activation='relu'))
emb_model.add(Dense(3,activation='softmax'))

emb_model.summary()

emb_model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

emb = emb_model.fit(train_data, train_labels,batch_size=32,epochs=5,validation_split=0.2)

#find the F1 score
y_pred_prob=emb_model.predict(test_data)
y_pred=np.argmax(y_pred_prob,axis=1)
y_true=np.argmax(test_labels,axis=1)
f1=f1_score(y_true,y_pred,average='weighted')
print(f"F1 score:{f1}")

# Calculate Precision and Recall
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')

#check the accuracy
test_loss, test_accuracy = emb_model.evaluate(test_data, test_labels)
print(f'Test accuracy: {test_accuracy}')
print(f'Loss:{test_loss}')

#Save the model
emb_model.save("/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/emb_model.h5")
print("Embedding on the fly Model saved")

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 868, 8)            40000     
                                                                 
 flatten (Flatten)           (None, 6944)              0         
                                                                 
 dense_4 (Dense)             (None, 200)               1389000   
                                                                 
 dense_5 (Dense)             (None, 3)                 603       
                                                                 
Total params: 1429603 (5.45 MB)
Trainable params: 1429603 (5.45 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
F1 score:0.3153846153846154
Precision: 0.2673992673992674
Recall: 0.38461538461538464
Test accurac

  _warn_prf(average, modifier, msg_start, len(result))
  saving_api.save_model(


Embedding on the fly Model saved


**Pretrained word embedding**

In [16]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess(train_df,test_df)
train_labels,test_labels,label_encoder=encoding(train_df,test_df)
train_data,test_data,max_length,vocab_size=tokenizing(train_df,test_df)

word2vec_path = '/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/GoogleNews-vectors-negative300.bin'  # Example for the same directory; adjust as needed
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
#def document_vector(word2vec_model, doc):
#    doc = [word for word in doc.split() if word in word2vec_model.key_to_index]
#    if not doc:
#        return np.zeros(word2vec_model.vector_size)
#    return np.mean(word2vec_model[doc], axis=0)

train_df['vec'] = train_df['Lyrics'].apply(lambda x: document_vector(word2vec, x))
test_df['vec']=test_df["Lyrics"].apply(lambda x: document_vector(word2vec, x))

X = np.array(train_df['vec'].tolist())
y = np.array(test_df['vec'].tolist())


pre_model = Sequential()
#No need for an embedding layer
pre_model.add(Dense(200, activation='relu',input_shape=(300,)))
pre_model.add(Dense(3,activation='softmax'))

pre_model.summary()

pre_model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

pre = pre_model.fit(X, train_labels,batch_size=32,epochs=3,validation_split=0.2)

#find the F1 score
y_pred_prob=emb_model.predict(test_data)
y_pred=np.argmax(y_pred_prob,axis=1)
y_true=np.argmax(test_labels,axis=1)
f1=f1_score(y_true,y_pred,average='weighted')
print(f"F1 score:{f1}")

# Calculate Precision and Recall
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')

#check the accuracy
test_loss, test_accuracy = pre_model.evaluate(y, test_labels)
print(f'Test accuracy: {test_accuracy}')
print(f'Loss:{test_loss}')

#Save the model
pre_model.save("/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/pre_model.h5")
print("Pretrained word Model saved")

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 200)               60200     
                                                                 
 dense_7 (Dense)             (None, 3)                 603       
                                                                 
Total params: 60803 (237.51 KB)
Trainable params: 60803 (237.51 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/3
Epoch 2/3
Epoch 3/3
F1 score:0.3153846153846154
Precision: 0.2673992673992674
Recall: 0.38461538461538464
Test accuracy: 0.3076923191547394
Loss:1.5274726152420044


  _warn_prf(average, modifier, msg_start, len(result))
  saving_api.save_model(


Pretrained word Model saved


**Pretrained word embedding and logisitic regression**

In [17]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess(train_df,test_df)
train_labels,test_labels,label_encoder=encoding(train_df,test_df)
train_data,test_data,max_length,vocab_size=tokenizing(train_df,test_df)

#!unzip '/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/GoogleNews-vectors-negative300.bin.zip' -d '/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/'
word2vec_path = '/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/GoogleNews-vectors-negative300.bin'  # Example for the same directory; adjust as needed
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
#def document_vector(word2vec_model, doc):
#    doc = [word for word in doc.split() if word in word2vec_model.key_to_index]
#    if not doc:
#        return np.zeros(word2vec_model.vector_size)
#    return np.mean(word2vec_model[doc], axis=0)

train_df['vec'] = train_df['Lyrics'].apply(lambda x: document_vector(word2vec, x))
test_df['vec']=test_df["Lyrics"].apply(lambda x: document_vector(word2vec, x))

X = np.array(train_df['vec'].tolist())
y = np.array(test_df['vec'].tolist())

#lr_model = LogisticRegression(max_iter=1000)
#lr_model.fit(X,train_encoded_labels)

svm = LinearSVC(C=1.0, random_state=42)

#lr= LogisticRegression(max_iter=1000)
svm.fit(X,train_encoded_labels)
y_pred = svm.predict(y)
accuracy = accuracy_score(test_encoded_labels, y_pred)
print("Accuracy ",accuracy)
# Evaluate the model
cm = confusion_matrix(test_encoded_labels, y_pred)
print("Confusion matrix",cm)
print("Validation Set Performance:")
print(classification_report(test_encoded_labels, y_pred))


#Save the model
model_file="/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/l_reg.pkl"
with open(model_file,'wb') as f:
  pickle.dump(svm,f)

print("Model saved")

Accuracy  0.38461538461538464
Confusion matrix [[0 3 1]
 [0 3 1]
 [0 3 2]]
Validation Set Performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.33      0.75      0.46         4
           2       0.50      0.40      0.44         5

    accuracy                           0.38        13
   macro avg       0.28      0.38      0.30        13
weighted avg       0.29      0.38      0.31        13



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model saved


# ***Predict genre based on song lyrics and artist***

In [18]:
def preprocess_2(train_df, test_df):
    # Fill missing 'Lyrics' with empty strings
    train_df['Lyrics'] = train_df['Lyrics'].fillna("")
    test_df['Lyrics'] = test_df['Lyrics'].fillna("")
    train_df["Artist"]=train_df["Artist"].fillna("")
    test_df["Artist"]=test_df["Artist"].fillna("")

    # Drop rows where 'Lyrics' or 'Genre' is missing in the training set, and 'Lyrics' is missing in the test set
    train_df.dropna(subset=["Lyrics","Artist","Genre"], inplace=True)
    test_df.dropna(subset=["Lyrics","Artist"], inplace=True)


    # Convert 'Lyrics' to lowercase
    train_df['Lyrics'] = train_df['Lyrics'].astype(str).str.lower()
    test_df['Lyrics'] = test_df['Lyrics'].astype(str).str.lower()
    train_df['Artist'] = train_df['Artist'].astype(str).str.lower()
    test_df['Artist'] = test_df['Artist'].astype(str).str.lower()
    return train_df, test_df

In [19]:
def vectorization_2(train_df,test_df):
  # Combine 'Lyrics' and 'Artist' into a single string per row for both train and test DataFrames
  train_combined = train_df['Lyrics'] + " " + train_df['Artist']
  test_combined = test_df['Lyrics'] + " " + test_df['Artist']
  tfidf_vectorizer = TfidfVectorizer(max_features=5000)
  train_vec = tfidf_vectorizer.fit_transform(train_combined)
  test_vec = tfidf_vectorizer.transform(test_combined)
  return train_vec,test_vec

In [20]:
def enc_2(train_df,test_df):
  label_encoder=LabelEncoder()
  train_encoded_labels=label_encoder.fit_transform(train_df["Genre"])
  test_encoded_labels=label_encoder.transform(test_df["Genre"])
  return train_encoded_labels,test_encoded_labels

In [21]:
def encoding_2(train_df,test_df):
  label_encoder = LabelEncoder()
  train_encoded_labels = label_encoder.fit_transform(train_df["Genre"])
  train_labels = to_categorical(train_encoded_labels)

   #num_classes = np.max(train_labels) + 1
  test_encoded_labels=label_encoder.transform(test_df['Genre'])
  test_labels = to_categorical(test_encoded_labels)
  return train_labels,test_labels,label_encoder

In [22]:
def tokenizing_2(train_df, test_df):
    # Initialize tokenizers
    lyrics_tokenizer = Tokenizer()
    artist_tokenizer = Tokenizer()

    # Fit the tokenizers
    lyrics_tokenizer.fit_on_texts(train_df['Lyrics'])
    artist_tokenizer.fit_on_texts(train_df['Artist'])

    # Find the maximum length for padding
    lyrics_max_length = max(max(len(s.split()) for s in train_df['Lyrics']),
                            max(len(s.split()) for s in test_df['Lyrics']))
    artist_max_length = max(max(len(s.split()) for s in train_df['Artist']),
                            max(len(s.split()) for s in test_df['Artist']))

    # Vocabulary sizes
    lyrics_vocab_size = len(lyrics_tokenizer.word_index) + 1
    artist_vocab_size = len(artist_tokenizer.word_index) + 1

    # Tokenize and pad the sequences
    # Lyrics
    train_lyrics_sequences = lyrics_tokenizer.texts_to_sequences(train_df['Lyrics'])
    train_lyrics_data = pad_sequences(train_lyrics_sequences, maxlen=lyrics_max_length)
    test_lyrics_sequences = lyrics_tokenizer.texts_to_sequences(test_df['Lyrics'])
    test_lyrics_data = pad_sequences(test_lyrics_sequences, maxlen=lyrics_max_length)

    # Artist
    train_artist_sequences = artist_tokenizer.texts_to_sequences(train_df['Artist'])
    train_artist_data = pad_sequences(train_artist_sequences, maxlen=artist_max_length)
    test_artist_sequences = artist_tokenizer.texts_to_sequences(test_df['Artist'])
    test_artist_data = pad_sequences(test_artist_sequences, maxlen=artist_max_length)

    return (train_lyrics_data, train_artist_data, test_lyrics_data, test_artist_data,
            lyrics_max_length, artist_max_length, lyrics_vocab_size, artist_vocab_size)

***SVM***

In [23]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess_2(train_df,test_df)
train_encoded_labels,test_encoded_labels=enc_2(train_df,test_df)
train_vec,test_vec=vectorization_2(train_df,test_df)

svm_classifier = LinearSVC(C=1.0, random_state=42)

#lr= LogisticRegression(max_iter=1000)
svm_classifier.fit(train_vec,train_encoded_labels)

y_pred = svm_classifier.predict(test_vec)
accuracy = accuracy_score(test_encoded_labels, y_pred)
print("Accuracy ",accuracy)
# Evaluate the model
cm = confusion_matrix(test_encoded_labels, y_pred)
print("Confusion matrix",cm)
print("Validation Set Performance:")
print(classification_report(test_encoded_labels, y_pred))


#Save the model
model_file="/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/svm.pkl"
with open(model_file,'wb') as f:
  pickle.dump(svm_classifier,f)

print("Model saved")

Accuracy  0.46153846153846156
Confusion matrix [[0 4 0]
 [0 3 1]
 [0 2 3]]
Validation Set Performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.33      0.75      0.46         4
           2       0.75      0.60      0.67         5

    accuracy                           0.46        13
   macro avg       0.36      0.45      0.38        13
weighted avg       0.39      0.46      0.40        13

Model saved


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


***RNN***

In [33]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess_2(train_df,test_df)
train_labels,test_labels,label_encoder=encoding_2(train_df,test_df)
train_lyrics_data, train_artist_data, test_lyrics_data, test_artist_data, lyrics_max_length, artist_max_length, lyrics_vocab_size, artist_vocab_size=tokenizing_2(train_df, test_df)

num_classes = train_labels.shape[1]
# Define inputs
lyrics_input = Input(shape=(lyrics_max_length,), dtype='int32', name='lyrics_input')
artist_input = Input(shape=(artist_max_length,), dtype='int32', name='artist_input')

# Embeddings
lyrics_embedding = Embedding(input_dim=lyrics_vocab_size, output_dim=50, input_length=lyrics_max_length)(lyrics_input)
artist_embedding = Embedding(input_dim=artist_vocab_size, output_dim=50, input_length=artist_max_length)(artist_input)

# Process Lyrics Embedding
lyrics_rnn = SimpleRNN(64)(lyrics_embedding)  # Consider using LSTM or GRU for better performance

# Process Artist Embedding
artist_rnn = SimpleRNN(64)(artist_embedding)  # Same here, LSTM or GRU could be more appropriate

# Merge the outputs from both RNNs
merged = Concatenate(axis=-1)([lyrics_rnn, artist_rnn])

# Output layer
output = Dense(num_classes, activation='softmax')(merged)

# Build and compile the model as before
rnn_model = Model(inputs=[lyrics_input, artist_input], outputs=output)
rnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Now, train the model. Make sure to pass the inputs as a list in the same order as defined.
rnn_model.fit([train_lyrics_data, train_artist_data], train_labels, epochs=5, batch_size=32, validation_split=0.2)
#find the F1 score
y_pred_prob=rnn_model.predict([test_lyrics_data, test_artist_data])
y_pred=np.argmax(y_pred_prob,axis=1)
y_true=np.argmax(test_labels,axis=1)
f1=f1_score(y_true,y_pred,average='weighted')
print(f"F1 score:{f1}")

# Calculate Precision and Recall
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')

#check the accuracy
lo, acc = rnn_model.evaluate([test_lyrics_data, test_artist_data], test_labels)
print(f'Test accuracy: {acc}')
print(f'Loss:{lo}')
#Save the model
rnn_model.save("/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/RNN_new_model_lyrics_artist.h5")
print("RNN Model saved")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
F1 score:0.21367521367521367
Precision: 0.14792899408284024
Recall: 0.38461538461538464


  _warn_prf(average, modifier, msg_start, len(result))


Test accuracy: 0.38461539149284363
Loss:2.085087299346924


  saving_api.save_model(


RNN Model saved


***LSTM***

In [29]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess_2(train_df,test_df)
train_labels,test_labels,label_encoder=encoding_2(train_df,test_df)
train_lyrics_data, train_artist_data, test_lyrics_data, test_artist_data, lyrics_max_length, artist_max_length, lyrics_vocab_size, artist_vocab_size=tokenizing_2(train_df, test_df)


# Define two inputs
lyrics_input = Input(shape=(lyrics_max_length,), dtype='int32', name='lyrics_input')
artist_input = Input(shape=(artist_max_length,), dtype='int32', name='artist_input')

# Embeddings
lyrics_embedding = Embedding(input_dim=lyrics_vocab_size, output_dim=100, input_length=lyrics_max_length)(lyrics_input)
artist_embedding = Embedding(input_dim=artist_vocab_size, output_dim=100, input_length=artist_max_length)(artist_input)

# LSTM layers
lyrics_lstm = LSTM(100, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01))(lyrics_embedding)
artist_lstm = LSTM(100, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01))(artist_embedding)

# Concatenate the outputs of the two LSTMs
concatenated = Concatenate()([lyrics_lstm, artist_lstm])

# Add one or more dense layers
dense_layer = Dense(100, activation='relu', kernel_regularizer=l2(0.01))(concatenated)

# Output layer
output = Dense(len(label_encoder.classes_), activation='softmax')(dense_layer)

# Create the model
lstm_model = Model(inputs=[lyrics_input, artist_input], outputs=output)

# Compile the model
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model summary
lstm_model.summary()

# Train the model
# Note: You need to pass a list of numpy arrays as training data: one for lyrics and one for artists
lstm_model.fit([train_lyrics_data, train_artist_data], train_labels, epochs=5, batch_size=32, validation_split=0.2, verbose=2)

#find the F1 score
y_pred_prob=lstm_model.predict([test_lyrics_data, test_artist_data])
y_true=np.argmax(test_labels,axis=1)
y_pred=np.argmax(y_pred_prob,axis=1)
f1=f1_score(y_true,y_pred,average='weighted')
print(f"F1 score:{f1}")

# Calculate Precision and Recall
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')

#check the accuracy
test_loss, test_accuracy = lstm_model.evaluate([test_lyrics_data, test_artist_data], test_labels)
print(f'Loss:{test_loss}')
print(f'Test accuracy: {test_accuracy}')
#Save the model
lstm_model.save("/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/LSTM_model_lyrics_artist.h5")
print("LSTM Model saved")



Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 lyrics_input (InputLayer)   [(None, 868)]                0         []                            
                                                                                                  
 artist_input (InputLayer)   [(None, 3)]                  0         []                            
                                                                                                  
 embedding_12 (Embedding)    (None, 868, 100)             764500    ['lyrics_input[0][0]']        
                                                                                                  
 embedding_13 (Embedding)    (None, 3, 100)               1400      ['artist_input[0][0]']        
                                                                                            

  _warn_prf(average, modifier, msg_start, len(result))


Loss:2.04223370552063
Test accuracy: 0.38461539149284363


  saving_api.save_model(


LSTM Model saved


***CNN***

In [34]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess_2(train_df,test_df)
train_labels,test_labels,label_encoder=encoding_2(train_df,test_df)
train_lyrics_data, train_artist_data, test_lyrics_data, test_artist_data, lyrics_max_length, artist_max_length, lyrics_vocab_size, artist_vocab_size=tokenizing_2(train_df, test_df)

# Inputs
lyrics_input = Input(shape=(lyrics_max_length,), dtype='int32', name='lyrics_input')
artist_input = Input(shape=(artist_max_length,), dtype='int32', name='artist_input')

# Embeddings
lyrics_embedding = Embedding(input_dim=lyrics_vocab_size, output_dim=100, input_length=lyrics_max_length)(lyrics_input)
artist_embedding = Embedding(input_dim=artist_vocab_size, output_dim=100, input_length=artist_max_length)(artist_input)


# CNN layers for each input
lyrics_cnn = Conv1D(filters=128, kernel_size=5, activation='relu')(lyrics_embedding)
lyrics_pooling = GlobalMaxPooling1D()(lyrics_cnn)
artist_cnn = Conv1D(filters=128, kernel_size=3, activation='relu')(artist_embedding)
artist_pooling = GlobalMaxPooling1D()(artist_cnn)

# Concatenate the outputs of the two CNN paths
concatenated = Concatenate()([lyrics_pooling, artist_pooling])

# Dense layers
dense_layer = Dense(units=10, activation='relu')(concatenated)

# Output layer
output = Dense(units=len(label_encoder.classes_), activation='softmax')(dense_layer)

# Create the model
cnn_model = Model(inputs=[lyrics_input, artist_input], outputs=output)

# Compile the model
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
cnn_model.summary()

# Train the model
cnn_model.fit([train_lyrics_data, train_artist_data], train_labels, epochs=5, batch_size=32, validation_split=0.2, verbose=2)
#find the F1 score
y_pred_prob=cnn_model.predict([test_lyrics_data, test_artist_data])
y_pred=np.argmax(y_pred_prob,axis=1)
y_true=np.argmax(test_labels,axis=1)
f1=f1_score(y_true,y_pred,average='weighted')
print(f"F1 score:{f1}")

# Calculate Precision and Recall
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')

#check the accuracy
test_loss, test_accuracy = cnn_model.evaluate([test_lyrics_data, test_artist_data], test_labels)
print(f'Test accuracy: {test_accuracy}')
print(f'Loss:{test_loss}')
#save the model
cnn_model.save("/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/CNN_model_lyrics_artist.h5")
print("CNN Model saved")

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 lyrics_input (InputLayer)   [(None, 868)]                0         []                            
                                                                                                  
 artist_input (InputLayer)   [(None, 3)]                  0         []                            
                                                                                                  
 embedding_19 (Embedding)    (None, 868, 100)             764500    ['lyrics_input[0][0]']        
                                                                                                  
 embedding_20 (Embedding)    (None, 3, 100)               1400      ['artist_input[0][0]']        
                                                                                            

  _warn_prf(average, modifier, msg_start, len(result))
  saving_api.save_model(


CNN Model saved


***Embedding on the fly***

In [35]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess_2(train_df,test_df)
train_labels,test_labels,label_encoder=encoding_2(train_df,test_df)
train_lyrics_data, train_artist_data, test_lyrics_data, test_artist_data, lyrics_max_length, artist_max_length, lyrics_vocab_size, artist_vocab_size=tokenizing_2(train_df, test_df)

embedding_dims=50
# Define model parameters
lyrics_input = Input(shape=(lyrics_max_length,), dtype='int32', name='lyrics_input')
artist_input = Input(shape=(artist_max_length,), dtype='int32', name='artist_input')

# Embedding layers
lyrics_embedding = Embedding(input_dim=lyrics_vocab_size, output_dim=embedding_dims, input_length=lyrics_max_length)(lyrics_input)
artist_embedding = Embedding(input_dim=artist_vocab_size, output_dim=embedding_dims, input_length=artist_max_length)(artist_input)

# Flatten the embeddings
lyrics_flatten = Flatten()(lyrics_embedding)
artist_flatten = Flatten()(artist_embedding)

# Concatenate the flattened outputs
concatenated = Concatenate()([lyrics_flatten, artist_flatten])

# Dense layers
dense_layer = Dense(200, activation='relu')(concatenated)
output = Dense(len(label_encoder.classes_), activation='softmax')(dense_layer)

# Create the model
emb_model = Model(inputs=[lyrics_input, artist_input], outputs=output)

# Compile the model
emb_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
emb_model.summary()

# Train the model
emb_model.fit([train_lyrics_data, train_artist_data], train_labels, batch_size=32, epochs=4, validation_split=0.2)


#find the F1 score
y_pred_prob=emb_model.predict([test_lyrics_data, test_artist_data])
y_pred=np.argmax(y_pred_prob,axis=1)
y_true=np.argmax(test_labels,axis=1)
f1=f1_score(y_true,y_pred,average='weighted')
print(f"F1 score:{f1}")

# Calculate Precision and Recall
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')

#check the accuracy
test_loss, test_accuracy = emb_model.evaluate([test_lyrics_data, test_artist_data], test_labels)
print(f'Test accuracy: {test_accuracy}')
print(f'Loss:{test_loss}')

#Save the model
emb_model.save("/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/emb_model_lyrics_artist.h5")
print("Embedding on the fly Model saved")

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 lyrics_input (InputLayer)   [(None, 868)]                0         []                            
                                                                                                  
 artist_input (InputLayer)   [(None, 3)]                  0         []                            
                                                                                                  
 embedding_21 (Embedding)    (None, 868, 50)              382250    ['lyrics_input[0][0]']        
                                                                                                  
 embedding_22 (Embedding)    (None, 3, 50)                700       ['artist_input[0][0]']        
                                                                                            

  _warn_prf(average, modifier, msg_start, len(result))
  saving_api.save_model(


Embedding on the fly Model saved


***Pretrained word embedding***

In [36]:
train_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/Part1_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Deep_Learning_Assignment/dataset/Multilingual/test_half.csv')

train_df,test_df=preprocess_2(train_df,test_df)
train_labels,test_labels,label_encoder=encoding_2(train_df,test_df)
train_lyrics_data, train_artist_data, test_lyrics_data, test_artist_data, lyrics_max_length, artist_max_length, lyrics_vocab_size, artist_vocab_size=tokenizing_2(train_df, test_df)


word2vec_path = '/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/GoogleNews-vectors-negative300.bin'  # Example for the same directory; adjust as needed
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
#def document_vector(word2vec_model, doc):
#    doc = [word for word in doc.split() if word in word2vec_model.key_to_index]
#    if not doc:
#        return np.zeros(word2vec_model.vector_size)
#    return np.mean(word2vec_model[doc], axis=0)

train_df['vec'] = train_df['Lyrics'].apply(lambda x: document_vector(word2vec, x))
test_df['vec']=test_df["Lyrics"].apply(lambda x: document_vector(word2vec, x))

from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse=False,handle_unknown="ignore")  # Ensure output is a dense matrix

# Fit and transform the artist names
# Reshape(-1, 1) is used to convert the series into a shape that OneHotEncoder expects: a 2D array
artist_train_encoded = encoder.fit_transform(train_df[['Artist']].values.reshape(-1, 1))
artist_test_encoded = encoder.transform(test_df[['Artist']].values.reshape(-1, 1))

X = np.array(train_df['vec'].tolist())
y = np.array(test_df['vec'].tolist())

X_combined=np.concatenate([X, artist_train_encoded], axis=1)
y_combined=np.concatenate([y, artist_test_encoded], axis=1)
input_shape = X_combined.shape[1]


pre_model = Sequential()
#No need for an embedding layer
pre_model.add(Dense(200, activation='relu',input_shape=(input_shape,)))
pre_model.add(Dense(3,activation='softmax'))

pre_model.summary()

pre_model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

pre = pre_model.fit(X_combined, train_labels,batch_size=32,epochs=4,validation_split=0.2)

#find the F1 score
y_pred_prob=pre_model.predict(y_combined)
y_pred=np.argmax(y_pred_prob,axis=1)
y_true=np.argmax(test_labels,axis=1)
f1=f1_score(y_true,y_pred,average='weighted')
print(f"F1 score:{f1}")

# Calculate Precision and Recall
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print(f'Precision: {precision}')
print(f'Recall: {recall}')

#check the accuracy
test_loss, test_accuracy = pre_model.evaluate(y_combined, test_labels)
print(f'Test accuracy: {test_accuracy}')
print(f'Loss:{test_loss}')

#Save the model
pre_model.save("/content/drive/MyDrive/Deep_Learning_Assignment/saved_models/pre_model_Lyrics_Artist.h5")
print("Pretrained word Model saved")



Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_28 (Dense)            (None, 200)               61600     
                                                                 
 dense_29 (Dense)            (None, 3)                 603       
                                                                 
Total params: 62203 (242.98 KB)
Trainable params: 62203 (242.98 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
F1 score:0.15384615384615385
Precision: 0.10256410256410256
Recall: 0.3076923076923077
Test accuracy: 0.3076923191547394
Loss:1.4007920026779175


  _warn_prf(average, modifier, msg_start, len(result))
  saving_api.save_model(


Pretrained word Model saved
