<a href="https://colab.research.google.com/github/Geeth-Rath/A-ML/blob/main/lyrics_model_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Train on the merged dataset

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    RegexTokenizer, StopWordsRemover, HashingTF, StringIndexer
)
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train and evaluate a multinomial Naive Bayes genre classifier on song lyrics.
# Flow: load the CSV -> keep the needed columns -> drop incomplete rows -> cap every
# genre at a fixed number of rows -> fit a tokenise / stop-word / term-frequency /
# Naive Bayes pipeline on 80% of the rows -> report accuracy on the remaining 20%.

# Tunable settings, gathered in one place instead of being buried in the steps below.
DATASET_PATH = "Merged dataset.csv"
SAMPLES_PER_GENRE = 1000        # upper bound on rows kept for each genre
NUM_HASH_FEATURES = 10000       # width of the hashed term-frequency vector
TRAIN_TEST_WEIGHTS = [0.8, 0.2]
SEED = 42

# 1. Start Spark session
spark = (
    SparkSession.builder
    .appName("LyricsGenreClassifier_NaiveBayes")
    .getOrCreate()
)

# 2. Load dataset
# NOTE(review): read with default CSV options; if lyrics fields contain embedded
# newlines the reader may need multiLine=True -- confirm against the actual file.
data = spark.read.csv(DATASET_PATH, header=True, inferSchema=True)

# 3. Select only necessary columns
columns_to_keep = ['artist_name', 'track_name', 'release_date', 'genre', 'lyrics']
data = data.select(*columns_to_keep)

# 4. Drop rows with missing values
data = data.dropna(subset=["lyrics", "genre", "release_date"])

# 5. Balance dataset: at most SAMPLES_PER_GENRE rows per genre
genres = [row["genre"] for row in data.select("genre").distinct().collect()]
if not genres:
    # Fail with a clear message instead of an AttributeError on None further down.
    raise ValueError(f"No usable rows found in {DATASET_PATH!r} after dropping missing values")

balanced_data = None
for g in genres:
    genre_df = data.filter(col("genre") == g).limit(SAMPLES_PER_GENRE)
    balanced_data = genre_df if balanced_data is None else balanced_data.union(genre_df)

# limit() has no ordering, so re-evaluating the plan may pick different rows each time.
# Cache and materialise the balanced set once so the seeded randomSplit below works on
# a single fixed snapshot and the train/test partitions stay disjoint.
balanced_data = balanced_data.cache()
balanced_data.count()

# 6. Label encoding
label_indexer = StringIndexer(inputCol="genre", outputCol="label")

# 7. Text features (no IDF)
tokenizer = RegexTokenizer(inputCol="lyrics", outputCol="tokens", pattern="\\W")
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
tf = HashingTF(inputCol="filtered", outputCol="features", numFeatures=NUM_HASH_FEATURES)

# 8. Classifier
classifier = NaiveBayes(featuresCol="features", labelCol="label", modelType="multinomial")

# 9. Build pipeline
pipeline = Pipeline(stages=[label_indexer, tokenizer, remover, tf, classifier])

# 10. Train/test split
train_data, test_data = balanced_data.randomSplit(TRAIN_TEST_WEIGHTS, seed=SEED)

# 11. Train model
model = pipeline.fit(train_data)

# 12. Evaluate model
predictions = model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction"
)
accuracy = evaluator.evaluate(predictions)

print(f"Test Accuracy (Naive Bayes): {accuracy:.2f}")

Test Accuracy (Naive Bayes): 0.72


In [67]:
# Persist the fitted pipeline model to Google Drive.
# A plain save() raises if the target directory already exists, which breaks any
# re-run of this cell; write().overwrite() replaces the previous copy instead.
model.write().overwrite().save("/content/drive/MyDrive/saved_models/lyrics_genre_classifier_with merge_dataset")

In [68]:
import shutil

# Directory holding the saved pipeline model, and the file name of the archive to build.
model_path = "/content/drive/MyDrive/saved_models/lyrics_genre_classifier_with merge_dataset"
zip_path = "lyrics_genre_model_merge.zip"

# make_archive appends the extension itself, so hand it the name without ".zip".
archive_base = zip_path.replace(".zip", "")

# Pack the contents of the model folder; the returned archive path is the cell output.
shutil.make_archive(base_name=archive_base, format="zip", root_dir=model_path)



'/content/lyrics_genre_model_merge.zip'