In [1]:
#import necessary libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [3]:
# Read the lyrics dataset from 'data.csv' into the DataFrame "data".
data = pd.read_csv(filepath_or_buffer='data.csv')
# Ending the cell on the bare name lets the notebook render the frame.
data

Unnamed: 0,Lyric,Genre
0,"Handy dandy, controversy surrounds him. He bee...",Rock
1,"Same bed, but it feels just a little bit bigge...",Rock
2,Saw ya out by the pool on the 8th of July. Did...,Pop
3,Got my shit down super tight. Got my shit down...,Hip Hop
4,[talk:]. Ay soulja boy in da buildin. Ay i wan...,Hip Hop
...,...,...
54995,I can't believe what you did to me. Down on my...,unknown
54996,Have all the songs been written?. Have all the...,unknown
54997,Everything you do you do so right. The clothes...,unknown
54998,(trecho). (Rule Number Two. Understanding what...,unknown


In [4]:
#Splitting Data into Training, Testing, and Holdout

# The last 5000 rows are the set to predict (the displayed tail rows carry
# the placeholder genre 'unknown').
testing = data.tail(5000)

# The first 50000 rows are the labelled candidates. head(50000) and
# tail(5000) overlap whenever the frame has fewer than 55000 rows; the
# displayed frame ends at 'Unnamed: 0' == 54998, which suggests 54999 rows
# (TODO confirm with len(data)). Dropping the test labels keeps the two sets
# disjoint; errors='ignore' makes this a no-op when there is no overlap.
training = data.head(50000).drop(testing.index, errors='ignore')

# Carve a fixed-seed 5000-row holdout out of the training rows so accuracy
# can be estimated on data the model never saw during fitting.
holdout = training.sample(5000, random_state=99)
training = training.drop(holdout.index)

In [5]:
#Build a classifier
# TfidfVectorizer already performs token counting *and* tf-idf weighting
# (idf enabled and smoothed by default), so its output feeds MultinomialNB
# directly. Chaining a TfidfTransformer after it would apply the idf
# weighting and normalisation a second time.
pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english',lowercase=True,ngram_range=(1, 2),max_df=0.4, min_df=4)),
    ('clf', MultinomialNB(alpha=0.1))
])

#Train the classifier
# Fit on the training rows only; the holdout rows were removed from
# `training` when the split was made.
pipeline.fit(training['Lyric'], training['Genre'])

#Estimate accuracy of the model on the holdout set
Prediction = pipeline.predict(holdout['Lyric'])
est_acc = accuracy_score(holdout['Genre'],Prediction)
# Persist the single accuracy value as a one-line, header-less CSV.
pd.Series(est_acc).to_csv('ea.csv',index=False, header=False)

#Predict testing set
# One predicted genre per test row, written in test-row order without
# index or header.
TestPrediction = pipeline.predict(testing['Lyric'])
pd.Series(TestPrediction).to_csv('pred.csv',index=False, header=False)