In [1]:
import gzip
import json
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import GaussianNB, MultinomialNB ,BernoulliNB
from Load import json_to_df
import preprocess
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

# Dataset

In [3]:
# Load the training reviews from CSV.
# A forward slash is used as the path separator: it is accepted on Windows as
# well as on POSIX systems, whereas a backslash is only a separator on Windows.
df = pd.read_csv("data/music_reviews.csv")
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,sentiment
0,US,10140119,R3LI5TRP3YIDQL,B00TXH4OLC,384427924,Whatever's for Us: Remastered,Music,5,0,0,N,Y,Five Stars,Love this CD along with other CDs by the same ...,2015-08-31,positive
1,US,27664622,R3LGC3EKEG84PX,B00B6QXN6U,831769051,Same Trailer Different Park,Music,5,0,0,N,Y,A new fave in our house,This is the album that introduced me to Kacey ...,2015-08-31,positive
2,US,45946560,R9PYL3OYH55QY,B001GCZXW6,14067376,Soaring (Jazz Club),Music,5,0,1,N,Y,Five Stars,Excellent / thanks,2015-08-31,positive
3,US,16794688,R15LYP3O51UU9E,B00N1F0BKK,210426072,Pain Killer,Music,5,0,0,N,Y,Five Stars,Purchased as a gift and they loved this cd,2015-08-31,positive
4,US,32203364,R1AD7L0CC3DSRI,B00V7KAO7Q,360249504,A Thoughtiverse Unmarred,Music,5,0,0,N,Y,Definitely worth a listen.,Really enjoyed the content of this album. I b...,2015-08-31,positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4439906,US,53065285,RCCO7QP25QFYU,B0000032CU,734573994,The Vogues - Greatest Hits,Music,5,30,31,N,N,Excellent! Melodic; superb vocal harmonies! A ...,The Vogues are a band that I recently rediscov...,1996-12-20,positive
4439907,US,53058447,R2K6VMIL048HD3,B0000000K4,126873550,Passages,Music,5,6,7,N,N,The upmost delightful sound collection you can...,The collaboration between the indian musician ...,1996-10-18,positive
4439908,US,52852984,R1ZOIUY0UMHPE4,B000001U6M,735790595,Like the Willow Tree,Music,5,4,4,N,N,Wonderful Celtic/American folk,Atwater-Donnelly (note the 'y') is a folk grou...,1996-07-07,positive
4439909,US,52152881,R4X9DR0KSYFXD,B000001FBU,880908728,Creatures of the Night,Music,4,1,1,N,N,Kiss' return to hard rock from Disco. One of t...,After briefly visiting the world of Disco and ...,1996-05-20,positive


In [5]:
# Rename the two text columns to the names used by the rest of the notebook
# (`summary` for the headline, `reviewText` for the body); every other column
# keeps its CSV name.
# The "Unnamed: 0" column that appears in the loaded frame is dropped first if
# it is present. Assigning a fixed 16-name list to `df.columns` raises a
# length-mismatch ValueError whenever that extra column is still in the frame,
# and a name-based rename is also safe to re-run.
df = df.drop(columns=["Unnamed: 0"], errors="ignore").rename(
    columns={"review_headline": "summary", "review_body": "reviewText"}
)

## Preprocessing
- combining the review text and the summary into a single text field
- encoding the targets as integers (positive → 1, negative → 0)
- replacing missing values with empty strings

### Train set

In [6]:
# Replace every NaN in the training frame with an empty string
# (presumably so the text columns can be concatenated without producing NaN).
df = df.replace(to_replace=np.nan, value='', regex=True)

In [7]:
# Model input text: the review body followed by the headline, separated by a
# single space. This overwrites `summary`, so re-running the cell appends the
# review body a second time.
df["summary"] = df["reviewText"] + " " + df["summary"]

In [8]:
# Keep only the combined text and the label for training.
data = df.loc[:, ["summary", "sentiment"]]

In [9]:
# Encode the string labels as integers: positive -> 1, negative -> 0.
sentiment_codes = {"positive": 1, "negative": 0}
data = data.replace({"sentiment": sentiment_codes})

### Validation set

In [12]:
# Load the validation reviews through the project helper `json_to_df`.
# A forward slash is used as the path separator: it is accepted on Windows as
# well as on POSIX systems, whereas a backslash is only a separator on Windows.
df_dev = json_to_df("data/music_reviews_dev.json")

In [13]:
# Replace every NaN in the validation frame with an empty string,
# mirroring the treatment of the training frame.
df_dev = df_dev.replace(to_replace=np.nan, value='', regex=True)

In [14]:
# Model input text for validation: review body, one space, then the summary.
# This overwrites `summary`, so re-running the cell appends the body again.
df_dev["summary"] = df_dev["reviewText"] + " " + df_dev["summary"]

In [15]:
# Keep the combined text and the label, then encode the label as an integer
# (positive -> 1, negative -> 0), exactly as for the training data.
data_dev = df_dev[["summary", "sentiment"]].replace(
    {"sentiment": {"positive": 1, "negative": 0}}
)

## Pipeline
- TF-IDF weighted word features (`TfidfVectorizer`)
- multinomial naive Bayes classifier (`MultinomialNB`)

In [10]:
# Baseline text classifier: TF-IDF features (default settings) feeding a
# multinomial naive Bayes model (default settings).
tfidf_step = ("TFIDFVectorizer_transform", TfidfVectorizer())
classifier_step = ("MultinomialNB", MultinomialNB())
pipeline = Pipeline(steps=[tfidf_step, classifier_step])

In [11]:
# Fit the vectorizer and the classifier on the training texts and labels.
# The fitted pipeline is the cell's last expression, so its repr is displayed.
train_texts, train_labels = data["summary"], data["sentiment"]
pipeline.fit(train_texts, train_labels)

Pipeline(steps=[('TFIDFVectorizer_transform', TfidfVectorizer()),
                ('MultinomialNB', MultinomialNB())])

## Validation
Using the F1 score on the validation set to evaluate the model.

In [16]:
# Predict a label for every validation text with the fitted pipeline.
dev_texts = data_dev["summary"]
preds = pipeline.predict(dev_texts)

In [17]:
# F1 score on the validation set.
# scikit-learn metrics take (y_true, y_pred) in that order; the ground-truth
# labels therefore come first and the predictions second.
f1_score(data_dev["sentiment"], preds)

0.7318140852323934

In [19]:
import pickle

In [21]:
# Persist the fitted pipeline to disk.
# The `with` block guarantees the file is flushed and closed even if pickling
# fails; passing a bare `open(...)` to `pickle.dump` leaves the handle open.
with open("baseline_model.pickle", "wb") as model_file:
    pickle.dump(pipeline, model_file)

In [16]:
# Confusion matrix on the validation set, drawn as an annotated heatmap.
# The matrix is transposed before plotting so that the heatmap rows are the
# predicted labels and the columns are the true labels, matching the axis
# titles below.
mat = confusion_matrix(data_dev["sentiment"], preds)
fig, ax = plt.subplots()
sns.heatmap(mat.T, square=True, annot=True, fmt="d", ax=ax)
ax.set_xlabel("true labels")
ax.set_ylabel("predicted label")
plt.show()

NameError: name 'confusion_matrix' is not defined

In [None]:
# Load the masked test reviews through the project helper `json_to_df`.
# A forward slash is used as the path separator: it is accepted on Windows as
# well as on POSIX systems, whereas a backslash is only a separator on Windows.
df_test = json_to_df("data/music_reviews_test_masked.json")

In [None]:
# Replace every NaN in the test frame with an empty string,
# mirroring the treatment of the training and validation frames.
df_test = df_test.replace(to_replace=np.nan, value='', regex=True)

In [None]:
# Model input text for the test set: review body, one space, then the summary.
# This overwrites `summary`, so re-running the cell appends the body again.
df_test["summary"] = df_test["reviewText"] + " " + df_test["summary"]

In [None]:
# Keep only the combined text and the `sentiment` column of the test frame.
# NOTE(review): the file is named "masked" -- confirm the `sentiment` column
# actually exists in it, otherwise this selection raises a KeyError.
data_test = df_test.loc[:, ["summary", "sentiment"]]

In [None]:
# Predict a label for every test text with the fitted pipeline.
test_texts = data_test["summary"]
preds_test = pipeline.predict(test_texts)

In [None]:
# Display the test frame for a visual check (rich repr of the cell's last expression).
df_test

In [None]:
# Raw confusion matrix on the validation set.
# scikit-learn expects (y_true, y_pred): with the ground truth first, rows are
# the true labels and columns the predicted labels, consistent with the
# `confusion_matrix` call used for the heatmap in this notebook.
confusion_matrix(data_dev["sentiment"], preds)

In [None]:
# Write the test predictions to a CSV file. The frame has a single column and
# is saved with pandas' default settings (index and header included).
submission = pd.DataFrame(preds_test)
submission.to_csv("submission.csv")