In [2]:
%run helper.ipynb

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [24]:
class ContentBasedWeightedAverageModel:
    
    def __init__(self, meta_df, X_train, y_train):        
        X_train['overall'] = y_train
        self._user_product_matrix = pd.crosstab(X_train.reviewerID, X_train.asin, X_train.overall, aggfunc='max')
        
        tfidf = TfidfVectorizer(smooth_idf = False, sublinear_tf = True)
        tfidf_matrix = tfidf.fit_transform(meta_df['metadata'])
        content_correlation = cosine_similarity(tfidf_matrix, tfidf_matrix)
        self._similarity = pd.DataFrame(content_correlation, index=meta_df.asin.tolist(), columns=meta_df.asin.tolist())
        self._similarity = self._similarity.drop_duplicates()
        self._similarity = self._similarity.loc[:,~self._similarity.columns.duplicated()]
        
    def predict(self, X):
        results = np.array([])
        for i, row in X.iterrows():
            results = np.append(results, self._predict(row))
        return results            
    
    def _predict(self, X):
        if X['asin'] in self._similarity and X['asin'] in self._similarity.index and X['reviewerID'] in self._user_product_matrix.index:
            ratings_scores = self._user_product_matrix.loc[X['reviewerID']] 
            ratings_scores = ratings_scores.dropna()
            
            if len(ratings_scores.index.values) > 0:
                sim_scores = self._similarity[X['asin']]
                if all(i in sim_scores.index.values for i in ratings_scores.index.values):
                    sim_scores = sim_scores.loc[ratings_scores.index.values]
                    if sim_scores.sum() != 0:
                        return np.dot(ratings_scores, sim_scores)/sim_scores.sum()
        
        return 2.5

In [4]:
X_train, y_train = load_train(subset_name)
X_test, y_test = load_test(subset_name)
meta = load_meta(subset_name)

In [26]:
model = ContentBasedWeightedAverageModel(meta, X_train, y_train)
y_pred = model.predict(X_test)
print_score(y_test, y_pred)

RMSE: 2.013653702055771
MAE: 1.872760358342665



In [27]:
save_preds(model, subset_name, y_pred)

In [5]:
X_train

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,reviewerID,sentence,clean_sentence,tok_lem_sentence,metadata
0,2444,194387,B0000025P9,A1IKOYZVFHO1XP,"In the spring of 1982,Toto graced radio airwav...",in the spring of 1982 toto graced radio airwav...,"['in', 'the', 'spring', 'of', '1982', 'toto', ...",toto
1,2968,231193,B0000026WD,A2PV6GK1HV54Y9,Dangerous by Michael Jackson could easily be h...,dangerous by michael jackson could easily be h...,"['dangerous', 'by', 'michael', 'jackson', 'cou...",dangerous
2,23691,2732064,B0169PLOV8,A1HCCW38EQQBTY,"Yes, he's still explicit. Yes, he and Trey So...",yes he still explicit yes he and trey son...,"['yes', 'he', 'still', 'explicit', 'yes', 'he'...",span parentaladvisory class size medium color ...
3,4204,309611,B000002H33,A2C7BOQVFH1HLE,Metallica is boldly and appropriately named as...,metallica is boldly and appropriately named as...,"['metallica', 'is', 'boldly', 'and', 'appropri...",master puppets
4,14170,1078731,B00006HCUT,A3O8YT41TDXL0B,...or those who don't mind buying an album sim...,or those who don t mind buying an album sim...,"['or', 'those', 'who', 'don', 't', 'mind', 'bu...",whitney
...,...,...,...,...,...,...,...,...
11899,8164,549727,B000003BEU,A26NLSTT75FMJM,These Stones role doobies! This album should s...,these stones role doobies this album should s...,"['these', 'stone', 'role', 'doobies', 'this', ...",satanic majesties request
11900,15715,1180942,B00009WHRT,A200C7YQJ45LRR,"Now, people that know me know that I hardly ev...",now people that know me know that i hardly ev...,"['now', 'people', 'that', 'know', 'me', 'know'...",span parentaladvisory class size medium color ...
11901,9142,716286,B00000F1D3,AWPODHOB4GFWL,"""Believe"" was one of those instant classic son...",believe was one of those instant classic son...,"['believe', 'wa', 'one', 'of', 'those', 'insta...",believe
11902,10489,821878,B00003TFN7,A1JIW8GOSSGUQR,Judas Priest enjoyed most of its success throu...,judas priest enjoyed most of its success throu...,"['juda', 'priest', 'enjoyed', 'most', 'of', 'i...",sad wings destiny
