In [1]:
%run helper.ipynb

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [3]:
class SentimentAnalysis:
    """Text classifier over the ``sentence`` column of a table.

    The model is a scikit-learn pipeline: token counts, then TF-IDF
    re-weighting, then an ``SVC`` with default settings. It is fitted as
    soon as the object is constructed.
    """

    def __init__(self, X_train, y_train):
        """Build the pipeline and fit it.

        Parameters
        ----------
        X_train : mapping/DataFrame with a ``'sentence'`` entry
            Only ``X_train['sentence']`` is read.
        y_train : array-like
            Targets passed straight to ``Pipeline.fit``.
        """
        steps = [
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', SVC()),
        ]
        self._pipeline = Pipeline(steps)

        self._pipeline.fit(X_train['sentence'], y_train)

    def predict(self, X):
        """Return the fitted pipeline's predictions for ``X['sentence']``."""
        sentences = X['sentence']
        return self._pipeline.predict(sentences)

In [4]:
class UserCollabSentimentWeightedAverageModel:
    """User-based collaborative filter whose ratings come from a sentiment model.

    The training ratings are not taken from ``y_train``; they are the
    predictions of ``sa_model`` on ``X_train``. Those ratings are pivoted into
    a user x product matrix (rows: ``reviewerID``, columns: ``asin``), a
    user-user similarity matrix is computed from it, and a prediction for
    (user, product) is the similarity-weighted average of the ratings other
    users gave that product.
    """

    def __init__(self, sa_model, X_train, y_train, similarity='cosine', default_rating=2.5):
        """Build the user x product matrix and the user-user similarity matrix.

        Parameters
        ----------
        sa_model : object with ``predict(X)``
            Called once with ``X_train``; its output is used as the rating
            of each training row.
        X_train : pandas.DataFrame
            Must expose ``reviewerID`` and ``asin`` columns (plus whatever
            ``sa_model.predict`` reads). It is NOT modified.
        y_train : any
            Accepted for interface compatibility; not read.
        similarity : {'cosine', 'pearson'}
            User-user similarity measure, computed on the rating matrix with
            missing ratings replaced by 0.
        default_rating : float, default 2.5
            Value returned when no weighted average can be computed (unknown
            user or product, or the similarity weights sum to zero).

        Raises
        ------
        ValueError
            If ``similarity`` is not one of the supported names.
        """
        if similarity not in ('cosine', 'pearson'):
            raise ValueError(
                "similarity must be 'cosine' or 'pearson', got {!r}".format(similarity)
            )

        self._default_rating = default_rating

        # assign() returns a new frame, so the caller's X_train does not gain
        # (or get overwritten in) an 'overall' column as a side effect.
        ratings = X_train.assign(overall=sa_model.predict(X_train))
        # When a user has several rows for the same product, keep the highest rating.
        self._user_product_matrix = pd.crosstab(
            index=ratings.reviewerID,
            columns=ratings.asin,
            values=ratings.overall,
            aggfunc='max',
        )

        dense = self._user_product_matrix.fillna(0)
        if similarity == 'cosine':
            scores = cosine_similarity(dense)
        else:
            # A user whose row is constant has zero variance, which makes
            # corrcoef divide 0 by 0; silence the warning and clean up below.
            with np.errstate(divide='ignore', invalid='ignore'):
                scores = np.atleast_2d(np.corrcoef(dense.to_numpy()))

        users = self._user_product_matrix.index
        # Undefined (NaN) similarities are treated as "no similarity" so they
        # cannot turn a prediction into NaN.
        self._similarity = pd.DataFrame(scores, index=users, columns=users).fillna(0)

    def predict(self, X):
        """Predict a rating for every row of ``X``.

        ``X`` is a DataFrame with ``reviewerID`` and ``asin`` columns. Returns
        a 1-D float ``numpy`` array with one prediction per row, in row order.
        """
        # Build the array once instead of re-allocating it with np.append per row.
        return np.array([self._predict(row) for _, row in X.iterrows()], dtype=float)

    def _predict(self, X):
        """Predict the rating for a single row ``X`` (needs 'reviewerID' and 'asin').

        Returns the similarity-weighted mean of the known ratings of the
        product, or the default rating when the user or product was not seen
        in training or the weights sum to zero.
        """
        user, item = X['reviewerID'], X['asin']
        matrix = self._user_product_matrix

        if item in matrix.columns and user in matrix.index:
            # Only users who actually rated this product contribute.
            item_ratings = matrix[item].dropna()
            weights = self._similarity.loc[item_ratings.index, user]

            # NOTE(review): with 'pearson' the weights can be negative, so the
            # result is not guaranteed to stay inside the rating range.
            total = weights.sum()
            if total != 0:
                return np.dot(item_ratings.to_numpy(), weights.to_numpy()) / total

        return self._default_rating

In [5]:
# Load the data splits used by the cells below. The loader functions and
# `subset_name` are not defined in this notebook -- presumably they are
# provided by helper.ipynb (run in the first cell); verify there.
X_train, y_train = load_train(subset_name)
# Separate training split used only to fit the sentiment classifier.
X_train_sa, y_train_sa = load_train_sa(subset_name)
X_test, y_test = load_test(subset_name)

In [6]:
# Fit the sentiment classifier on the sentiment split and score its direct
# predictions on the test set (first RMSE/MAE pair in the recorded output).
sa_model = SentimentAnalysis(X_train_sa, y_train_sa)
y_pred_sa = sa_model.predict(X_test)
print_score(y_test, y_pred_sa)
# Build the collaborative-filtering model. Its constructor uses sa_model's
# predictions on X_train as the ratings; y_train is passed but not read by it.
model = UserCollabSentimentWeightedAverageModel(sa_model, X_train, y_train)
y_pred = model.predict(X_test)
# Second RMSE/MAE pair in the recorded output.
print_score(y_test, y_pred)

RMSE: 0.9478271254814521
MAE: 0.45716685330347145

RMSE: 1.1897934065837557
MAE: 0.790019501857464



In [7]:
# Hand the test-set predictions to save_preds (defined outside this notebook,
# presumably in helper.ipynb -- where/how it stores them is not visible here).
save_preds(model, subset_name, y_pred)

In [None]:
%run helper.ipynb
# For every id in `user_ids`, predict on that user's data and save the result.
# `user_ids`, `get_user_pred_data` and `save_preds` are not defined in this
# notebook (presumably helper.ipynb); the meaning of `f=u` is assumed to be a
# per-user output name -- TODO confirm against save_preds.
for u in user_ids:
    save_preds(model, subset_name, model.predict(get_user_pred_data(u, subset_name)), f=u)