In [9]:
%run helper.ipynb

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [11]:
class SentimentAnalysis:
    
    def __init__(self, X_train, y_train):
        self._pipeline = Pipeline([('vect', CountVectorizer(max_df=0.5, max_features=50000, ngram_range=(1,2))),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', LogisticRegression(C=100, penalty='l2'))])

        self._pipeline.fit(X_train['sentence'], y_train)        
    
    def predict(self, X):
        return self._pipeline.predict(X['sentence'])

In [12]:
class UserCollabSentimentWeightedAverageModel:
    
    def __init__(self, sa_model, X_train, y_train, similarity='cosine'):
        X_train['overall'] = sa_model.predict(X_train)
        self._user_product_matrix = pd.crosstab(X_train.reviewerID, X_train.asin, X_train.overall, aggfunc='max')
        
        if similarity == 'cosine': 
            cos_similarity = cosine_similarity(self._user_product_matrix.copy().fillna(0))
            self._similarity = pd.DataFrame(cos_similarity, index=self._user_product_matrix.index)
            self._similarity.columns = self._user_product_matrix.index
        elif similarity == 'pearson':
            pea_similarity = np.corrcoef(self._user_product_matrix.copy().fillna(0))
            self._similarity = pd.DataFrame(pea_similarity, index=self._user_product_matrix.index)
            self._similarity.columns = self._user_product_matrix.index
        else:
            raise Exception
    
    def predict(self, X):
        results = np.array([])
        for i, row in X.iterrows():
            results = np.append(results, self._predict(row))
        return results
            
    
    def _predict(self, X):
        if X['asin'] in self._user_product_matrix and X['reviewerID'] in self._user_product_matrix.index:
            sim_scores = self._similarity[X['reviewerID']] 
            ratings_scores = self._user_product_matrix[X['asin']] 

            index_not_rated = ratings_scores[ratings_scores.isnull()].index
            ratings_scores = ratings_scores.dropna()
            sim_scores = sim_scores.drop(index_not_rated)

            if sim_scores.sum() != 0:
                return np.dot(ratings_scores, sim_scores)/sim_scores.sum()
        
        return 2.5

In [13]:
X_train, y_train = load_train(subset_name)
X_train_sa, y_train_sa = load_train_sa(subset_name)
X_test, y_test = load_test(subset_name)

In [14]:
sa_model = SentimentAnalysis(X_train_sa, y_train_sa)
y_pred_sa = sa_model.predict(X_test)
print_score(y_test, y_pred_sa)
model = UserCollabSentimentWeightedAverageModel(sa_model, X_train, y_train)
y_pred = model.predict(X_test)
print_score(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


RMSE: 0.7680270968530732
MAE: 0.35750279955207165

RMSE: 1.1363303884671383
MAE: 0.7812746726824031



In [15]:
save_preds(model, subset_name, y_pred)

In [16]:
%run helper.ipynb
for u in user_ids:
    save_preds(model, subset_name, model.predict(get_user_pred_data(u, subset_name)), f=u)