# Imports

In [1]:
import gzip
import json
import numpy as np
import os
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline

import nltk
from nltk.corpus import stopwords, brown
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer


from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Parameters

In [2]:
subset_name = 'Magazine_Subscriptions'
min_amount_product_mentions = 4
min_amount_user_mentions = 4

STOPWORDS = set(stopwords.words('english'))
MIN_WORDS = 4
MAX_WORDS = 200

PATTERN_S = re.compile("\'s")  # matches `'s` from text  
PATTERN_RN = re.compile("\\r\\n") #matches `\r` and `\n`
PATTERN_PUNC = re.compile(r"[^\w\s]") # matches all non 0-9 A-z whitespace

# Data

In [3]:
def preprocess_metadata(sentence):
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer() 
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)

def clean_text(text):
    """
    Series of cleaning. String to lower case, remove non words characters and numbers.
        text (str): input text
    return (str): modified initial text
    """
    text = text.lower()  # lowercase text
    text = re.sub(PATTERN_S, ' ', text)
    text = re.sub(PATTERN_RN, ' ', text)
    text = re.sub(PATTERN_PUNC, ' ', text)
    return text

def tokenizer(sentence, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=True):
    """
    Lemmatize, tokenize, crop and remove stop words.
    """
    if lemmatize:
        stemmer = WordNetLemmatizer()
        tokens = [stemmer.lemmatize(w) for w in nltk.word_tokenize(sentence)]
    else:
        tokens = [w for w in word_tokenize(sentence)]
    token = [w for w in tokens if (len(w) > min_words and len(w) < max_words
                                                        and w not in stopwords)]
    return tokens    


def clean_sentences(df):
    """
    Remove irrelavant characters (in new column clean_sentence).
    Lemmatize, tokenize words into list of words (in new column tok_lem_sentence).
    """
    print('Cleaning sentences...')
    df['clean_sentence'] = df['sentence'].apply(clean_text)
    df['tok_lem_sentence'] = df['clean_sentence'].apply(
        lambda x: tokenizer(x, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=True))
    return df

def get_data(subset_name, min_amount_product_mentions, min_amount_user_mentions, test_size=0.1, random_state=42):    
    if os.path.exists(os.path.join('data', f'meta_{subset_name}_preprocessed.txt')):
        meta_df = pd.read_csv(os.path.join('data', f'meta_{subset_name}_preprocessed.txt'))
    else:
        data = []
        with gzip.open(os.path.join('data', f'meta_{subset_name}.json.gz')) as f:
            for l in f:
                data.append(json.loads(l.strip()))
        meta_df = pd.DataFrame.from_dict(data)

        meta_df['category'] = meta_df['category'].apply(lambda x: ','.join(map(str, x)))
        meta_df['description'] = meta_df['description'].apply(lambda x: ','.join(map(str, x)))
        meta_df['feature'] = meta_df['feature'].apply(lambda x: ','.join(map(str, x)))
        meta_df['metadata'] = meta_df['category'] + ' ' + meta_df['description'] + ' ' + meta_df['title'] + ' ' + meta_df['feature']

        meta_df['metadata'] = meta_df['metadata'].map(lambda x:preprocess_metadata(x)) 
        meta_df.to_csv(os.path.join('data', f'meta_{subset_name}_preprocessed.txt'))
    
    if os.path.exists(os.path.join('data', f'{subset_name}_preprocessed.txt')):
        rating_df = pd.read_csv(os.path.join('data', f'{subset_name}_preprocessed.txt'))
    else:
        data = []
        with gzip.open(os.path.join('data', f'{subset_name}.json.gz')) as f:
            for l in f:
                data.append(json.loads(l.strip()))
        rating_df = pd.DataFrame.from_dict(data)
        rating_df = rating_df[['asin', 'reviewerID', 'overall', 'reviewText']]

        rating_df = rating_df.drop_duplicates()    
        rating_df = rating_df[rating_df['asin'].map(rating_df['asin'].value_counts()) >= min_amount_product_mentions]
        rating_df = rating_df[rating_df['reviewerID'].map(rating_df['reviewerID'].value_counts()) >= min_amount_user_mentions]
        rating_df = rating_df[~rating_df.reviewText.isna()]
        rating_df.rename(columns={'reviewText':'sentence'}, inplace=True)
        rating_df = clean_sentences(rating_df)
        rating_df.to_csv(os.path.join('data', f'{subset_name}_preprocessed.txt'))
    
    rating_df = pd.merge(rating_df, meta_df[['asin', 'metadata']], on='asin')
    
    # split rating df in half -> half is used for sentiment analysis and the other half for the recommender models
    X = rating_df.copy().drop(['overall'], axis=1)
    y = rating_df['overall']
    
    X_rm, X_sa, y_rm, y_sa = train_test_split(X, y, test_size=0.5, stratify=y, random_state=random_state)
        
    # split data of the sa half for the test set
    X_sa, X_test, y_sa, y_test = train_test_split(X_sa, y_sa, test_size=test_size, stratify=y_sa, random_state=random_state)
                        
    return rating_df, X_rm, X_sa, X_test, y_rm, y_sa, y_test, meta_df

In [4]:
 rating_df, X_rm, X_sa, X_test, y_rm, y_sa, y_test, meta_df = get_data(subset_name, min_amount_product_mentions, min_amount_user_mentions)

# Helper Methods

In [5]:
def print_score(model, X, y):
    y_pred = model.predict(X)
    
    print(model.__class__.__name__)
    print(f'RMSE: {np.sqrt(mean_squared_error(y, y_pred))}')
    print(f'MAE: {mean_absolute_error(y, y_pred)}')
    print('')

In [6]:
def get_all_product_ids():
    return rating_df['asin'].unique()

In [7]:
def get_k_recommendations(model, user_id, k=5):    
    # only recommend items not reviewed by the user before
    not_rated_product_ids = [x for x in get_all_product_ids() if x not in set(rating_df[rating_df['reviewerID'] == 'AEJAGHLC675A7']['asin'].values)]
        
    user_product_pairs = pd.DataFrame({
        'asin': not_rated_product_ids,
        'reviewerID': [user_id]*len(not_rated_product_ids)
    })
    user_product_pairs = pd.merge(user_product_pairs, meta_df[['asin', 'metadata']], on='asin')
    user_product_pairs['predicted_rating'] = model.predict(user_product_pairs)
    return user_product_pairs.sort_values(by='predicted_rating', ascending=False).head(k)

In [8]:
def print_k_recommendations(model, user_id, k=5):
    print(f'User {user_id} has previously enjoyed:')
    previously_rated = rating_df[rating_df['reviewerID']==user_id].sort_values(by='overall',ascending=False).head(k)['asin'].values
    print_product_titles(previously_rated)
    print('')
    print('We now recommend him:')
    recommendations =  get_k_recommendations(model, user_id, k)['asin'].values
    print_product_titles(recommendations)
    print('')

In [9]:
def print_product_titles(ids):
    titles = meta_df[meta_df['asin'].isin(ids)]['title'].values
    for title in titles:
        print(title)

# Models

## Base Model

In [10]:
class BaseModel:
    
    def __init__(self, X_train, y_train):
        self._X_train = X_train.copy()
        self._y_train = y_train.copy()
        
    def predict(self, X):
        return [2.5] * len(X)        

## Content-based Filtering

In [11]:
class ContentBasedModel(BaseModel):
    
    def __init__(self, X_train, y_train, meta_df):
        self._pipeline = Pipeline([('Vectorizer', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('SVM classifier', SVC())])

        self._pipeline.fit(X_train['metadata'], y_train)
    
    def predict(self, X):
        return self._pipeline.predict(X['metadata'])

## Collab Filtering

In [12]:
class CollabWeightedAverageModel(BaseModel):
    
    def __init__(self, X_train, y_train, similarity='cosine'):
        BaseModel.__init__(self, X_train, y_train)
        
        self._X_train['overall'] = self._y_train
        self._user_product_matrix = pd.crosstab(self._X_train.reviewerID, self._X_train.asin, self._X_train.overall, aggfunc='max')
        
        if similarity == 'cosine': 
            cos_similarity = cosine_similarity(self._user_product_matrix.copy().fillna(0))
            self._similarity = pd.DataFrame(cos_similarity, index=self._user_product_matrix.index)
            self._similarity.columns = self._user_product_matrix.index
        elif similarity == 'pearson':
            pea_similarity = np.corrcoef(self._user_product_matrix.copy().fillna(0))
            self._similarity = pd.DataFrame(pea_similarity, index=self._user_product_matrix.index)
            self._similarity.columns = self._user_product_matrix.index
        else:
            raise Exception
    
    def predict(self, X):
        results = np.array([])
        for i, row in X.iterrows():
            results = np.append(results, self._predict(row))
        return results
            
    
    def _predict(self, X):
        if X['asin'] in self._user_product_matrix and X['reviewerID'] in self._user_product_matrix.index:
            sim_scores = self._similarity[X['reviewerID']] 
            ratings_scores = self._user_product_matrix[X['asin']] 

            index_not_rated = ratings_scores[ratings_scores.isnull()].index
            ratings_scores = ratings_scores.dropna()
            sim_scores = sim_scores.drop(index_not_rated)

            if sim_scores.sum() != 0:
                return np.dot(ratings_scores, sim_scores)/sim_scores.sum()
        
        return 2.5

In [13]:
class CollabKnnModel(BaseModel):
    
    def __init__(self, X_train, y_train, similarity='cosine', k=10):
        BaseModel.__init__(self, X_train, y_train)
        
        self._X_train['overall'] = y_train
        self._user_product_matrix = pd.crosstab(self._X_train.reviewerID, self._X_train.asin, self._X_train.overall, aggfunc='max')
        
        self._k = k        
        
        if similarity == 'cosine': 
            cos_similarity = cosine_similarity(self._user_product_matrix.copy().fillna(0))
            self._similarity = pd.DataFrame(cos_similarity, index=self._user_product_matrix.index)
            self._similarity.columns = self._user_product_matrix.index
        elif similarity == 'pearson':
            pea_similarity = np.corrcoef(self._user_product_matrix.copy().fillna(0))
            self._similarity = pd.DataFrame(pea_similarity, index=self._user_product_matrix.index)
            self._similarity.columns = self._user_product_matrix.index
        else:
            raise Exception
            
    def predict(self, X):
        results = np.array([])
        for i, row in X.iterrows():
            results = np.append(results, self._predict(row))
        return results
    
    def _knn_filtered(self, user_id, product_id, k):
        return self._similarity[user_id][~np.isnan(self._user_product_matrix[product_id])].sort_values(ascending=False).head(k)
    
    def _predict(self, X):
        if X['asin'] in self._user_product_matrix and X['reviewerID'] in self._user_product_matrix.index:
            neighbours = self._knn_filtered(X['reviewerID'], X['asin'], self._k)

            if not len(neighbours):
                return 2.5
            
            ratings = self._user_product_matrix[X['asin']][neighbours.index.values].to_numpy().astype(float)
            weights = neighbours.values.astype(float)
            
            if weights.sum() != 0:
                return np.dot(ratings, weights)/weights.sum()
        return 2.5

## Sentiment Analysis

In [14]:
class SentimentAnalysis:
    
    def __init__(self, X_train, y_train):
        self._pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SVC())])

        self._pipeline.fit(X_train['sentence'], y_train)
        
    
    def predict(self, X):
        return self._pipeline.predict(X['sentence'])

## Collab Filtering With Sentiment Analysis

In [15]:
class WeightedAverageModelSentimentAnalysis(CollabWeightedAverageModel):
    
    def __init__(self, sa_model, X_train, y_train, similarity='cosine'):
        review_text_ratings = sa_model.predict(X_train)
        y_train = review_text_ratings
        
        CollabWeightedAverageModel.__init__(self, X_train, y_train, similarity)

## Hybrid

In [16]:
class Hybrid:
    
    def __init__(self, content, collab, sentiment):
        self._content = content
        self._collab = collab
        self._sentiment = sentiment
        
    def predict(self, X):
        pred_content = self._content.predict(X)
        pred_collab = self._collab.predict(X)
        pred_sentiment = self._sentiment.predict(X)
        return (pred_content + pred_collab + pred_sentiment)/3

# Testing

In [17]:
X_rm.shape

(5201, 7)

In [18]:
X_sa.shape

(4680, 7)

In [19]:
X_test.shape

(521, 7)

In [20]:
base_model = BaseModel(X_rm, y_rm)
content_model = ContentBasedModel(X_rm, y_rm, meta_df)
collab_avg_model = CollabWeightedAverageModel(X_rm, y_rm)
collab_knn_model = CollabKnnModel(X_rm, y_rm, 'pearson', 10)
sa_model = SentimentAnalysis(X_sa, y_sa)
collab_sa_model = WeightedAverageModelSentimentAnalysis(sa_model, X_rm, y_rm)

NameError: name 'WeightedAverageModel' is not defined

In [None]:
print_score(sa_model, X_test, y_test)

In [None]:
print_score(base_model, X_test, y_test)
print_score(content_model, X_test, y_test)
print_score(collab_avg_model, X_test, y_test)
print_score(collab_knn_model, X_test, y_test)
print_score(collab_sa_model, X_test, y_test)

In [None]:
hybrid_model = Hybrid(content_model, collab_knn_model, collab_sa_model)
print_score(hybrid_model, X_test, y_test)

In [None]:
print_k_recommendations(hybrid_model, 'AE06RDYJF5SKY', 10)