# Imports

In [7]:
import gzip
import json
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split

import re

import nltk
from nltk.corpus import stopwords, brown
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer

In [8]:
# The cells below address files relative to the directory that contains `data/`.
# If `data/` is not visible from the current working directory, step up one level.
if not os.path.exists('data'):
    %cd ..

In [9]:
# Execute the shared helper notebook so that its definitions are available in this kernel.
# NOTE(review): `subset_name`, `min_amount_product_mentions` and `min_amount_user_mentions`
# are used further down but never assigned in this notebook — presumably they are set here; confirm.
%run final_models_split/helper.ipynb

# Data

In [10]:
# NLTK's English stop-word list, held as a set for membership tests.
STOPWORDS = set(stopwords.words('english'))
# Default bounds handed to tokenizer(). Despite the names, tokenizer() compares them
# against the character length of each token (len(w)), not against a number of words.
MIN_WORDS = 4
MAX_WORDS = 200

PATTERN_S = re.compile("\'s")  # matches the two-character sequence `'s`
PATTERN_RN = re.compile("\\r\\n") # matches a carriage return immediately followed by a line feed (CRLF), not a lone `\r` or `\n`
PATTERN_PUNC = re.compile(r"[^\w\s]") # matches any single character that is neither a word character (letter, digit, `_`) nor whitespace

def preprocess_metadata(sentence):
    """
    Reduce one product-metadata string to space-separated keyword tokens.

    Steps, in order: lowercase, drop the literal ``{html}`` marker, strip
    ``<...>`` tags, strip ``http...`` URLs, delete runs of digits, split into
    runs of word characters, then keep tokens that are longer than two
    characters and are not English stop words.

        sentence (str): raw metadata text
    return (str): surviving tokens joined by single spaces
    """
    text = sentence.lower().replace('{html}', "")
    text = re.sub(r'<.*?>', '', text)    # non-greedy: removes each tag separately
    text = re.sub(r'http\S+', '', text)  # URL up to the next whitespace
    text = re.sub(r'[0-9]+', '', text)
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    # Use the module-level STOPWORDS set: calling stopwords.words('english') inside
    # the comprehension rebuilt the list for every token and searched it linearly.
    filtered_words = [w for w in tokens if len(w) > 2 and w not in STOPWORDS]
    # NOTE(review): earlier code also stemmed and lemmatized `filtered_words` but
    # discarded the result; that dead work is removed and the returned value is
    # unchanged. If normalised tokens are wanted, apply the stemmer/lemmatizer here.
    return " ".join(filtered_words)

def clean_text(text):
    """
    Lowercase the text, then blank out `'s` sequences, CRLF pairs and punctuation.

    Every match of PATTERN_S, PATTERN_RN and PATTERN_PUNC (applied in that order)
    is replaced by a single space. Digits and underscores are kept, because
    PATTERN_PUNC treats them as word characters.
        text (str): input text
    return (str): cleaned text
    """
    cleaned = text.lower()
    for pattern in (PATTERN_S, PATTERN_RN, PATTERN_PUNC):
        cleaned = pattern.sub(' ', cleaned)
    return cleaned

def tokenizer(sentence, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=True):
    """
    Tokenize a sentence, optionally lemmatize the tokens, then filter them.

        sentence (str): text to split
        min_words (int): a token is kept only if it has strictly more characters than this
        max_words (int): a token is kept only if it has strictly fewer characters than this
        stopwords (collection of str): tokens to discard (compared after lemmatization)
        lemmatize (bool): run WordNetLemmatizer over every token before filtering
    return (list of str): tokens that passed the length and stop-word filters
    """
    tokens = word_tokenize(sentence)
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(w) for w in tokens]
    # Return the filtered list. Previously the filter result was stored in a
    # separate variable and the unfiltered tokens were returned, so neither the
    # length bounds nor the stop-word list had any effect.
    return [w for w in tokens if min_words < len(w) < max_words and w not in stopwords]


def clean_sentences(df):
    """
    Add cleaned and tokenized variants of the ``sentence`` column to ``df``.

    ``clean_sentence`` holds clean_text() applied to every sentence.
    ``tok_lem_sentence`` holds the tokenizer() result (lemmatization enabled,
    module-level bounds and stop words) for every cleaned sentence.
    The frame is modified in place and also returned.
    """
    def _tokenize(cleaned):
        return tokenizer(cleaned, min_words=MIN_WORDS, max_words=MAX_WORDS,
                         stopwords=STOPWORDS, lemmatize=True)

    df['clean_sentence'] = df['sentence'].apply(clean_text)
    tokenized = df['clean_sentence'].apply(_tokenize)
    df['tok_lem_sentence'] = tokenized
    return df

def _read_json_lines_gz(path):
    """Load a gzip-compressed file holding one JSON object per line into a DataFrame."""
    with gzip.open(path) as f:
        records = [json.loads(line.strip()) for line in f]
    return pd.DataFrame.from_dict(records)


def _flatten_meta_field(value):
    """Render one raw metadata cell as text: comma-join list items, map None/NaN to ''."""
    if isinstance(value, (list, tuple)):
        return ','.join(map(str, value))
    if value is None or (isinstance(value, float) and value != value):  # NaN != NaN
        return ''
    return str(value)


def get_data(subset_name, min_amount_product_mentions, min_amount_user_mentions, test_size=0.3, random_state=42):
    """
    Load (or build and cache) the review and metadata frames, then split the reviews.

    Reviews come from ``data/<subset_name>.json.gz`` and product metadata from
    ``data/meta_<subset_name>.json.gz``. The preprocessed frames are cached as
    ``data/<subset_name>_preprocessed.txt`` and ``data/meta_<subset_name>_preprocessed.txt``
    and are loaded from there whenever those files exist.

    NOTE: the cache file names depend only on ``subset_name``. When a cache file
    exists, ``min_amount_product_mentions`` / ``min_amount_user_mentions`` are not
    re-applied and the metadata text is not rebuilt; delete the cache files to
    regenerate them (e.g. after changing thresholds or preprocessing code).
    NOTE(review): frames loaded from the CSV cache differ from freshly built ones
    (the saved index comes back as an extra column and list-valued cells come back
    as strings) — confirm downstream code tolerates both.

        subset_name (str): base name of the dataset files under ``data/``
        min_amount_product_mentions (int): keep reviews whose ``asin`` occurs at least this often
        min_amount_user_mentions (int): keep reviews whose ``reviewerID`` occurs at least this
            often (counted after the product filter)
        test_size (float): share of the sentiment-analysis half that is held out as test set
        random_state (int): seed for both stratified splits
    return (tuple): rating_df, X_rm, X_sa, X_test, y_rm, y_sa, y_test, meta_df
    """
    rating_cache = os.path.join('data', f'{subset_name}_preprocessed.txt')
    meta_cache = os.path.join('data', f'meta_{subset_name}_preprocessed.txt')

    if os.path.exists(rating_cache):
        print('Loading preprocessed rating data...')
        rating_df = pd.read_csv(rating_cache)
    else:
        print('Loading rating data...')
        rating_df = _read_json_lines_gz(os.path.join('data', f'{subset_name}.json.gz'))

        print('Preprocessing rating_df')
        rating_df = rating_df[['asin', 'reviewerID', 'overall', 'reviewText']]
        rating_df = rating_df.drop_duplicates()
        rating_df = rating_df[rating_df['asin'].map(rating_df['asin'].value_counts()) >= min_amount_product_mentions]
        rating_df = rating_df[rating_df['reviewerID'].map(rating_df['reviewerID'].value_counts()) >= min_amount_user_mentions]
        rating_df = rating_df[~rating_df.reviewText.isna()]
        rating_df = rating_df[~rating_df.overall.isna()]
        print(rating_df.shape)
        # Work on an independent copy: clean_sentences() adds columns, which must
        # not happen on a filtered slice of the raw frame.
        rating_df = rating_df.rename(columns={'reviewText': 'sentence'}).copy()

        print('Cleaning sentences...')
        rating_df = clean_sentences(rating_df)

        print('Saving rating_df...')
        rating_df.to_csv(rating_cache)

    if os.path.exists(meta_cache):
        print('Loading preprocessed meta data...')
        meta_df = pd.read_csv(meta_cache)
    else:
        print('Loading metadata...')
        meta_df = _read_json_lines_gz(os.path.join('data', f'meta_{subset_name}.json.gz'))

        print('Preprocessing metadata')
        # Only products that still have reviews; copy so new columns go to an independent frame.
        meta_df = meta_df[meta_df['asin'].isin(rating_df['asin'].unique())].copy()
        # Flatten each cell to text. The previous astype(str) + ','.join(...) joined the
        # individual characters of the cell's string form ("[,',B,o,o,k,s,',]"), so every
        # token was a single character and was dropped later by preprocess_metadata().
        for column in ('category', 'description', 'feature', 'tech1'):
            meta_df[column] = meta_df[column].apply(_flatten_meta_field)
        # A missing title used to turn the whole concatenation into NaN, which made
        # preprocess_metadata() fail on `.lower()`; treat it as empty text instead.
        title_text = meta_df['title'].apply(_flatten_meta_field)
        meta_df['metadata'] = (meta_df['category'] + ' ' + meta_df['description'] + ' ' + title_text
                               + ' ' + meta_df['feature'] + ' ' + meta_df['tech1'])

        print('Cleaning metadata...')
        meta_df['metadata'] = meta_df['metadata'].map(preprocess_metadata)

        print('Saving meta_df...')
        meta_df.to_csv(meta_cache)

    # Empty metadata strings are read back from the CSV cache as NaN; drop those products.
    meta_df = meta_df[meta_df['metadata'].notnull()]
    rating_df = pd.merge(rating_df, meta_df[['asin', 'metadata']], on='asin')
    rating_df = rating_df[rating_df['metadata'].notnull()]

    # Split the reviews in half: one half for the recommender models (rm),
    # the other half for sentiment analysis (sa). Both splits are stratified by rating.
    X = rating_df.drop(['overall'], axis=1)
    y = rating_df['overall']
    X_rm, X_sa, y_rm, y_sa = train_test_split(X, y, test_size=0.5, stratify=y, random_state=random_state)

    # Hold out `test_size` of the sentiment-analysis half as the test set.
    X_sa, X_test, y_sa, y_test = train_test_split(X_sa, y_sa, test_size=test_size, stratify=y_sa, random_state=random_state)

    return rating_df, X_rm, X_sa, X_test, y_rm, y_sa, y_test, meta_df

In [11]:
# Build (or load from the CSV caches) the merged review frame, the recommender-model half,
# the sentiment-analysis train/test splits and the metadata frame.
# NOTE(review): `subset_name` and the two thresholds are not assigned in this notebook —
# presumably they are defined by the helper notebook that is run earlier; confirm.
rating_df, X_rm, X_sa, X_test, y_rm, y_sa, y_test, meta_df = get_data(subset_name, min_amount_product_mentions, min_amount_user_mentions)

Loading preprocessed rating data...
Loading preprocessed meta data...


In [12]:
# Write the metadata frame and every split to final_models_split/ as
# `<prefix>_<subset_name>.txt`. The recommender-model half is stored under the
# plain "train" prefix, the sentiment-analysis half under "train_sa".
for _prefix, _frame in (
    ('meta', meta_df),
    ('X_train', X_rm),
    ('y_train', y_rm),
    ('X_train_sa', X_sa),
    ('y_train_sa', y_sa),
    ('X_test', X_test),
    ('y_test', y_test),
):
    _frame.to_csv(os.path.join('final_models_split', f'{_prefix}_{subset_name}.txt'))