In [1]:
import pandas as pd, numpy as np
import nltk

from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
train = pd.read_csv('datasets/Train.csv')

In [3]:
test = pd.read_csv('datasets/Test.csv')

In [4]:
def get_uncommon_words(stringlist, max):
    uncommon_words= {}
    seen = {""}
    
    for string in stringlist:
        for word in nltk.word_tokenize(string):
            count = uncommon_words.get(word)
            if word not in seen: 
                seen.add(word)
                uncommon_words.update({word: 1})
            # If it's already been seen the max number of times, take it out of uncommon_words
            elif count == max: 
                uncommon_words.pop(word)
            # If it hasn't been seen max times, increment its count in uncommon_words
            elif count is not None:
                uncommon_words.update({word: count+1})
    return uncommon_words.keys()
            
    

In [5]:
train.head()

Unnamed: 0,reviewerID,amazon-id,helpful,unixReviewTime,reviewText,overall,reviewTime,summary,price,categories,root-genre,title,artist,label,first-release-year,songs,salesRank,related
0,-4984057859803657856,1877521326299865484,"[2, 2]",1302739200,Very nice music for practicing my Tai Chi. I d...,4,"04 14, 2011",Beautiful,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160..."
1,9136764282801708742,1877521326299865484,"[11, 11]",1180396800,I recently starting doing Tai Chi which I love...,5,"05 29, 2007",Tranquillity In Motion !!!,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160..."
2,2164551966908582519,1877521326299865484,"[0, 0]",1361404800,My wife uses it for her class room the kids lo...,5,"02 21, 2013",Great Stuff,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160..."
3,-7309200698931694843,1877521326299865484,"[4, 4]",1338163200,We bought this music to go Dr Lam DVD. The mus...,5,"05 28, 2012",Beautiful,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160..."
4,-4461682407031037732,1877521326299865484,"[0, 0]",1396310400,It helps me do my exercise because it sets the...,5,"04 1, 2014",tai chi music,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160..."


In [6]:
"""
handle_text: takes out stop words and punctuation, lemmatizes, and converts to lowercase
* input: string
* output: hanlded string
"""
def handle_text(text):
    # Check that argument provided is a string
    if not isinstance(text,str):
        return ""
    output = []
    stop_words = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()
    for word in nltk.word_tokenize(text): 
        # Disclude stop words and words with punctuation
        word = word.lower()
        if word not in stop_words and word.isalpha():
            # Add lemmatized words
            output.append(wordnet_lemmatizer.lemmatize(word))
    return " ".join(output) # return the list of words as one single string

In [7]:
"""
convert_text: applies handle_text to all summaries and reviews in dataframe
* input: dataframe
* output: new dataframe with converted summaries/reviews
"""
def convert_text(df):
    df = df.assign(summary=df['summary'].apply(handle_text))
    df = df.assign(reviewText=df['reviewText'].apply(handle_text))
    return df

In [8]:
"""
get_uncommon_words: gets all words that occur less than the 'max' times in the data
* inputs: list_of_strings to get the uncommon words from
         max: integer of the max number of times a word can occur and still be considered 'uncommon'
* output: dictionary key set of uncommon words
"""
def get_uncommon_words(list_of_strings, max):
    uncommon_words = {} # dictionary of uncommon words where key=word, item=frequency
    seen = {""} # set of 'seen' words to avoid re-adding words to uncommon_words
    for string in list_of_strings:
        for word in nltk.word_tokenize(string): 
            count = uncommon_words.get(word)
            # If we haven't seen the word yet, add it to seen and uncommon_words
            if word not in seen: 
                seen.add(word)
                uncommon_words.update({word: 1})
            # If it's already been seen the max number of times, take it out of uncommon_words
            elif count == max: 
                uncommon_words.pop(word)
            # If it hasn't been seen max times, increment its count in uncommon_words
            elif count is not None:
                uncommon_words.update({word: count+1})
    return uncommon_words.keys() # return only the words, not the counts

In [9]:
"""
remove_words: removes all words in provided list from a given string
* inputs: string (to remove words from)
          words_to_remove: list of strings indicating which words to remove
* output: string (without provided words)
"""
def remove_words(string, words_to_remove):
    list_of_words = nltk.word_tokenize(string)
    for word in list_of_words: 
        # remove all words that occur in words_to_remove
        if word in words_to_remove: 
            list_of_words.remove(word)
    return " ".join(list_of_words) # return as a string

In [10]:
"""
remove_words_summaries: removes all words in provided list from the dataframe's summaries
* inputs: dataframe (to remove words from)
          words_to_remove: list of strings indicating which words to remove
* output: new dataframe (without the given words)
"""
def remove_words_reviews(df, words_to_remove):
    return df.assign(reviewText=df['reviewText'].apply(remove_words, words_to_remove=words_to_remove))

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [12]:
def prep_df(df):
    # Convert to lowercase, remove punctuation and stop words, lemmatize words
    df = convert_text(df)
    uncommon_words = get_uncommon_words(df['reviewText'], 2)
     # Remove uncommon words
    return remove_words_reviews(df, uncommon_words)

In [13]:
train_new = prep_df(train)

In [14]:
train_new.head()

Unnamed: 0,reviewerID,amazon-id,helpful,unixReviewTime,reviewText,overall,reviewTime,summary,price,categories,root-genre,title,artist,label,first-release-year,songs,salesRank,related
0,-4984057859803657856,1877521326299865484,"[2, 2]",1302739200,nice music practicing tai chi downloaded phone...,4,"04 14, 2011",beautiful,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160..."
1,9136764282801708742,1877521326299865484,"[11, 11]",1180396800,recently starting tai chi love adding cd pract...,5,"05 29, 2007",tranquillity motion,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160..."
2,2164551966908582519,1877521326299865484,"[0, 0]",1361404800,wife us class room kid love loved price great ...,5,"02 21, 2013",great stuff,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160..."
3,-7309200698931694843,1877521326299865484,"[4, 4]",1338163200,bought music go dr lam dvd music perfect give ...,5,"05 28, 2012",beautiful,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160..."
4,-4461682407031037732,1877521326299865484,"[0, 0]",1396310400,help exercise set proper mood happy quality pl...,5,"04 1, 2014",tai chi music,16.47,"['CDs & Vinyl', 'New Age']",New Age,-3267874170410107454,-7180760356347753735,Cdbaby/Cdbaby,,"[7058439142327364074, 6037075874942075284, 852...",27222,"{'also_bought': [-404470919165672227, 11968160..."


In [22]:
def awesome_products(df):
    df = df.groupby('amazon-id').agg({'overall': lambda x: 1 if np.mean(x) > 4.5 else 0})
    return df['overall']

In [29]:


def combine_summaries(train_df, test_df):
#    df, vestorizer
    #aggregate reviewTest based on amazon-id
    combined_train_df = train_df.groupby('amazon-id').agg({'reviewText': ' '.join, 'summary': ' '.join})
    vectorizer = TfidfVectorizer(max_features=8000)
    
    #transform it into an np.array then fit it into matrix
    review_vector = np.array(vectorizer.fit_transform(combined_train_df['reviewText']).toarray())
    #saved_train_matrix = vectorizer.fit(combined_train_df['reviewText'])
    summary_vector = np.array(vectorizer.transform(combined_train_df['summary']).toarray())
    
    
    train_vector = review_vector + summary_vector
    #train_vector = np.array(vectorizer.fit_transform(combined_train_df['reviewText']).toarray())
    
    combined_test_df = test_df.groupby('amazon-id').agg({'reviewText': ' '.join, 'summary': ' '.join})

    #transform it into an np.array then fit it into matrix
    review_vector = np.array(vectorizer.fit_transform(combined_test_df['reviewText']).toarray())
    #saved_test_matrix = vectorizer.fit(combined_test_df['reviewText'])
    summary_test_vector = np.array(vectorizer.transform(combined_test_df['summary']).toarray())

    #combine review and summary
    test_vector = np.array(vectorizer.fit_transform(combined_test_df['reviewText']).toarray())
    print("Test vector length: ", len(test_vector))
    print("Train vector length: ", len(train_vector))
    return train_vector, test_vector

In [35]:
combined_train_df = train_new.groupby('amazon-id').agg({'reviewText': ' '.join, 'summary': ' '.join, 'salesRank': lambda x: np.mean(x), 'price': lambda x: np.mean(x)})

In [39]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

ct_opts = [('reviewText_bow', TfidfVectorizer(max_features = 4000), 'reviewText'),
          ('summary_bow', TfidfVectorizer(max_features = 4000), 'summary'),
          ('salesRank_norm', MinMaxScaler(), ['salesRank']),
          ('price_norm', MinMaxScaler(), ['price'])]
ct = ColumnTransformer(ct_opts, remainder = 'drop')

In [48]:
train_vec = np.array(ct.fit_transform(combined_train_df).toarray())

In [49]:
print(train_vec)

[[0.         0.         0.         ... 0.         0.53410938 0.01719522]
 [0.         0.         0.         ... 0.         0.12598976 0.0836289 ]
 [0.         0.         0.         ... 0.         0.11423985 0.02755755]
 ...
 [0.         0.         0.         ... 0.         0.24462838 0.02463662]
 [0.         0.         0.         ... 0.         0.1105543  0.02783573]
 [0.         0.         0.         ... 0.         0.24037958 0.02303707]]


In [43]:
def make_prediction(x_train, y_train, x_test):
    model = LinearSVC()
    model.fit(x_train, y_train)
    return model, model.predict(x_test)

In [53]:
train_vec.shape

(10543, 8002)

In [54]:
aw = awesome_products(train_new)

In [55]:
aw.shape

(10543,)

In [59]:
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, f1_score, classification_report

In [57]:
x_train, x_test, y_train, y_test = train_test_split(train_vec, aw)

In [60]:
model, y_pred = make_prediction(x_train, y_train, x_test)

In [61]:
print("f1_score=", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

f1_score= 0.7349936143039592
              precision    recall  f1-score   support

           0       0.63      0.59      0.61      1101
           1       0.72      0.75      0.73      1535

    accuracy                           0.69      2636
   macro avg       0.68      0.67      0.67      2636
weighted avg       0.68      0.69      0.68      2636

