In [1]:
import json
import os
import random
import numpy as np
import pandas as pd

In [2]:
data = None
with open(os.path.join('data', 'train.json'), 'r') as train_file:
    data = [json.loads(row) for row in train_file]

In [3]:
data_df = pd.DataFrame(data).drop(columns=['image'])
del data

In [4]:
test_data = None
with open(os.path.join('data', 'test.json'), 'r') as test_file:
    test_data = [json.loads(row) for row in test_file]

test_df = pd.DataFrame(test_data)
del test_data

In [5]:
categories = data_df['category'].unique()
dfs = []
for category in categories:
    dfs.append(data_df[data_df['category'] == category].sample(frac=0.25))
data_df = pd.concat(dfs, axis=0)
data_df = data_df.sort_index()

In [6]:
def trim_price(price):
    """Trims `price` to remove the $ sign.
    
    If the price variable does not have the format $x.xx
    then the empty string is returned.
    
    Parameters
    ----------
    price: str
        A string representing a price.
    
    Returns
    -------
    str
        A string representing `price` but with the $ sign removed,
        or the empty string if `price` does not have the correct
        format.
    
    """
    if (not pd.isnull(price) and isinstance(price, str) and
        len(price) > 0 and price[0] == '$'):
        return price[1:]
    return ""

In [7]:
from datetime import datetime

def preprocess(data_df, genres):
    data_df['reviewMonth'] = data_df['reviewTime'].apply(lambda x: x.split(' ')[0])
    data_df['reviewYear'] = data_df['reviewTime'].apply(lambda x: x.split(' ')[2])
    data_df['reviewHour'] = data_df['unixReviewTime'].apply(lambda x: datetime.fromtimestamp(x).hour)
    data_df['reviewMonthYear'] = data_df['reviewYear'] + '-' + data_df['reviewMonth']

    data_df['cleanedPrice'] = data_df['price'].apply(lambda x: trim_price(x))
    data_df = data_df[data_df['cleanedPrice'] != ""]
    data_df['cleanedPrice'] = data_df['cleanedPrice'].astype('float')

    data_df['fixedReviewText'] = np.where(pd.isnull(data_df['reviewText']), "", data_df['reviewText'])
    data_df['fixedSummary'] = np.where(pd.isnull(data_df['summary']), "", data_df['summary'])
    data_df['fullReviewText'] = data_df['fixedSummary'] + " " + data_df['fixedReviewText']

    data_df = data_df.drop(columns=['fixedReviewText', 'fixedSummary'])

    for genre in genres:
        genre_col = "is" + genre.replace(" ", "").replace("&", "")
        data_df[genre_col] = data_df['category'].apply(lambda x: 1 if x == genre else 0)

    data_df['reviewWordCount'] = data_df['fullReviewText'].apply(lambda x: len(x.split()))

    return data_df


music_categories = data_df['category'].unique()

data_df = preprocess(data_df, music_categories)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df['cleanedPrice'] = data_df['cleanedPrice'].astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df['fixedReviewText'] = np.where(pd.isnull(data_df['reviewText']), "", data_df['reviewText'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df['fixedSummary'] = np.where(pd.is

In [8]:
def calculate_MSE(actuals, predicteds):
    """Calculates the Mean Squared Error between `actuals` and `predicteds`.

    Parameters
    ----------
    actuals: np.array
        A numpy array of the actual values.
    predicteds: np.array
        A numpy array of the predicted values.

    Returns
    -------
    float
        A float representing the Mean Squared Error between `actuals` and
        `predicteds`.

    """
    return (((actuals - predicteds)**2).sum()) / (len(actuals))

In [9]:
X_train = data_df.drop(columns=['overall'])
y_train = data_df['overall']
X_test = test_data_df

In [11]:
train_data = data_df

In [12]:
train_data['itemID'] = train_data['itemID'].astype("category")
train_data['reviewerID'] = train_data['reviewerID'].astype("category")

In [13]:
import scipy.sparse as sp

item_matrix = train_data.pivot(index='itemID', columns='reviewerID', values='overall')
item_matrix = item_matrix.fillna(0)
user_item_train_matrix = sp.csr_matrix(item_matrix.values)

In [14]:
global_average = train_data['overall'].mean()

In [15]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
model_knn.fit(user_item_train_matrix)
item_neighbors = np.asarray(model_knn.kneighbors(user_item_train_matrix, return_distance=False))

In [16]:
user_matrix = train_data.pivot(index='reviewerID', columns='itemID', values='overall')
user_matrix = user_matrix.fillna(0)
user_item_train_matrix = sp.csr_matrix(user_matrix.values)

In [17]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
model_knn.fit(user_item_train_matrix)
user_neighbors = np.asarray(model_knn.kneighbors(user_item_train_matrix, return_distance=False))

In [18]:
train_user_avg = train_data.groupby(train_data['reviewerID'], as_index=False)['overall'].mean()
train_item_avg = train_data.groupby(train_data['itemID'], as_index=False)['overall'].mean()
train_user_avg.columns = ['reviewerID', 'userAverage']
train_item_avg.columns = ['itemID', 'itemAverage']
train_user_avg = train_user_avg.set_index('reviewerID')
train_item_avg = train_item_avg.set_index('itemID')

In [19]:
item_avgs = []
for i in range(len(item_neighbors)):
    item_avgs.append(train_item_avg['itemAverage'][item_matrix.index[item_neighbors[i]]].mean())

item_avgs = pd.concat([pd.DataFrame(item_matrix.index, columns=['itemID']), pd.DataFrame(item_avgs, columns=['itemRating'])], axis=1)

In [20]:
user_avgs = []
for i in range(len(user_neighbors)):
    user_avgs.append(train_user_avg['userAverage'][user_matrix.index[user_neighbors[i]]].mean())

In [21]:
user_avgs = pd.concat([pd.DataFrame(user_matrix.index, columns=['reviewerID']), pd.DataFrame(user_avgs, columns=['userRating'])], axis=1)

In [22]:
def weighted_average_data(X, total_avg, user_avgs, item_avgs):
    """Calculates the error based on the weighted average prediction.

    Parameters
    ----------
    X: pd.DataFrame
        The DataFrame of features.
    total_avg: float
        The average across all users/items.
    user_avgs: pd.DataFrame
        A DataFrame containing the average rating for each user.
    item_avgs: pd.DataFrame
        A DataFrame containing the average rating for each item.

    Returns
    -------
    float
        A float representing the mean squared error of the predictions.

    """
    df_user = pd.merge(X, user_avgs, how='left', on=['reviewerID'])
    df_final = pd.merge(df_user, item_avgs, how='left', on=['itemID'])
    df_final = df_final[['userRating', 'itemRating']]
    df_final = df_final.fillna(total_avg)
    df_final.index = X.index
    return df_final

In [44]:
X_test_aug = weighted_average_data(test_df, global_average, user_avgs, item_avgs)
X_test_mod = pd.concat([test_df, X_test_aug], axis=1)

In [45]:
def threshold_rating(rating):
    """Thresholds `rating` to lie in the range [1, 5].

    Parameters
    ----------
    rating: float
        The rating to be thresholded.

    Returns
    -------
    float
        A float representing the thresholded rating.

    """
    if rating < 1:
        return 1
    if rating > 5:
        return 5
    return rating

In [46]:
X_test_mod['pred'] = (0.5 * X_test_mod['userRating']) + (0.5 * X_test_mod['itemRating'])
X_test_mod['pred'] = X_test_mod['pred'].apply(lambda x: threshold_rating(x))

In [47]:
X_test_mod['userID-itemID'] = X_test_mod['reviewerID'] + "-" + X_test_mod['itemID']
X_test_mod.index = X_test_mod['userID-itemID']

In [49]:
lang_preds = pd.read_csv(os.path.join("data", "rating_predictions_lang.csv"))

In [50]:
lang_preds.index = lang_preds['userID-itemID']
lang_preds.columns = ['userID-itemID', 'pred_lang']

In [51]:
test_final = pd.merge(lang_preds[['pred_lang']], X_test_mod[['pred']], how='left', left_index=True, right_index=True)

In [52]:
test_final['prediction'] = np.where(test_final['pred_lang'] < 0, test_final['pred'], test_final['pred_lang'])

In [59]:
test_final['prediction'] = test_final['prediction'].apply(lambda x: threshold_rating(x))
test_final['userID-itemID'] = test_final.index

In [62]:
test_final = test_final[['userID-itemID', 'prediction']]

In [64]:
test_final.to_csv(os.path.join("data", "rating_predictions.csv"), index=False)

In [65]:
test_final['prediction'] = test_final['prediction'].apply(lambda x: round(x))    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_final['prediction'] = test_final['prediction'].apply(lambda x: round(x))


In [67]:
test_final.to_csv(os.path.join("data", "rating_predictions.csv"), index=False)