In [15]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds 

In [7]:
customers=pd.read_csv("../Data Preparation/终版数据/customers_final_version.csv")

In [8]:
customers.head(2)

Unnamed: 0,reviewer_id,listing_id,date,comments,price,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,polarity_score,sentiment
0,8557223,49091,2013-10-21,Fran was absolutely gracious and welcoming. Ma...,81.0,94.0,10.0,10.0,10.0,10.0,8.0,8.0,0.9615,pos
1,1356099,50646,2014-04-18,A comfortable room in a smart condo developmen...,80.0,91.0,9.0,10.0,10.0,10.0,9.0,9.0,0.9484,pos


In [9]:
customers.shape

(130602, 14)

In [17]:
def recommend_predictions(df_rec, k):
    """
    :param df_rec: dataframe with polarity values for known reviewer/listing pairs
    :param k: number of features to keep for SVD
  
    returns the dataframe with predicted polarity values for all (user,item) pairs
    """
    # get utility matrix
    util_mat = df_rec.pivot_table(index='reviewer_id', columns='listing_id', values='polarity_score')
  
    # keep track of reviewers and listings
    reviewer_rows = list(util_mat.index)
    reviewer_index = {reviewer_rows[i]: i for i in range(len(reviewer_rows))}
    listing_cols = list(util_mat.columns)
    listing_index = {listing_cols[i]: i for i in range(len(listing_cols))}
  
    # mask NaN and remove means
    mask = np.isnan(util_mat)
    masked_arr = np.ma.masked_array(util_mat, mask)
    item_means = np.mean(masked_arr, axis=0) 
    util_mat = masked_arr.filled(item_means)
    means = np.tile(item_means, (util_mat.shape[0],1))
    util_mat_demeaned = util_mat - means
  
    # run SVD
    U, sigma, Vt = svds(util_mat_demeaned, k = k)
    sigma = np.diag(sigma)
    
    all_predicted_polarity = np.dot(np.dot(U, sigma), Vt) + means
  
    return all_predicted_polarity, reviewer_index, listing_index

In [29]:
all_predicted_polarity, users_index, items_index = recommend_predictions(df.iloc[:8000,:],10)

In [30]:
pd.DataFrame(all_predicted_polarity)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,209,210,211,212,213,214,215,216,217,218
0,0.9615,0.931689,0.9486,0.8488,0.863539,0.644969,0.83805,0.938767,0.9186,0.92247,...,0.9135,0.740722,0.594155,0.705185,0.840983,0.831029,0.776842,0.904295,0.813443,0.946107
1,0.9615,0.931690,0.9486,0.8488,0.863539,0.644969,0.83805,0.938767,0.9186,0.92247,...,0.9135,0.740722,0.594155,0.705185,0.840983,0.831029,0.776842,0.904295,0.813443,0.946107
2,0.9615,0.931689,0.9486,0.8488,0.863539,0.644969,0.83805,0.938767,0.9186,0.92247,...,0.9135,0.740722,0.594155,0.705185,0.840983,0.831029,0.776842,0.904295,0.813443,0.946107
3,0.9615,0.931689,0.9486,0.8488,0.863539,0.644969,0.83805,0.938767,0.9186,0.92247,...,0.9135,0.740722,0.594155,0.705185,0.840983,0.831029,0.776842,0.904295,0.813443,0.946107
4,0.9615,0.931689,0.9486,0.8488,0.863539,0.644969,0.83805,0.938767,0.9186,0.92247,...,0.9135,0.740722,0.594155,0.705185,0.840983,0.831029,0.776842,0.904295,0.813443,0.946107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7662,0.9615,0.931689,0.9486,0.8488,0.863539,0.644969,0.83805,0.938767,0.9186,0.92247,...,0.9135,0.740722,0.594155,0.705185,0.840983,0.831029,0.776842,0.904295,0.813395,0.946107
7663,0.9615,0.931689,0.9486,0.8488,0.863539,0.644969,0.83805,0.938767,0.9186,0.92247,...,0.9135,0.740722,0.594155,0.705185,0.840983,0.831029,0.776842,0.904295,0.813443,0.946107
7664,0.9615,0.931689,0.9486,0.8488,0.863539,0.644969,0.83805,0.938767,0.9186,0.92247,...,0.9135,0.740722,0.594155,0.705185,0.840983,0.831029,0.776842,0.904295,0.813443,0.946107
7665,0.9615,0.931689,0.9486,0.8488,0.863539,0.644969,0.83805,0.938767,0.9186,0.92247,...,0.9135,0.740722,0.594155,0.705185,0.840983,0.831029,0.776842,0.904295,0.813443,0.946107


In [32]:
listing_id_array = df['listing_id'].unique()

In [33]:
listing_id_array

array([   49091,    50646,    56334, ..., 40138508, 40226189, 17929049])

In [34]:
def get_recommendations(predMat, user, N):
    """
    :param predMat: matrix of full predicted polarity sentiment values
    :param user: selected reviewer_id
    :param N: top N recommendations to show
    
    returns top N recommendations for specified user
    """
    u_index = users_index[user]
    item_i = [items_index[listing_id_array[i]] for i in range(len(listing_id_array))]
    
    pred_user = [predMat[u_index, i_index] for i_index in item_i]
    
    d = {'listing_id': listing_id_array, 'predicted_polarity': pred_user}
    user_rec = pd.DataFrame(data=d)
    user_rec.sort_values(by=['predicted_polarity'], ascending=False, inplace=True)
    user_rec.reset_index(inplace=True, drop=True)
    
    return user_rec[:N]

In [41]:
get_recommendations(all_predicted_polarity, 78074, 10)

KeyError: 3856693