In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

In [2]:
# Load the data, keeping only the selected columns.
# The file is read as line-delimited JSON with pandas.read_json(lines=True).
def load_required_data(path, required_columns, start_index, end_index):
    """Read a line-delimited JSON file and keep only the requested columns.

    Args:
        path: Location of the JSON-lines file, passed straight to
            ``pd.read_json(..., lines=True)``.
        required_columns: Column labels to keep, in the order given.
        start_index: Accepted but not used by this function.
        end_index: Accepted but not used by this function.

    Returns:
        A ``(frame, array)`` tuple: the column-restricted DataFrame and its
        ``.values`` NumPy representation. All rows of the file are returned.
    """
    selected = pd.read_json(path, lines=True)[required_columns]
    return selected, selected.values

In [3]:
# Keep the item id, reviewer id and star rating columns (in that order).
# NOTE(review): load_required_data does not use its last two arguments,
# so 0 and 20000 do not restrict the rows that are loaded.
dataframe, values = load_required_data(
    '../reviews_Apps_for_Android_5.json',
    ["asin", "reviewerID", "overall"],
    0,
    20000,
)

In [4]:
# Create user-item matrix given a matrix and mapping from original matrix columns to new matrix rows/columns
def create_user_item_matrix(data, rowMapping, columnMapping, valueMapping=2):
    """Pivot a long-format 2-D array into a dense row-by-column table.

    Args:
        data: 2-D NumPy array with one observation per row.
        rowMapping: Index of the column in ``data`` whose distinct values
            become the rows of the table (the users).
        columnMapping: Index of the column in ``data`` whose distinct values
            become the columns of the table (the items).
        valueMapping: Index of the column in ``data`` that supplies the cell
            values. Defaults to 2, the column that was previously hard-coded.

    Returns:
        A ``(pivot_table, rows, cols)`` tuple. ``pivot_table`` has shape
        ``(len(rows), len(cols))`` and the same dtype as ``data``; ``rows``
        and ``cols`` are the sorted distinct labels for each axis.
        Cells with no observation stay 0. If a (row, column) pair occurs more
        than once in ``data``, only one of its values is kept.
    """
    # return_inverse gives, for every observation, the position of its label
    # in the sorted unique array -- i.e. its row / column in the table.
    rows, row_pos = np.unique(data[:, rowMapping], return_inverse=True)
    cols, col_pos = np.unique(data[:, columnMapping], return_inverse=True)

    pivot_table = np.zeros((len(rows), len(cols)), dtype=data.dtype)
    # Scatter every observation's value into its (row, column) cell.
    pivot_table[row_pos, col_pos] = data[:, valueMapping]
    return pivot_table, rows, cols

In [5]:
# Dense ratings table: reviewer ids (column 1 of `values`) index the rows,
# item ids (column 0 of `values`) index the columns.
ratings_matrix, rows, cols = create_user_item_matrix(
    values,
    rowMapping=1,
    columnMapping=0,
)

In [6]:
# Count how often each distinct rating value occurs.
# The ratings are the third selected column of `values`.
ratings = values[:, 2]
unique_ratings, counts_elements = np.unique(ratings, return_counts=True)
print("Frequency of unique ratings:")
# Row 0 holds the distinct rating values, row 1 the matching counts.
rating_frequency = np.asarray((unique_ratings, counts_elements))
# print(...) with a single argument behaves the same on Python 2 and 3.
print(rating_frequency)

Frequency of unique ratings:
[[1 2 3 4 5]
 [78713 44385 85121 158081 386637]]


In [7]:
# Total number of ratings available: the sum of the per-value counts.
total_ratings = counts_elements.sum()
# print(...) with a single argument behaves the same on Python 2 and 3.
print(total_ratings)

752937


In [8]:
# Share of all ratings taken by each rating value, as a fraction of 1.
ratings_distribution = np.true_divide(rating_frequency[1], total_ratings)

# Row 0 holds the rating values, row 1 the matching percentages.
overall_rating = np.asarray((unique_ratings, ratings_distribution * 100))
# print(...) with a single argument behaves the same on Python 2 and 3.
print(overall_rating)

[[1 2 3 4 5]
 [10.454128300242916 5.894915510859474 11.305195521006405
  20.995249270523296 51.3505113973679]]


In [9]:
# Display the percentage of each rating as (rating, percentage) pairs.
# list(...) materialises the pairs: on Python 3 zip() is a lazy iterator that
# would print as "<zip object ...>"; on Python 2 the result is the same list
# as before.
overall_rating_distribution = list(zip(unique_ratings, ratings_distribution * 100))
print(overall_rating_distribution)

[(1, 10.454128300242916), (2, 5.894915510859474), (3, 11.305195521006405), (4, 20.995249270523296), (5, 51.3505113973679)]


In [20]:
# plot the pie chart 

def plotPieChart(labels, sizes, filename='piechart.png'):
    """Draw a pie chart and save it as an image file.

    Args:
        labels: One label per wedge.
        sizes: One numeric size per wedge; each wedge is annotated with its
            percentage of the total, formatted to one decimal place.
        filename: Path of the image to write. Defaults to 'piechart.png',
            the name that was previously hard-coded.

    Side effects:
        Sets the global matplotlib default ``font.size`` to 14, writes
        ``filename``, and closes the figure it created.
    """
    colors = ['#FFA48C', '#F3AEC9', '#71CD99', '#82CADD', '#FBD267']

    # Set the font size before the figure is drawn so that it applies to this
    # chart's labels; previously it was set after plt.pie() had run.
    plt.rcParams['font.size'] = 14.0

    plt.figure(figsize=(6, 6), dpi=200)
    plt.pie(sizes, labels=labels, colors=colors,
            autopct='%1.1f%%', shadow=False, startangle=140)
    # Equal axis scaling keeps the pie circular.
    plt.axis('equal')
    plt.savefig(filename, bbox_inches='tight')
    plt.close()
        
    
# Wedge labels are the rating values; wedge sizes are their percentages.
plotPieChart(labels=overall_rating[0], sizes=overall_rating[1])

In [11]:
# Average rating: the sum of all ratings divided by how many there are.
average_rating = np.true_divide(ratings.sum(), len(ratings))

# A single pre-formatted string prints identically on Python 2 and 3; the
# "%s %s" template reproduces the space the print statement put between items.
print("%s %s" % ("Average rating: ", average_rating))

Average rating:  3.968930999539138


In [12]:
# Ratings per item: the number of non-zero cells in each column of the matrix.
ratings_per_item = np.count_nonzero(ratings_matrix, axis=0)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(ratings_per_item)

[17 62 96 ...  5  7 17]


In [13]:
# Summary of the per-item rating counts. Each line is one pre-formatted
# string, so it prints identically on Python 2 and 3; the "%s %s" template
# reproduces the space the print statement put between items.
print("%s %s" % ("total number of items: ", len(ratings_per_item)))
print("%s %s" % ("highest number of ratings for an item: ", ratings_per_item.max()))
print("%s %s" % ("smallest number of ratings for an item: ", ratings_per_item.min()))

total number of items:  13209
highest number of ratings for an item:  6282
smallest number of ratings for an item:  5


In [14]:
# Ratings per user: the number of non-zero cells in each row of the matrix.
ratings_per_user = np.count_nonzero(ratings_matrix, axis=1)
print(ratings_per_user)

# Each line is one pre-formatted string, so it prints identically on
# Python 2 and 3; the "%s %s" template reproduces the space the print
# statement put between items.
print("%s %s" % ("total number of users: ", len(ratings_per_user)))
print("%s %s" % ("highest number of ratings by a user: ", ratings_per_user.max()))
print("%s %s" % ("smallest number of ratings by a user: ", ratings_per_user.min()))


[12  5 18 ...  5  5  9]
total number of users:  87271
highest number of ratings by a user:  565
smallest number of ratings by a user:  5


In [34]:
# def plotHistogram(x, printingString, x_axis_range, xlabel, title):
#     #import matplotlib.transforms as mtrans
#     num_bins = 55
#     n, bins, patches = plt.hist(x, 30, range=x_axis_range, facecolor='blue', alpha=0.5)
#     # plt.xticks(range(0, 1000))
#     # plt.yticks(range(1, 8000))

#     plt.axvline(x.mean(), color='k', linestyle='dashed', linewidth=1)
#     plt.xlabel(xlabel)
#     plt.ylabel('Frequency')
#     plt.title(title)
#     plt.show() 

#     print printingString, x.mean()


def plotHistogram(x, printingString, x_axis_range, xlabel, title, ylabel, bins=None):
    """Save a histogram of ``x`` as a PNG file and print the mean of ``x``.

    Args:
        x: Values to histogram; must provide a ``.mean()`` method.
        printingString: Text printed in front of the mean.
        x_axis_range: ``(lower, upper)`` range passed to ``plt.hist``.
        xlabel: Label for the x axis.
        title: Figure title; also embedded in the output file name
            ``'histogram_' + title + '.png'``.
        ylabel: Label for the y axis.
        bins: Passed through to ``plt.hist``. Defaults to ``None``, the value
            that was previously hard-coded.

    Side effects:
        Sets the global matplotlib default ``font.size`` to 14, writes the
        PNG file, and prints one line. The figure is not closed.

    Note:
        The dashed vertical line and the printed value are the mean of all of
        ``x``, not only of the values inside ``x_axis_range``.
    """
    # Set the font size before the figure is drawn so that it applies to this
    # plot's labels; previously it was set after plt.hist() had run.
    plt.rcParams['font.size'] = 14.0

    plt.figure(figsize=(6, 6), dpi=200)
    plt.hist(x, bins=bins, range=x_axis_range, facecolor='#ffb145', alpha=0.9)
    # Mark the mean with a dashed vertical line.
    plt.axvline(x.mean(), color='k', linestyle='dashed', linewidth=1)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)
    plt.savefig('histogram_' + title + '.png', bbox_inches='tight')

    # A single pre-formatted string prints identically on Python 2 and 3; the
    # "%s %s" template reproduces the space the print statement put between items.
    print("%s %s" % (printingString, x.mean()))
    

In [35]:
# Distribution of how many ratings each user gave, limited to the 4-30 range.
plotHistogram(
    x=ratings_per_user,
    printingString="Average #ratings/user: ",
    x_axis_range=[4, 30],
    xlabel='Number of ratings',
    title='#ratings per user',
    ylabel='# of users',
)
#plotHistogram(ratings_per_item, "Average #ratings/item: ", [0, 250], 'Number of ratings', '#ratings per item', '# of items')

Average #ratings/user:  8.627573879066356


In [27]:
# Hypothesis to check: items with more ratings have a good average rating.
# Mean rating per item = column sum / number of non-zero cells in the column.
movie_avg_ratings = np.true_divide(
    ratings_matrix.sum(axis=0),
    np.not_equal(ratings_matrix, 0).sum(axis=0),
)

In [30]:
# Scatter each item's average rating (x) against its number of ratings (y).
plt.figure(figsize=(6, 6), dpi=200)
# 'o' asks for circle markers with no connecting line. The colour comes only
# from `color`; the old 'ro' format also asked for red, specifying the colour
# twice.
plt.plot(movie_avg_ratings, ratings_per_item, 'o', color='#ffb145')
plt.xlabel('Average item rating')
# The y axis shows how many ratings each item received.
plt.ylabel('Number of ratings')
plt.title('Average item rating by item count')
plt.grid(True)
plt.savefig('scatter_plot.png', bbox_inches='tight')
#plt.show()