In [1]:
import os
import json
import shutil
import pandas as pd

In [2]:
def checkIntegrity(start_path, files, end_path):
    """Check one JSON file per entry and delete the entries whose file is broken.

    For every name in ``files`` the file ``start_path + name + end_path`` is
    inspected. It is considered broken when it is empty (size below one byte)
    or when it cannot be read and parsed as JSON. A broken entry has its whole
    directory ``start_path + name + "/"`` removed from disk.

    Args:
        start_path: Directory prefix, including the trailing separator.
        files: Iterable of entry (sub-directory) names.
        end_path: Suffix appended after the entry name to reach the file,
            e.g. ``"/photos.json"``.

    Returns:
        None. Progress and the final count of valid entries are printed.

    Raises:
        OSError: If a file cannot be stat'ed (for instance it does not exist)
            or a directory cannot be removed.
    """
    good = 0
    for file in files:
        path = start_path + file + end_path
        problem = None
        if os.stat(path).st_size < 1:
            problem = "Problem with file: " + file
        else:
            try:
                with open(path) as data_file:
                    json.load(data_file)
            # JSONDecodeError and UnicodeDecodeError are ValueErrors; OSError
            # covers read failures. Anything else (notably KeyboardInterrupt)
            # must propagate: an interrupted run must never delete valid data.
            except (ValueError, OSError):
                problem = "Error not json file: " + file

        if problem is None:
            good += 1
            continue

        print(problem)
        shutil.rmtree(start_path + file + "/")
        print("Delete file : " + file)
    print("Good: " + str(good))

In [3]:
def getWrongEmptyPhotos(start_path, files, end_path):
    """Delete entries whose 2-byte photos file contradicts their ``photos_count``.

    For every name in ``files`` the file ``start_path + name + end_path`` is
    stat'ed. When it is exactly 2 bytes long, ``start_path + name +
    "/data.json"`` is loaded; if its ``"photos_count"`` is not zero the entry
    is recorded, its directory ``start_path + name + "/"`` is removed and a
    message is printed.

    Args:
        start_path: Directory prefix, including the trailing separator.
        files: Iterable of entry (sub-directory) names.
        end_path: Suffix appended after the entry name to reach the file
            whose size is checked, e.g. ``"/photos.json"``.

    Returns:
        list: Names of the entries that were deleted, in iteration order.

    Raises:
        OSError: If a file cannot be stat'ed/opened or a directory removed.
        ValueError: If ``data.json`` is not valid JSON.
        KeyError: If ``data.json`` has no ``"photos_count"`` key.
    """
    empty = []
    for file in files:
        path = start_path + file + end_path
        # 2 bytes is presumably the serialized empty JSON list "[]" — TODO confirm.
        if os.stat(path).st_size != 2:
            continue

        # Close data.json before the directory is removed: deleting a tree
        # that still holds an open file fails on platforms that lock open files.
        with open(start_path + file + "/data.json") as data_file:
            data = json.load(data_file)

        # Normalise through str() so a numeric 0 is treated like the string "0"
        # instead of being mistaken for a non-zero count (and deleted).
        if str(data["photos_count"]) == "0":
            continue

        empty.append(file)
        shutil.rmtree(start_path + file + "/")
        print("Delete file : " + file)
    return empty

In [6]:
# Names of the sub-directories of recipe_data/; they are used as recipe ids below.
recipes = [entry.name for entry in os.scandir("recipe_data/") if entry.is_dir()]
print(recipes[:10])
print("Nb of recipes: " + str(len(recipes)))

['229804', '245348', '25093', '14930', '257743', '255991', '236394', '13148', '18454', '215435']
Nb of recipes: 44069


In [7]:
# Validate each recipe's photos.json; recipes that fail the check are deleted from disk.
checkIntegrity(start_path="recipe_data/", files=recipes, end_path="/photos.json")

Good: 44069


In [8]:
# Validate each recipe's reviews.json; recipes that fail the check are deleted from disk.
checkIntegrity(start_path="recipe_data/", files=recipes, end_path="/reviews.json")

Good: 44069


In [6]:
# Remove recipes whose 2-byte photos.json disagrees with data.json's photos_count,
# keeping the list of removed recipe ids.
empties = getWrongEmptyPhotos(
    start_path="recipe_data/",
    files=recipes,
    end_path="/photos.json",
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/13045/photos.json'

In [29]:
# Number of recipes that getWrongEmptyPhotos flagged and removed.
len(empties)

0

In [9]:
def getAllUsersWithReviews(start_path, files):
    """Collect the distinct submitter ids found in every entry's reviews file.

    For every name in ``files`` the file ``start_path + name +
    "/reviews.json"`` is loaded and ``review["submitter"]["userID"]`` of each
    review is added to the result. An entry whose file cannot be read, parsed
    or does not have the expected structure is reported with a printed message
    and skipped; ids gathered from that file before the failure are kept.

    Args:
        start_path: Directory prefix, including the trailing separator.
        files: Iterable of entry (sub-directory) names.

    Returns:
        set: The distinct ``userID`` values encountered.
    """
    users = set()
    for file in files:
        try:
            with open(start_path + file + "/reviews.json") as data_file:
                reviews = json.load(data_file)
            for review in reviews:
                users.add(review["submitter"]["userID"])
        # Only data and I/O problems are treated as "bad file". A bare except
        # would also swallow KeyboardInterrupt and make this long loop
        # impossible to interrupt.
        except (OSError, ValueError, LookupError, TypeError):
            print("Problem with file: " + file)
    return users

In [10]:
# Distinct reviewer ids gathered from every recipe's reviews.json.
users = getAllUsersWithReviews(start_path="recipe_data/", files=recipes)

In [11]:
# Number of distinct reviewer ids collected by getAllUsersWithReviews.
len(users)

1017178

In [18]:
# Persist the distinct reviewer ids as a single "user_id" column.
# The column is named in the constructor so that an empty `users` still yields
# a valid frame; assigning `.columns` afterwards raises a length-mismatch
# ValueError in that case.
dfUsers = pd.DataFrame(list(users), columns=["user_id"])
dfUsers.to_csv("users.csv")

In [45]:
# One-column frames (default integer column label) built from the recipe ids
# and from the reviewer ids.
# NOTE(review): this rebinds dfUsers, replacing the frame whose column was
# named "user_id" in the earlier cell — confirm that is intended.
dfRecipe = pd.DataFrame(recipes)
dfUsers = pd.DataFrame(list(users))

In [47]:
# Lookup tables from an id to its position in the iteration order of
# `recipes` / `users`; these positions are used as matrix indices further down.
dictRecipes = {recipe_id: position for position, recipe_id in enumerate(recipes)}
dictUsers = {user_id: position for position, user_id in enumerate(users)}

In [50]:
import numpy as np
from scipy.sparse import dok_matrix

# Sparse rating matrix: one row per user, one column per recipe, indexed through
# dictUsers / dictRecipes. A later review by the same user for the same recipe
# overwrites the earlier value.
M = dok_matrix((len(users), len(recipes)), dtype=np.float32)
for recipe in recipes:
    # Read from "recipe_data/", the same tree `recipes` was listed from and that
    # every other cell uses (this cell previously pointed at "data/").
    with open("recipe_data/" + recipe + "/reviews.json") as data_file:
        reviews = json.load(data_file)
    # The file is closed at this point; only the parsed reviews are needed.
    for review in reviews:
        user = review["submitter"]["userID"]
        M[dictUsers[user], dictRecipes[recipe]] = review["rating"]

In [51]:
# (rows, columns) of the rating matrix, i.e. (len(users), len(recipes)) as they
# were when M was built.
M.shape

(1015449, 33696)

In [59]:
# Index arrays (rows, columns) of the non-zero entries of M.
nonz = M.nonzero()

In [61]:
# Number of non-zero entries in M (length of the row-index array).
len(nonz[0])

3149810

In [104]:
# Summary statistics of the strictly positive ratings stored in M for the user
# whose id is 337534: that user's row is densified and turned into one column.
temp = pd.DataFrame(
    np.array(M[dictUsers[337534]].todense()).T,
    columns=["rating"],
)
temp.loc[temp["rating"] > 0].describe()

Unnamed: 0,rating
count,88.0
mean,4.488636
std,0.625039
min,2.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [102]:
# Recipe id stored at position 33606 of `recipes`.
recipes[33606]

'7211'

In [3]:
# Names of the sub-directories of user_data/.
# NOTE(review): this rebinds `users`, which earlier cells use for the set of
# reviewer ids returned by getAllUsersWithReviews — confirm the reuse is intended.
users = [entry.name for entry in os.scandir("user_data/") if entry.is_dir()]
len(users)

246299