## Data for Content Based System (Pattern Metadata) 

Using ravelry.com's API, which has a limit of 100k per call.  As of Friday, November 19, there were 665,465 knitting patterns.  The API does not currently allow for sort-by searches directly, so the data needs to be segmented and pulled out in sequential chunks, then filtered down after the fact. 

**get_all_patterns()** function found at the bottom of this notebook consolidates all of the other functions - in order to call and save each segmented portion of the data. 

* get pattern list (ids) fitting within query criteria
* then iterate through the pattern ids and pull metadata for each 
* if a query matches more than 100k patterns, split it into segments
* save as .csv's to later combine into one monster dataframe 

In [48]:
# load the libraries
import pandas as pd
import numpy as np

import json
import requests
from requests.auth import HTTPBasicAuth
from pprint import pprint

from config import basic_auth_username, basic_auth_password
from config import basic_auth_username_read_only, basic_auth_password_read_only

It's a two step process to get the knitting pattern metadata.  The pattern id's can be pulled from **get_search_results** using a query, then using the pattern_ids from the json response, **get_pattern_details** is used to access the details such as project count (how many of each pattern were knit), average rating, needle sizes. 

The get_search_results function below was used 3 times: 
* First for all patterns that were 5 star rated and clothing
* Then for all patterns that were 5 stars and not clothing (as there were over 100k 5-star patterns)
* 4 stars (just over 75k patterns)

In [30]:
def get_search_results(page, ratings=4, pattern_categories=None, page_size=100):
    """Fetch one page of knitting-pattern search results from the Ravelry API.

    The response is only used as a source of pattern ids: get_pattern_details
    reads the 'id' of every entry in its 'patterns' list and looks the
    details up in a second call.

    The defaults reproduce the most recent pull (all 4-star patterns).  The
    two earlier pulls used:
      * 5 stars, clothing:
            ratings=5, pattern_categories='clothing'
      * 5 stars, NOT clothing:
            ratings=5, page_size=500,
            pattern_categories='accessories|medical|home|toysandhobbies|pet'

    Args:
        page: 1-based page number of the paginated search.
        ratings: value sent as the `ratings` search filter.
        pattern_categories: optional value for the `pc` filter; several
            categories are separated with '|'.  Omitted from the query
            when None.
        page_size: number of results requested per page.

    Returns:
        The decoded JSON body of the search response.

    Raises:
        requests.RequestException: the request failed or returned an HTTP
            error status.
        ValueError: the response body was not valid JSON.
    """
    query = {
        'craft': 'knitting',
        'ratings': ratings,
        'photo': 'yes',
        'sort': 'best',
        'page_size': page_size,
        'page': page,
    }
    if pattern_categories is not None:
        query['pc'] = pattern_categories

    try:
        response = requests.get(
            'https://api.ravelry.com/patterns/search.json',
            params=query,
            auth=HTTPBasicAuth(basic_auth_username, basic_auth_password),
            timeout=30,  # never hang the whole crawl on one stalled request
        )
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as err:
        # Report which page broke, then let the caller decide whether to
        # retry; there is no usable result to return at this point.
        print(f'page number {page} failed')
        print(err)
        raise


def get_pattern_details(search_results):
    """Look up the full metadata for every pattern in a search-results page.

    Args:
        search_results: decoded JSON from get_search_results; must contain a
            'patterns' sequence whose items each have an 'id'.

    Returns:
        The decoded JSON body of the bulk pattern lookup.  When the search
        page holds no patterns, no request is made and {'patterns': {}} is
        returned so downstream parsing simply yields zero rows.

    Raises:
        requests.RequestException: the request failed or returned an HTTP
            error status.
        ValueError: the response body was not valid JSON.
    """
    id_list = [str(entry['id']) for entry in search_results['patterns']]
    if not id_list:
        # Nothing to look up - avoid calling the API with an empty id list.
        return {'patterns': {}}

    # The ids are embedded in the URL by hand rather than passed via
    # `params=`, because requests would percent-encode the '+' separators.
    spaced_ids = '+'.join(id_list)

    response = requests.get(
        f'https://api.ravelry.com/patterns.json?ids={spaced_ids}',
        auth=HTTPBasicAuth(basic_auth_username, basic_auth_password),
        timeout=30,  # never hang the whole crawl on one stalled request
    )
    response.raise_for_status()
    return response.json()

### Parse out Pattern Details
#### For a singular result:

In [414]:
# # For one result:  
# response =requests.get('https://api.ravelry.com/patterns/573.json',auth=HTTPBasicAuth(basic_auth_username, basic_auth_password))
# patterns = response.json()
# # pprint(patterns)

#### For multiple results:

In [5]:
def _pattern_attribute_permalinks(record):
    """Return the permalinks of a pattern's attributes, or the string 'None' if they cannot be read."""
    try:
        return [attribute['permalink'] for attribute in record['pattern_attributes']]
    except (KeyError, TypeError):
        # NOTE: the placeholder is the *string* 'None' (not the None object);
        # kept as-is so new rows match previously saved CSV segments.
        return 'None'


def _pattern_category_path(record):
    """Return permalinks from the pattern's first category up through its ancestors (top-most node excluded)."""
    try:
        node = record['pattern_categories'][0]
        path = [node['permalink']]
        node = node['parent']
        # Walk up the tree; the node without a 'parent' key is not recorded.
        while 'parent' in node:
            path.append(node['permalink'])
            node = node['parent']
        return path
    except (KeyError, IndexError, TypeError, AttributeError):
        print("uhoh - check out categories!")
        return None


def parse_json_response_into_df(pattern):
    """Flatten a bulk pattern-details response into a DataFrame.

    Note this only works for multi-pattern responses, where
    pattern['patterns'] is a mapping of key -> pattern record.

    Args:
        pattern: decoded JSON from get_pattern_details.

    Returns:
        pandas.DataFrame with one row per pattern and a fixed set of 24
        columns ('pattern_id', 'name', ..., 'yarn_weight_description').
        Optional nested fields (photo url, pattern type, needle sizes,
        categories) become None when missing; missing pattern attributes
        become the string 'None'.

    Raises:
        KeyError: a pattern record lacks one of the plain scalar fields
            listed in `direct_fields` below.
    """
    # Output column -> key in the API record, for values copied verbatim.
    direct_fields = {
        'pattern_id': 'id',
        'name': 'name',
        'name_permalink': 'permalink',
        'favorites_count': 'favorites_count',
        'projects_count': 'projects_count',
        'difficulty_average': 'difficulty_average',
        'difficulty_count': 'difficulty_count',
        'rating_average': 'rating_average',
        'queued_projects_count': 'queued_projects_count',
        'rating_count': 'rating_count',
        'yardage_max': 'yardage_max',
        'yardage': 'yardage',
        'generally_available': 'generally_available',
        'gauge': 'gauge',
        'gauge_divisor': 'gauge_divisor',
        'free': 'free',
        'downloadable': 'downloadable',
        'yarn_weight_description': 'yarn_weight_description',
    }

    # Column order of the resulting frame (and therefore of the saved CSV).
    column_order = (
        'pattern_id', 'name', 'name_permalink', 'favorites_count',
        'projects_count', 'difficulty_average', 'difficulty_count',
        'rating_average', 'queued_projects_count', 'rating_count',
        'pattern_type_names', 'pattern_type_clothing', 'photos_url',
        'pattern_needle_sizes', 'pattern_attributes', 'yardage_max',
        'yardage', 'generally_available', 'gauge', 'gauge_divisor',
        'free', 'downloadable', 'categories', 'yarn_weight_description',
    )
    columns = {label: [] for label in column_order}

    for record in pattern['patterns'].values():
        for label, source_key in direct_fields.items():
            columns[label].append(record[source_key])

        try:
            photo_url = record['photos'][0]['square_url']
        except (KeyError, IndexError, TypeError):
            photo_url = None
        columns['photos_url'].append(photo_url)

        # Read both pattern-type values before appending either one, so a
        # half-populated pattern_type can never leave the two columns with
        # different lengths.
        try:
            pattern_type = record['pattern_type']
            clothing, type_name = pattern_type['clothing'], pattern_type['permalink']
        except (KeyError, TypeError):
            clothing = type_name = None
        columns['pattern_type_clothing'].append(clothing)
        columns['pattern_type_names'].append(type_name)

        columns['pattern_needle_sizes'].append(record.get('pattern_needle_sizes'))
        columns['pattern_attributes'].append(_pattern_attribute_permalinks(record))
        columns['categories'].append(_pattern_category_path(record))

    return pd.DataFrame(columns)

In [31]:
def filter_and_save_csv(df, path='data/patterns_4star.csv'):
    """Append a DataFrame of parsed patterns to a CSV file.

    Despite the name, no filtering happens yet - the frame is saved as-is.

    Args:
        df: DataFrame to append (its index is not written).
        path: destination CSV; defaults to the 4-star segment file.  Missing
            parent directories are created.
    """
    import os  # function-scope import: the notebook's import cell does not load os

    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    # Write the header only when starting a new (or empty) file; otherwise
    # every appended page would inject another header row into the data.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    df.to_csv(path, mode="a", index=False, header=needs_header)

In [23]:
def get_all_patterns(start_page=1, last_page=783, max_retries=5, retry_delay=2.0):
    """Crawl the paginated search and append every page's patterns to the CSV.

    For each page: fetch the search results, look up the pattern details,
    parse them into a DataFrame and append that frame to the CSV.  A page is
    only advanced past once it has been saved successfully.

    Args:
        start_page: first page to fetch (lets an interrupted run resume).
        last_page: last page to fetch, inclusive.
        max_retries: how many times in a row a single page may be retried
            before giving up.
        retry_delay: seconds to wait before each retry.

    Raises:
        Exception: whatever error the failing step raised, re-raised once a
            page has failed more than `max_retries` consecutive retries.
    """
    import time  # function-scope import: the notebook's import cell does not load time

    page = start_page
    consecutive_failures = 0
    while page <= last_page:
        try:
            search_results = get_search_results(page)
            json_data = get_pattern_details(search_results)
            new_parsed_patterns = parse_json_response_into_df(json_data)
            print('yay parsed pattterns page {}!'.format(page))

            filter_and_save_csv(new_parsed_patterns)
            print('yay saved!')
        except Exception as e:
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                # A page that keeps failing would otherwise loop forever
                # while hitting the API on every iteration.
                print(e, 'Giving up on page {} after {} attempts'.format(page, consecutive_failures))
                raise
            print(e, 'Stopped on page {} -retrying!'.format(page))
            time.sleep(retry_delay)
        else:
            page += 1
            consecutive_failures = 0

# get_all_patterns()