This notebook focuses on collecting and preparing data from the Raverly API.
he main goals are to:
- fetch publicly available knitting pattern data using the Ravelry API,
- build an initial raw dataset containing pattern metadata,
- clean and structure the data for further analysis,
- prepare the dataset for exploratory data analysis (EDA) and later modelling.

This is the first step of the project. I will concentrate on data collection and basic data preparation for further analysis.

1. Import libraries for data handling, API calls, and quick visual checks

In [19]:
# BASIC DATA HANDLING
import pandas as pd
import numpy as np
import os
import json
import time

# API & REQUESTS
import requests
from requests.auth import HTTPBasicAuth

# VISUALISATION (later use)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Settings for nicer plots
sns.set(style="whitegrid")



2. Define and create folders for raw and processed data.

In [20]:
RAW_DIR = "../data/raw/v1"
PROCESSED_DIR = "../data/processed/v1"

os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

RAW_JSON_PATH = os.path.join(RAW_DIR, "patterns_raw.json")
RAW_CSV_PATH = os.path.join(RAW_DIR, "patterns_raw.csv")

PROCESSED_CSV_PATH = os.path.join(PROCESSED_DIR, "patterns_clean.csv")

RAW_DIR, PROCESSED_DIR


('../data/raw/v1', '../data/processed/v1')

3. API Credentials. They are loaded from enviroment variables. They are stored locally as environment variables to prevent accidental exposure in the codebase.


In [21]:
from dotenv import load_dotenv

load_dotenv()  # loads variables from .env

RAVELRY_USER = os.getenv("RAVELRY_ACCESS_KEY")
RAVELRY_PASS = os.getenv("RAVELRY_PERSONAL_KEY")

if not RAVELRY_USER or not RAVELRY_PASS:
    raise ValueError(
        "API credentials not found. "
        "Make sure RAVELRY_ACCESS_KEY and RAVELRY_PERSONAL_KEY are set in .env file."
    )

print("Credentials loaded ✅")



Credentials loaded ✅


4. Test public API endpoint (read-only) and inspect response structure

In [22]:
search_url = f"{BASE_URL}/patterns/search.json" # Endpoint for pattern search
params = {"query": "sweater", "page": 1, "page_size": 5}

resp = requests.get(search_url, params=params, auth=auth, timeout=30) # Make the GET request

print("Status:", resp.status_code)
print("Content-Type:", resp.headers.get("Content-Type"))
print("Preview:", resp.text[:200])

if resp.ok:
    data = resp.json()
    print("Top-level keys:", list(data.keys()))
    print("Patterns returned:", len(data.get("patterns", [])))



Status: 200
Content-Type: application/json; charset=utf-8
Preview: {"patterns": [{"free":false,"id":7498393,"name":"Elza Sweater","permalink":"elza-sweater","personal_attributes":null,"first_photo":{"id":145528451,"sort_order":1,"user_id":8335554,"x_offset":0,"y_offs
Top-level keys: ['patterns', 'paginator']
Patterns returned: 5


5. Fetch patterns from Raverly API (RAW), and save raw response to JSON

In [23]:
search_url = f"{BASE_URL}/patterns/search.json" # Endpoint for pattern search

query = "sweater"   # Search term
page = 1
page_size = 100   
max_pages = 3     

all_patterns = []

while page <= max_pages:
    print(f"Fetching page {page}...")
    
    params = {
        "query": query,
        "page": page,
        "page_size": page_size
    }
    
    resp = requests.get(search_url, params=params, auth=auth, timeout=30)
    
    if not resp.ok:
        print(f"Stopped at page {page}, status {resp.status_code}")
        break
    
    data = resp.json()
    patterns = data.get("patterns", [])
    
    all_patterns.extend(patterns)
    
    if len(patterns) < page_size:
        # no more pages
        break
    
    page += 1
    time.sleep(1)  

print(f"Total patterns collected: {len(all_patterns)}")

# Save RAW data to JSON
with open(RAW_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(all_patterns, f, ensure_ascii=False, indent=2)

RAW_JSON_PATH


Fetching page 1...
Fetching page 2...
Fetching page 3...
Total patterns collected: 300


'../data/raw/v1\\patterns_raw.json'

6. Load raw pattern data from JSON and inspect

In [25]:
with open(RAW_JSON_PATH, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

print("Type:", type(raw_data))
print("Number of patterns:", len(raw_data))

# Inspect first pattern
print("Type of first item:", type(raw_data[0]))
print("Keys of first pattern:", raw_data[0].keys())


Type: <class 'list'>
Number of patterns: 300
Type of first item: <class 'dict'>
Keys of first pattern: dict_keys(['free', 'id', 'name', 'permalink', 'personal_attributes', 'first_photo', 'designer', 'pattern_author', 'pattern_sources'])


7. Convert raw pattern list to DataFrame 

In [None]:
df_raw = pd.DataFrame(raw_data)

print(df_raw.shape)
df_raw.head()


(300, 9)


Unnamed: 0,free,id,name,permalink,personal_attributes,first_photo,designer,pattern_author,pattern_sources
0,False,7498393,Elza Sweater,elza-sweater,,"{'id': 145528451, 'sort_order': 1, 'user_id': ...","{'crochet_pattern_count': 0, 'favorites_count'...","{'crochet_pattern_count': 0, 'favorites_count'...","[{'amazon_rating': None, 'amazon_reviews': Non..."
1,True,7498528,Sway Sweater,sway-sweater,,"{'id': 145540274, 'sort_order': 1, 'user_id': ...","{'crochet_pattern_count': 123, 'favorites_coun...","{'crochet_pattern_count': 123, 'favorites_coun...","[{'amazon_rating': None, 'amazon_reviews': Non..."
2,True,1255578,Step by Step Sweater,step-by-step-sweater,,"{'id': 115040664, 'sort_order': 1, 'user_id': ...","{'crochet_pattern_count': 0, 'favorites_count'...","{'crochet_pattern_count': 0, 'favorites_count'...","[{'amazon_rating': None, 'amazon_reviews': Non..."
3,False,7497807,Ridgewood Sweater,ridgewood-sweater,,"{'id': 145448666, 'sort_order': 1, 'user_id': ...","{'crochet_pattern_count': 0, 'favorites_count'...","{'crochet_pattern_count': 0, 'favorites_count'...","[{'amazon_rating': None, 'amazon_reviews': Non..."
4,False,7497838,BABA sweater Chunky,baba-sweater-chunky,,"{'id': 145451206, 'sort_order': 1, 'user_id': ...","{'crochet_pattern_count': 0, 'favorites_count'...","{'crochet_pattern_count': 0, 'favorites_count'...","[{'amazon_rating': None, 'amazon_reviews': Non..."


8. Fetch detailed pattern data from Ravelry API

In [28]:
# Prepare list of pattern IDs (for detail fetch)

pattern_ids = df_raw["id"].dropna().astype(int).unique().tolist()
print("Unique pattern IDs:", len(pattern_ids))
pattern_ids[:10]

# Fetch details for ONE pattern (test)

pattern_id = pattern_ids[0]
detail_url = f"{BASE_URL}/patterns/{pattern_id}.json"

resp = requests.get(detail_url, auth=auth, timeout=30)
print("Status:", resp.status_code)
print("Content-Type:", resp.headers.get("Content-Type"))
print("Preview:", resp.text[:200])

detail = resp.json()
detail.keys()



Unique pattern IDs: 300
Status: 200
Content-Type: application/json; charset=utf-8
Preview: {"pattern": {"comments_count":9,"created_at":"2026/01/26 09:20:43 -0500","currency":"GBP","difficulty_average":6.0,"difficulty_count":6,"downloadable":true,"favorites_count":725,"free":false,"gauge":2


dict_keys(['pattern'])

In [None]:
# Inspect detailed pattern keys (what fields are available)
data = resp.json()
pattern = data.get("pattern", {})

print("Top-level keys:", list(data.keys()))
print("Pattern keys (first 50):", list(pattern.keys())[:50])


Top-level keys: ['pattern']
Pattern keys (first 50): ['comments_count', 'created_at', 'currency', 'difficulty_average', 'difficulty_count', 'downloadable', 'favorites_count', 'free', 'gauge', 'gauge_divisor', 'gauge_pattern', 'gauge_repeats', 'generally_available', 'has_uk_terminology', 'has_us_terminology', 'id', 'name', 'pdf_url', 'permalink', 'price', 'projects_count', 'published', 'queued_projects_count', 'rating_average', 'rating_count', 'row_gauge', 'updated_at', 'url', 'yardage', 'yardage_max', 'yarn_list_type', 'personal_attributes', 'sizes_available', 'product_id', 'unlisted_product_ids', 'currency_symbol', 'ravelry_download', 'download_location', 'pdf_in_library', 'volumes_in_library', 'gauge_description', 'yarn_weight_description', 'yardage_description', 'pattern_needle_sizes', 'notes_html', 'notes', 'languages', 'packs', 'printings', 'yarn_weight']


In [30]:
# Find craft/category-related fields (scan keys)

keywords = ["craft", "category", "categories", "pattern_type", "type"]
matches = [k for k in pattern.keys() if any(word in k.lower() for word in keywords)]
print("Possible craft/category keys:", matches)


Possible craft/category keys: ['yarn_list_type', 'craft', 'pattern_categories', 'pattern_type']


In [31]:
# Create a structured record for one pattern (example)
record = {
    "pattern_id": pattern.get("id"),
    "name": pattern.get("name"),
    "permalink": pattern.get("permalink"),
    "published": pattern.get("published"),
    "free": pattern.get("free"),
    "price": pattern.get("price"),
    "currency": pattern.get("currency"),
    "yarn_weight": pattern.get("yarn_weight"),
    "yarn_weight_description": pattern.get("yarn_weight_description"),
    "gauge": pattern.get("gauge"),
    "row_gauge": pattern.get("row_gauge"),
    "gauge_description": pattern.get("gauge_description"),
    "yardage": pattern.get("yardage"),
    "yardage_max": pattern.get("yardage_max"),
    "yardage_description": pattern.get("yardage_description"),
    "projects_count": pattern.get("projects_count"),
    "favorites_count": pattern.get("favorites_count"),
    "queued_projects_count": pattern.get("queued_projects_count"),
    "difficulty_average": pattern.get("difficulty_average"),
    "rating_average": pattern.get("rating_average"),
    "rating_count": pattern.get("rating_count"),
    "downloadable": pattern.get("downloadable"),
}

pd.DataFrame([record])


Unnamed: 0,pattern_id,name,permalink,published,free,price,currency,yarn_weight,yarn_weight_description,gauge,...,yardage,yardage_max,yardage_description,projects_count,favorites_count,queued_projects_count,difficulty_average,rating_average,rating_count,downloadable
0,7498393,Elza Sweater,elza-sweater,2026/01/01,False,7.0,GBP,"{'crochet_gauge': None, 'id': 10, 'knit_gauge'...",Sport / 5 ply (12 wpi),26.0,...,2187,3784,2187 - 3784 yards,32,725,65,6.0,5.0,7,True


In [32]:
# Explore yarn_weight field structure
yw = pattern.get("yarn_weight")
print("yarn_weight type:", type(yw))
print("yarn_weight keys:", list(yw.keys()) if isinstance(yw, dict) else None)

record2 = {
    "pattern_id": pattern.get("id"),
    "published": pattern.get("published"),
    "free": pattern.get("free"),
    "price": pattern.get("price"),
    "currency": pattern.get("currency"),
    "yarn_weight_id": yw.get("id") if isinstance(yw, dict) else None,
    "yarn_weight_description": pattern.get("yarn_weight_description"),
    "gauge": pattern.get("gauge"),
    "row_gauge": pattern.get("row_gauge"),
    "gauge_description": pattern.get("gauge_description"),
    "yardage": pattern.get("yardage"),
    "yardage_max": pattern.get("yardage_max"),
    "projects_count": pattern.get("projects_count"),
    "favorites_count": pattern.get("favorites_count"),
    "queued_projects_count": pattern.get("queued_projects_count"),
}

pd.DataFrame([record2])


yarn_weight type: <class 'dict'>
yarn_weight keys: ['crochet_gauge', 'id', 'knit_gauge', 'max_gauge', 'min_gauge', 'name', 'ply', 'wpi']


Unnamed: 0,pattern_id,published,free,price,currency,yarn_weight_id,yarn_weight_description,gauge,row_gauge,gauge_description,yardage,yardage_max,projects_count,favorites_count,queued_projects_count
0,7498393,2026/01/01,False,7.0,GBP,10,Sport / 5 ply (12 wpi),26.0,32.0,26 stitches and 32 rows = 4 inches in stranded...,2187,3784,32,725,65


In [33]:
# Find craft/category-related fields (scan keys)

keywords = ["craft", "category", "categories", "pattern_type", "type"]
matches = [k for k in pattern.keys() if any(word in k.lower() for word in keywords)]
print("Possible craft/category keys:", matches)


Possible craft/category keys: ['yarn_list_type', 'craft', 'pattern_categories', 'pattern_type']


9. Create final structured records for all patterns

In [39]:

records = []

for pattern in raw_data:  # lista 100 patternów
    record = {
        "pattern_id": pattern.get("id"),
        "name": pattern.get("name"),
        "permalink": pattern.get("permalink"),
        "published": pattern.get("published"),
        "free": pattern.get("free"),
        "price": pattern.get("price"),
        "currency": pattern.get("currency"),

        "craft": pattern.get("craft"),
        "pattern_categories": json.dumps(
            pattern.get("pattern_categories"),
            ensure_ascii=False
        ),

        "yarn_weight": pattern.get("yarn_weight"),
        "yarn_weight_description": pattern.get("yarn_weight_description"),

        "gauge": pattern.get("gauge"),
        "row_gauge": pattern.get("row_gauge"),
        "gauge_description": pattern.get("gauge_description"),

        "yardage": pattern.get("yardage"),
        "yardage_max": pattern.get("yardage_max"),
        "yardage_description": pattern.get("yardage_description"),

        "projects_count": pattern.get("projects_count"),
        "favorites_count": pattern.get("favorites_count"),
        "queued_projects_count": pattern.get("queued_projects_count"),

        "difficulty_average": pattern.get("difficulty_average"),
        "rating_average": pattern.get("rating_average"),
        "rating_count": pattern.get("rating_count"),

        "downloadable": pattern.get("downloadable"),
    }

    records.append(record)

df = pd.DataFrame(records)
df.to_csv(PROCESSED_CSV_PATH, index=False)

df.head()


Unnamed: 0,pattern_id,name,permalink,published,free,price,currency,craft,pattern_categories,yarn_weight,...,yardage,yardage_max,yardage_description,projects_count,favorites_count,queued_projects_count,difficulty_average,rating_average,rating_count,downloadable
0,7498393,Elza Sweater,elza-sweater,,False,,,,,,...,,,,,,,,,,
1,7498528,Sway Sweater,sway-sweater,,True,,,,,,...,,,,,,,,,,
2,1255578,Step by Step Sweater,step-by-step-sweater,,True,,,,,,...,,,,,,,,,,
3,7497807,Ridgewood Sweater,ridgewood-sweater,,False,,,,,,...,,,,,,,,,,
4,7497838,BABA sweater Chunky,baba-sweater-chunky,,False,,,,,,...,,,,,,,,,,


10. Save basic CSV

In [40]:
csv_path = os.path.join(PROCESSED_DIR, "patterns_basic.csv")
df.to_csv(csv_path, index=False)

csv_path

'../data/processed/v1\\patterns_basic.csv'