## <span style="color:brown"><strong> Creating a Data Pipeline to Monitor Local Crime Trends </strong></span>

In [None]:
# # Install prefect if missing
# %pip install prefect
# Import necessary libraries
from dotenv import load_dotenv
import pandas as pd
from sodapy import Socrata
import os
import logging
import datetime as dt
from datetime import timedelta
from prefect import task


In [None]:
# Socrata dataset identifiers on data.cambridgema.gov, paired with the label
# this notebook uses for each dataset.
# NOTE(review): the variable name is misspelled ("dataeset"); it is kept as-is
# so that any existing reference to it keeps working.
dataeset_ids = [
    {"name": "Incident_Data", "id": "3gki-wyrb"},
    {"name": "Budget_Salaries", "id": "ixg8-tyau"},
    {"name": "Crime_Reports", "id": "xuad-73uj"},
    # This entry previously used "2z9k-mv9g", which the endpoint list below
    # attributes to Commonwealth Connect Service Requests. "ppai-cur6" is the
    # id listed there for Computer Aided Dispatch Entries.
    {"name": "Computer_Aided_Dispatch_Entries", "id": "ppai-cur6"},
    {"name": "Housing_Median_Sales_Prices", "id": "9nnq-4isb"},
    {"name": "Building_Permits_Addition_Alteration", "id": "qu2z-8suj"},
]

# Crime Reports:- https://data.cambridgema.gov/api/v3/views/xuad-73uj/query.json
# Budget - Salaries:- https://data.cambridgema.gov/api/v3/views/ixg8-tyau/query.json
# Computer Aided Dispatch Entries:- https://data.cambridgema.gov/api/v3/views/ppai-cur6/query.json
# Commonwealth Connect Service Requests:- https://data.cambridgema.gov/api/v3/views/2z9k-mv9g/query.json
# Housing Median Sales Prices:- https://data.cambridgema.gov/api/v3/views/9nnq-4isb/query.json
# Building Permits: Addition/Alteration:- https://data.cambridgema.gov/api/v3/views/qu2z-8suj/query.json

In [None]:
# Extract one dataset from the data.cambridgema.gov Socrata portal (for example
# the different categories of data reported by the Cambridge Police Department).
# To run it as a Prefect task with up to 3 retries, 10 seconds apart, decorate it with:
# @task(retries=3, retry_delay_seconds=[10, 10, 10])
def extract_data_from_api(
    dataset_id: str = "9nnq-4isb",
    data_category: str = "Housing_Median_Sales_Prices",
    limit: int = 2000,
) -> pd.DataFrame:
    """
    Download every record of one dataset through the Socrata Open Data API.

    Records are fetched page by page (``limit`` rows per request) until the
    API returns a short or empty page, then combined into a single DataFrame.

    Args:
        dataset_id (str): Socrata dataset identifier, e.g. ``"xuad-73uj"``.
            Defaults to the Housing Median Sales Prices dataset so that the
            existing zero-argument call keeps working.
        data_category (str): Human-readable label for the dataset; only used
            in the progress message. Defaults to ``"Housing_Median_Sales_Prices"``.
        limit (int, optional): Page size, i.e. the maximum number of records
            requested per API call. Defaults to 2000.

    Returns:
        pd.DataFrame: One row per record returned by the API (empty if the
        dataset has no records).

    Raises:
        ValueError: If ``limit`` is smaller than 1.
    """
    if limit < 1:
        # A non-positive page size would never advance the offset.
        raise ValueError(f'limit must be a positive integer, got {limit!r}')

    # Load environment variables from .env file
    load_dotenv()
    # Retrieve API credentials from environment variables
    user_name = os.getenv('User_Name')
    password = os.getenv('Password')
    app_token = os.getenv('App_Token')
    timeout_sec = 30

    all_results = []
    offset = 0

    # Client authentication; the context manager closes the underlying HTTP
    # session even if a request raises.
    with Socrata(
        'data.cambridgema.gov',
        app_token,
        username=user_name,
        password=password,
        timeout=timeout_sec,
    ) as client:
        while True:
            # A stable sort order is required for limit/offset paging;
            # without it, rows can be duplicated or skipped between pages.
            page = client.get(dataset_id, limit=limit, offset=offset, order=':id')
            all_results.extend(page)
            # A short (or empty) page means the dataset is exhausted, so no
            # extra request is needed to discover the end.
            if len(page) < limit:
                break
            offset += limit

    print(f'Total Records retrieved for {data_category} is {len(all_results)}')

    return pd.DataFrame.from_records(all_results)

In [None]:
# Example of reading one id/name pair from the dataset list (kept for reference):
# for dataset in dataeset_ids[0:1]:
#     id = dataset['id']
#     name = dataset['name']
#     print(id, name)

# Pull the dataset from the API into a DataFrame. The saved output below
# reports 23 records for Housing_Median_Sales_Prices.
df = extract_data_from_api()


Total Records retrieved for Housing_Median_Sales_Prices is 23


Unnamed: 0,year,single_family,two_family,three_family,condominium
0,2002,649500,619750,743750,359750
1,2003,660000,611250,735000,359000
2,2004,647000,652250,760000,375000
3,2005,725000,709000,752500,419500
4,2006,795000,687000,705000,420000
5,2007,650000,650000,790570,416500
6,2008,790000,711000,827500,419500
7,2009,682500,620000,845000,415000
8,2010,760000,705000,822000,424000
9,2011,742500,696125,772500,422000
