Data Source: 
* NYC Open Data API for restaurants data
* https://dev.socrata.com/foundry/data.cityofnewyork.us/43nn-pn8j
* NewsAPI for articles (json files)
* https://newsapi.org/

Technologies:
1. **Request API** from NewsAPI to get "food" related articles & NYC Open Data API for restaurants data
2. Store articles data and restaurants data in **MongoDB**
3. **Flask** to get user input cuisine and zipcode, output restaurants and articles.

Cannot be run on Google Collab because localhost

# NYC Restaurants data

Import data from Socrata Open Data API, give it a minute to load.

In [1]:
#pip install sodapy

In [2]:
# Ignore warning
from sodapy import Socrata
import pandas as pd

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.cityofnewyork.us", None)

# Returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("43nn-pn8j", limit=300000)

# Convert to pandas DataFrame
restaurants = pd.DataFrame.from_records(results)



### Clean data

* Columns dba(restaurant name), cuisine_description, and zipcode contain missing values
* Lowercase cuisine_description for easier search
* Duplicate restaurant records, drop duplicates

In [3]:
# Ignore warning
columns_to_check = ['dba', 'cuisine_description', 'zipcode']
df = restaurants.dropna(subset=columns_to_check)

df['zipcode'] = df['zipcode'].astype(int).astype(str)
df.loc[:,'cuisine_description'] = df['cuisine_description'].str.lower()

restaurants_cleaned = df.drop_duplicates(subset=['dba'], keep='first')  #Keeps only the first occurrence

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['zipcode'] = df['zipcode'].astype(int).astype(str)


# Articles: 

Since this API permits request as far back as one month ago, calculate date of one month ago

In [4]:
#pip install python-dateutil

In [5]:
import datetime
from dateutil.relativedelta import relativedelta

# Get one month ago's date
today_date = datetime.date.today()
one_month_ago = today_date - relativedelta(months=1)
one_month_ago_str = one_month_ago.strftime('%Y-%m-%d')

In [6]:
import requests

# Keywords for now: dining, cuisine, restaurant, recipe. 
# Add more or modify to improve search results.
url = (
    f'https://newsapi.org/v2/everything?'
    f'q=(dining OR cuisine OR restaurant OR recipe)&'
    f'from={one_month_ago_str}&'
       'apiKey=7152e173864048d6934a8df325418c66')

response = requests.get(url)
data = response.json()
articles = data.get('articles', [])

# Load data into MongoDB

In [7]:
from pymongo import MongoClient
client = MongoClient('localhost',27017)
db = client.proj5400
collection_articles = db.articles_data 
collection_restaurants = db.restaurants_data

In [8]:
# Insert articles data into MongoDB
if articles:
    collection_articles.insert_many(articles)
else:
    print("No data to insert.")

In [9]:
# Insert restaurants data into MongoDB
restaurants_dict = restaurants_cleaned.to_dict('records')
collection_restaurants.insert_many(restaurants_dict)

InsertManyResult([ObjectId('672a7e51f7ade027d8d3c643'), ObjectId('672a7e51f7ade027d8d3c644'), ObjectId('672a7e51f7ade027d8d3c645'), ObjectId('672a7e51f7ade027d8d3c646'), ObjectId('672a7e51f7ade027d8d3c647'), ObjectId('672a7e51f7ade027d8d3c648'), ObjectId('672a7e51f7ade027d8d3c649'), ObjectId('672a7e51f7ade027d8d3c64a'), ObjectId('672a7e51f7ade027d8d3c64b'), ObjectId('672a7e51f7ade027d8d3c64c'), ObjectId('672a7e51f7ade027d8d3c64d'), ObjectId('672a7e51f7ade027d8d3c64e'), ObjectId('672a7e51f7ade027d8d3c64f'), ObjectId('672a7e51f7ade027d8d3c650'), ObjectId('672a7e51f7ade027d8d3c651'), ObjectId('672a7e51f7ade027d8d3c652'), ObjectId('672a7e51f7ade027d8d3c653'), ObjectId('672a7e51f7ade027d8d3c654'), ObjectId('672a7e51f7ade027d8d3c655'), ObjectId('672a7e51f7ade027d8d3c656'), ObjectId('672a7e51f7ade027d8d3c657'), ObjectId('672a7e51f7ade027d8d3c658'), ObjectId('672a7e51f7ade027d8d3c659'), ObjectId('672a7e51f7ade027d8d3c65a'), ObjectId('672a7e51f7ade027d8d3c65b'), ObjectId('672a7e51f7ade027d8d3c6

# Flask input and output

In [10]:
from flask import Flask, request, render_template
app = Flask("Interactive App")

@app.route('/', methods=['GET'])
def my_form():
    return render_template("search_form.html")

@app.route('/', methods=['POST'])
def search_articles():
    search_term = request.form['search_term']
    filter_term = request.form['filter_term']  

    article_query = {
        '$or': [
            {'title': {'$regex': search_term, '$options': 'i'}},
            {'description': {'$regex': search_term, '$options': 'i'}},
            {'content': {'$regex': search_term, '$options': 'i'}}
        ]
    }
    results = collection_articles.find(article_query)
    articles = list(results)
    
    restaurant_query = {
        '$and':[
            {'cuisine_description': {'$regex': search_term}},
            {'zipcode': {'$regex': filter_term}}
        ]
    }
    results2 = collection_restaurants.find(restaurant_query)
    restaurants = list(results2)
    
    return render_template('results.html', articles=articles, restaurants=restaurants)

In [11]:
app.run(host='localhost', port=5003)

 * Serving Flask app 'Interactive App'
 * Debug mode: off


 * Running on http://localhost:5003
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [05/Nov/2024 15:21:44] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [05/Nov/2024 15:21:49] "POST / HTTP/1.1" 200 -
