In [1]:
import time
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import numpy as np

In [2]:
def scrape(*args):
    max_results_city = 1
    step = 1
    job_search = "data+analyst"
    if args:
        cities = set(args)
    else:
        cities = {"New York", "San Francisco"}

    city_search = [city.replace(" ", "+") for city in cities]
    city_search.sort()
    
    city_list = [city for city in cities] * 10
    city_list.sort()
   
    url_list = get_url_list(city_search, job_search, max_results_city, step)

    results = scrape_links(url_list)
    results["Search_City"] = city_list
    
    jobs_df = get_dataframe(results)
    # Save to new csv file
    output = os.path.join('..', '03_Data', 'Job_Data.csv')
    jobs_df.to_csv(output, header=True, index=True, index_label="Id", encoding="utf-8-sig")
    print("Done")
    return results

In [3]:
def get_url_list(city_search, job_search, max_results_city, step):
    url_list = []
    for city in city_search:
        for start in range(0, max_results_city, step):
            search_url = "http://www.indeed.com/jobs?q="+str(job_search)+"&l="+str(city)+"&start="+str(start)
            results_page = requests.get(search_url)
            #browser.visit(search_url)
            time.sleep(1)  #ensuring at least 1 second between page grabs
            #html = browser.html
            soup = BeautifulSoup(results_page.text, "lxml",)

            h2 = soup.find_all('h2', class_="jobtitle")
            for stuff in h2:
                href = stuff.find('a')['href']
                url = "http://www.indeed.com/"+str(href)
                url_list.append(url)
    return url_list

In [4]:
def scrape_links(url_list):
    job_company = []
    job_desc = []
    job_location = []
    job_title = []

    for url in url_list:
        job_page = requests.get(url,)
        time.sleep(1) 
        
        job_soup = BeautifulSoup(job_page.text, "lxml")
        
        title=company=location=job_summary=""
            
        title_object = job_soup.find("b", class_="jobtitle")
        if title_object is not None:
            title = title_object.text
        job_title.append(title)

        company_object = job_soup.find("span", class_="company")
        if company_object is not None:
            company = company_object.text
        job_company.append(company)

        location_object = job_soup.find("span", class_="location")
        if location_object is not None:
            location = location_object.text
        job_location.append(location)

        job_summary_object = job_soup.find("span", id="job_summary")
        if job_summary_object is not None:
            job_summary = job_summary_object.get_text()
            
        job_desc.append(job_summary)
        
    
    result = { "Title":job_title,
               "Description": job_desc, 
               "Location": job_location,
               "Company": job_company,
               "Link" : url_list }

    return result


In [5]:
def get_dataframe(results):
    jobs_df = pd.DataFrame.from_dict(results)

    jobs_df = jobs_df[["Title","Company", "Location","Description", "Search_City", "Link"]]
    jobs_df.index = np.arange(1, len(jobs_df) + 1)
    return jobs_df

In [6]:
result=scrape()

Done


In [25]:
view2=pd.DataFrame(result)
view2.head()

Unnamed: 0,Company,Description,Link,Location,Search_City,Title
0,"Atlas Air, Inc.",Overview\nProvide strategic design and mainten...,http://www.indeed.com//rc/clk?jk=d3e4bb1f83467...,New York State,New York,Business Intelligence Analyst
1,"Kamsa, Inc.",Description:Perform collection and inputting o...,http://www.indeed.com//company/Kamsa-Inc/jobs/...,"New York, NY",New York,Data Analyst
2,Acacia Technical Services,Title: Data Analyst\nLocation either NYC or Pa...,http://www.indeed.com//rc/clk?jk=f9bc2efbe1dfb...,New York State,New York,Data Analyst
3,QBE,To assist Development Leads for Proof of Conce...,http://www.indeed.com//rc/clk?jk=be1de9ec0bd2e...,New York State,New York,Data Analyst Intern
4,The New York Times,About the position:\nWirecutter is seeking a D...,http://www.indeed.com//rc/clk?jk=2f4ab0a8ac3e0...,"New York, NY",New York,Data Analyst


In [26]:
view2.shape

(20, 6)

In [40]:
grouped_view2=view2.groupby("Title")
count=grouped_view2.count()
count
# total=count["Location"].sum()
# total

Unnamed: 0_level_0,Company,Description,Link,Location,Search_City
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Analytics & Insights Data Analyst,1,1,1,1,1
"Associate Data Scientist, Global Marketing Analytics",1,1,1,1,1
Business Analyst,1,1,1,1,1
Business Intelligence Analyst,1,1,1,1,1
Data Analyst,7,7,7,7,7
Data Analyst Intern,1,1,1,1,1
Data Analyst/Sr Data Analyst,1,1,1,1,1
Data Analytics,1,1,1,1,1
"Data Engineer, Analytics",1,1,1,1,1
Data Scientist / Engineer,1,1,1,1,1


In [44]:
view2_summary_table = pd.DataFrame({
    "Count": count["Location"],
    "Total": total,
    "Percentage": count["Location"]/total*100
})
view2_summary_table.head()

Unnamed: 0_level_0,Count,Percentage,Total
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Analytics & Insights Data Analyst,1,5.0,20
"Associate Data Scientist, Global Marketing Analytics",1,5.0,20
Business Analyst,1,5.0,20
Business Intelligence Analyst,1,5.0,20
Data Analyst,7,35.0,20


In [58]:
type(list(view2_summary_table.index))

list

In [57]:
type(list(view2_summary_table.Percentage))

list

In [48]:
ls1=view2_summary_table["Percentage"]
ls1

Title
Analytics & Insights Data Analyst                        5.0
Associate Data Scientist, Global Marketing Analytics     5.0
Business Analyst                                         5.0
Business Intelligence Analyst                            5.0
Data Analyst                                            35.0
Data Analyst Intern                                      5.0
Data Analyst/Sr Data Analyst                             5.0
Data Analytics                                           5.0
Data Engineer, Analytics                                 5.0
Data Scientist / Engineer                                5.0
Jr. Data Analyst                                         5.0
Junior Analyst                                           5.0
Marketing Data Analyst                                   5.0
Product Analyst                                          5.0
Name: Percentage, dtype: float64

In [None]:
# // View 1 - Working Pie Chart
# var trace1 = {
#   labels: ["beer", "wine", "martini", "margarita",
#       "ice tea", "rum & coke", "mai tai", "gin & tonic"],
#   values: [22.7, 17.1, 9.9, 8.7, 7.2, 6.1, 6.0, 4.6],
#   type: 'pie'
# };

# var data = [trace1];

# var layout = {
#   title: "'Bar' Chart",
# };

# Plotly.newPlot("plot", data, layout);

In [2]:
def get_view2_data(data):
    # Get data to create plot for view2
    # added by: Lena Corredor
    view2_data=get_dataframe(data)
    # view2=pd.DataFrame(data)
    grouped_view2=view2.groupby("Title")
    count=grouped_view2.count()
    total=count["Location"].sum()
    view2_summary_table = pd.DataFrame({
        "Count": count["Location"],
        "Total": total,
        "Percentage": count["Location"]/total*100
    })
    return view2_summary_table

In [1]:
import os
from flask_pymongo import PyMongo
import scrape_page
import flask_sqlalchemy
from flask_sqlalchemy import SQLAlchemy

from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

from flask import (
    Flask,
    render_template,
    jsonify,
    request,
    redirect)

app = Flask(__name__)

app.config['SQLALCHEMY_DATABASE_URI'] = "sqlite:///db/datanalyticsjobs.sqlite"
# app.config['SQLALCHEMY_DATABASE_URI'] = "sqlite:///db/pets.sqlite"

db = SQLAlchemy(app)

# Define a JobPosition class
### BEGIN SOLUTION
class DataAnalyticsJob(db.Model):
    __tablename__ = "job_position"
    
    id =db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String)
    company = db.Column(db.String) 
    location = db.Column(db.String) 
    description = db.Column(db.Text) 
    search_city = db.Column(db.String) 
    link = db.Column(db.String) 
    
    def __repr__(self):
        return '<Position %r>' % (self.id)

  'SQLALCHEMY_TRACK_MODIFICATIONS adds significant overhead and '


In [2]:
import pandas as pd

In [3]:
def get_view2_data(data):
    # Get data to create plot for view2
    # added by: Lena Corredor
    # view2_data=get_dataframe(data)
    title = [result[1] for result in data]
    location = [result[3] for result in data]
    view2=pd.DataFrame({"title": title, "location":location})
    
    print(view2)
    grouped_view2=view2.groupby("title")
    count=grouped_view2.count()
    total=count["Location"].sum()
    view2_summary_table = pd.DataFrame({
        "Count": count["Location"],
        "Total": total,
        "Percentage": count["Location"]/total*100
    })
    return view2_summary_table

# @app.route("/api/pals")
# def pals():
#     results = db.session.query(Pet.type, func.count(Pet.type)).group_by(Pet.type).all()

#     pet_type = [result[0] for result in results]
#     age = [result[1] for result in results]

#     pet_data = {
#         "x": pet_type,
#         "y": age,
#         "type": "bar"
#     }

#     return jsonify(pet_data)

In [14]:
def view2():
    results = db.session.query(DataAnalyticsJob.title, DataAnalyticsJob.location).all()
#     results = db.session.query(Pet.type, func.count(Pet.type)).group_by(Pet.type).all()
#     print("LW")
#     print(db)
#     print(db.session.query(DataAnalyticsJob))
#     print(results)
#     view2_data = get_view2_data(results)
#     print(view2_data)
#     view2_data=pd.DataFrame(results)

    title = [result[0] for result in results]
    location = [result[1] for result in results]
    view2=pd.DataFrame({"Title": title, "Location":location})

    print(view2)
    grouped_view2=view2.groupby("Title")
    count=grouped_view2.count()
    total=count["Location"].sum()
    view2_summary_table = pd.DataFrame({
        "Count": count["Location"],
        "Total": total,
        "Percentage": count["Location"]/total*100
    })
    print(view2_summary_table)
    titles = list(view2_summary_table.index)
    percentages = list(view2_summary_table["Percentage"])
    plot2_data = {
        "titles": titles,
        "percentages": percentages,
        "type": "pie"
    }
    print(plot2_data)
#     return jsonify(plot2_data)

In [13]:
view2()

  Location          Title
0  def_loc  default_title
1     loc2         title2
               Count  Percentage  Total
Title                                  
default_title      1        50.0      2
title2             1        50.0      2
{'titles': ['default_title', 'title2'], 'percentages': [50.0, 50.0], 'type': 'pie'}


RuntimeError: Working outside of application context.

This typically means that you attempted to use functionality that needed
to interface with the current application object in a way.  To solve
this set up an application context with app.app_context().  See the
documentation for more information.