In [7]:
# Dependencies
import pandas as pd
import requests
import json

In [16]:
# Endpoint for the data.cdc.gov dataset "n8mc-b4w4" (JSON format). The query
# string asks for at most 1000 rows starting at offset 0 and filters on
# death_yn == "No".
url = (
    "https://data.cdc.gov/resource/n8mc-b4w4.json"
    "?$limit=1000&$offset=0&death_yn=No"
)

In [17]:
# Fetch the records from the CDC endpoint and parse the JSON body.
# - timeout: requests waits indefinitely by default, which can hang the kernel.
# - raise_for_status: fail loudly on an HTTP error instead of treating an
#   error payload as if it were data.
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()
response = http_response.json()

# Pretty-print only the first two records as a preview; dumping every row
# floods the notebook output.
print(json.dumps(response[:2], indent=4, sort_keys=True))

[
    {
        "age_group": "NA",
        "case_month": "2020-05",
        "case_onset_interval": "0.0",
        "county_fips_code": "21179",
        "current_status": "Laboratory-confirmed case",
        "death_yn": "NA",
        "ethnicity": "NA",
        "exposure_yn": "Missing",
        "hosp_yn": "Yes",
        "icu_yn": "Missing",
        "process": "Missing",
        "race": "NA",
        "res_county": "NELSON",
        "res_state": "KY",
        "sex": "NA",
        "state_fips_code": "21",
        "symptom_status": "Symptomatic"
    },
    {
        "age_group": "0 - 17 years",
        "case_month": "2020-12",
        "case_onset_interval": "0.0",
        "county_fips_code": "39161",
        "current_status": "Laboratory-confirmed case",
        "death_yn": "No",
        "ethnicity": "NA",
        "exposure_yn": "Unknown",
        "hosp_yn": "No",
        "icu_yn": "Missing",
        "process": "Clinical evaluation",
        "race": "NA",
        "res_county": "VAN WERT",
   

In [18]:
# Load the parsed JSON records into a DataFrame (one row per record) and
# display it as the cell's output.
meow = pd.DataFrame(data=response)
meow

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,case_positive_specimen,underlying_conditions_yn
0,2020-05,KY,21,NELSON,21179,,,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Yes,Missing,,,
1,2020-12,OH,39,VAN WERT,39161,0 - 17 years,,,,0.0,Clinical evaluation,Unknown,Laboratory-confirmed case,Symptomatic,No,Missing,No,,
2,2021-05,OH,39,WYANDOT,39175,0 - 17 years,,,,0.0,Clinical evaluation,Unknown,Probable Case,Symptomatic,No,No,No,0.0,
3,2020-12,MI,26,HURON,26063,18 to 49 years,,,,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,,,
4,2020-05,WI,55,POLK,55095,18 to 49 years,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2020-11,MN,27,LE SUEUR,27079,50 to 64 years,Male,Unknown,Missing,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,0.0,
996,2020-09,NY,36,GENESEE,36037,65+ years,Male,Missing,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,6.0,
997,2020-11,MI,26,HOUGHTON,26061,65+ years,Male,,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,,,
998,2020-11,OR,41,JEFFERSON,41031,50 to 64 years,Female,American Indian/Alaska Native,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,,


# Quantifying Linear Regression

Create a model to quantify: generate a small synthetic regression dataset and fit a linear model to it.

In [1]:
# Import dependencies
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# Build a small synthetic dataset: 20 samples with a single feature, a noise
# level of 4 and a bias (intercept) of 100. The fixed random_state makes the
# generated data reproducible.
X, y = make_regression(
    n_samples=20,
    n_features=1,
    noise=4,
    bias=100.0,
    random_state=0,
)

# Instantiate the linear estimator
model = LinearRegression()

# Fit (train) it on the full dataset; fit() is the last expression, so the
# cell displays the fitted estimator's repr.
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## Quantifying our Model

* Mean Squared Error (MSE)

* R2 Score

There are many ways to quantify a model's performance, but MSE and R2 are two of the most common.

In [2]:
from sklearn.metrics import mean_squared_error, r2_score

# Predictions for the same inputs the model was fitted on
predicted = model.predict(X)

# Compare the predictions against the true targets with both metrics
mse = mean_squared_error(y_true=y, y_pred=predicted)
r2 = r2_score(y_true=y, y_pred=predicted)

# Report the two scores
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2 ): {r2}")

Mean Squared Error (MSE): 11.933040779746149
R-squared (R2 ): 0.903603363418708


A "good" MSE score will be close to zero while a "good" [R2 Score](https://en.wikipedia.org/wiki/Coefficient_of_determination) will be close to 1.

The R2 score is the default scoring metric for many scikit-learn regression models.

In [4]:
# Overall score for the model on the same data it was fitted with. For this
# LinearRegression, score() reports R2 -- the displayed value matches the
# r2_score result printed in the previous cell.
model.score(X, y)

0.90360336341870795

## Validation

We also want to understand how well our model performs on new data. 

One approach for this is to split your data into a training and testing dataset.

You fit (train) the model using training data, and score and validate your model using the testing data.

This train/test splitting is so common that Sklearn provides a mechanism for doing this. 

## Testing and Training Data

In order to quantify our model against new input values, we often split the data into training and testing data. The model is then fit to the training data and scored on the test data. Scikit-learn's `model_selection` module provides `train_test_split` for automatically splitting the data into training and testing sets.

In [5]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for testing. test_size=0.25 is scikit-learn's
# default, spelled out here; random_state pins the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

Train the model using only the training data.

In [6]:
# Re-fit the existing `model` object using only the training split.
# NOTE(review): this replaces the earlier fit on the full dataset, so
# re-running earlier scoring cells after this one may give different values.
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Then score the model using the unseen testing data.

In [7]:
# Score on the held-out test split, which was not used in the preceding fit
# on the training split.
model.score(X_test, y_test)

0.92525224350441038

## Your Turn!