In [39]:
# Set dependencies
import requests
import pandas as pd

In [40]:
# Set JSON source and request.
# The URL-encoded $query parameter decodes to:
#   SELECT `statefips`, avg(`ds_pm_pred`) AS `avg_ds_pm_pred`, `year`
#   GROUP BY `statefips`, `year`
url = "https://data.cdc.gov/resource/96sd-hxdt.json?$query=SELECT%20%60statefips%60%2C%20avg(%60ds_pm_pred%60)%20AS%20%60avg_ds_pm_pred%60%2C%20%60year%60%0AGROUP%20BY%20%60statefips%60%2C%20%60year%60"
# Bound the wait (in seconds) so an unresponsive API cannot hang the notebook indefinitely
response = requests.get(url, timeout=30)

In [41]:
# Stop on any reply other than HTTP 200; otherwise decode the JSON body
if response.status_code != 200:
    raise Exception("Failed to fetch data from the API")
PM25byState = response.json()

In [42]:
# Ensure the JSON pull is less than the limit allowed by the CDC API (1000 rows).
# A pull that reaches the limit may have been cut off, so fail here rather than
# carry a possibly incomplete dataset into the steps below.
API_ROW_LIMIT = 1000
assert len(PM25byState) < API_ROW_LIMIT, (
    f"API returned {len(PM25byState)} rows; the result may be truncated at {API_ROW_LIMIT}"
)
len(PM25byState)

245

In [43]:
# Load the parsed JSON payload into a pandas DataFrame
PM25byState_df = pd.DataFrame(data=PM25byState)

In [44]:
# Display the dataframe for review; the bare expression on the last line makes
# the notebook render it as the cell output
PM25byState_df

Unnamed: 0,statefips,avg_ds_pm_pred,year
0,1,9.7253863786574711,2016
1,1,9.3187528553336354,2017
2,1,8.9118843575354085,2018
3,1,9.1673524997966700,2019
4,1,8.7303361112061786,2020
...,...,...,...
240,9,7.1726999372804155,2016
241,9,6.4590581125964605,2017
242,9,7.3704599467918106,2018
243,9,7.2835365996331609,2019


In [45]:
# Export the dataframe (without its index) to CSV for BI/Tableau analysis
pm25_csv_path = "Resources/PM25byState.csv"
PM25byState_df.to_csv(pm25_csv_path, index=False)

In [139]:
# LEFT EMPTY TO BREAK UP CODE BLOCKS

In [140]:
# Load the tab-separated clinical data file from the Resources folder into a
# pandas DataFrame, then display it
clinical_tsv_path = 'Resources/nsclc_tcga_broad_2016_clinical_data.tsv'
lung_cancer_data_df = pd.read_csv(clinical_tsv_path, delimiter='\t')
lung_cancer_data_df

Unnamed: 0,Study ID,Patient ID,Sample ID,Diagnosis Age,Age At Surgery,Cancer Type,Cancer Type Detailed,Death from Initial Pathologic Diagnosis Date,Days to Last Followup,Fraction Genome Altered,...,Overall Survival (Months),Overall Survival Status,Number of Samples Per Patient,Sex,Smoking History,Person Cigarette Smoking History Pack Year Value,Somatic Status,Stage,TMB (nonsynonymous),T Stage
0,nsclc_tcga_broad_2016,LUAD-2GUGK,LUAD-2GUGK-Tumor,,68.0,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.0174,...,,,1,Female,"Current Reformed Smoker, Duration Not Specified",30.00,Matched,IIA,7.933333,
1,nsclc_tcga_broad_2016,LUAD-5O6B5,LUAD-5O6B5-Tumor,,56.0,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.0356,...,,,1,Female,Lifelong Non-Smoker,0.00,Matched,IB,1.033333,
2,nsclc_tcga_broad_2016,LUAD-5V8LT,LUAD-5V8LT-Tumor,,52.0,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.3895,...,,,1,Male,"Current Reformed Smoker, Duration Not Specified",6.00,Matched,IB,49.433333,
3,nsclc_tcga_broad_2016,LUAD-74TBW,LUAD-74TBW-Tumor,,73.0,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.1757,...,,,1,Male,"Current Reformed Smoker, Duration Not Specified",61.25,Matched,IB,7.800000,
4,nsclc_tcga_broad_2016,LUAD-AEIUF,LUAD-AEIUF-Tumor,,60.0,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,,0.1942,...,,,1,Female,"Current Reformed Smoker, Duration Not Specified",73.50,Matched,,5.333333,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1139,nsclc_tcga_broad_2016,TCGA-O2-A52V,TCGA-O2-A52V-01,75.0,,Non-Small Cell Lung Cancer,Lung Squamous Cell Carcinoma,1335.0,,0.2382,...,43.91,1:DECEASED,1,Female,Current Reformed Smoker For < Or = 15 Years,1.00,Matched,II,7.033333,T3
1140,nsclc_tcga_broad_2016,TCGA-O2-A52W,TCGA-O2-A52W-01,63.0,,Non-Small Cell Lung Cancer,Lung Squamous Cell Carcinoma,261.0,,0.5420,...,8.59,1:DECEASED,1,Male,Current Reformed Smoker For < Or = 15 Years,2.50,Matched,I,3.400000,T2
1141,nsclc_tcga_broad_2016,TCGA-O2-A5IB,TCGA-O2-A5IB-01,71.0,,Non-Small Cell Lung Cancer,Lung Squamous Cell Carcinoma,340.0,,0.4405,...,11.18,1:DECEASED,1,Female,Current Reformed Smoker For < Or = 15 Years,2.50,Matched,III,7.200000,T3
1142,nsclc_tcga_broad_2016,TCGA-S2-AA1A,TCGA-S2-AA1A-01,68.0,,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,,121.0,0.0598,...,4.00,0:LIVING,1,Female,Current Reformed Smoker For > 15 Years,95.00,Matched,IA,3.633333,T1b


In [141]:
# Numeric codes for each categorical column, keyed by column name.
# Values that are not listed in a column's mapping are left as they are by replace().
category_codes = {
    'Cancer Type Detailed': {
        'Lung Adenocarcinoma': 1,
        'Lung Squamous Cell Carcinoma': 0,
    },
    'Prior Cancer Diagnosis Occurence': {
        'No': 0,
        'Yes, history of prior malignancy': 1,
        'Yes, history of synchronous and or bilateral malignancy': 1,
    },
    'Sex': {
        'Male': 0,
        'Female': 1,
    },
    'Smoking History': {
        'Lifelong Non-Smoker': 0,
        'Current Smoker': 1,
        'Current Reformed Smoker For < Or = 15 Years': 1,
        'Current Reformed Smoker For > 15 Years': 1,
        'Current Reformed Smoker, Duration Not Specified': 1,
    },
}

# Recode one column at a time, writing the result back into the same dataframe
for column_name, codes in category_codes.items():
    lung_cancer_data_df[column_name] = lung_cancer_data_df[column_name].replace(codes)

  lung_cancer_data_df['Cancer Type Detailed'] = lung_cancer_data_df['Cancer Type Detailed'].replace(
  lung_cancer_data_df['Prior Cancer Diagnosis Occurence'] = lung_cancer_data_df['Prior Cancer Diagnosis Occurence'].replace(
  lung_cancer_data_df['Sex'] = lung_cancer_data_df['Sex'].replace(
  lung_cancer_data_df['Smoking History'] = lung_cancer_data_df['Smoking History'].replace(


In [142]:
# Columns carried forward: 'Diagnosis Age' plus 'Overall Survival Status'.
# Other candidate columns are currently excluded from the selection:
#   'Fraction Genome Altered', 'Mutation Count', 'TMB (nonsynonymous)',
#   'Cancer Type Detailed', 'Prior Cancer Diagnosis Occurence', 'Smoking History',
#   'Sex', 'Person Cigarette Smoking History Pack Year Value'
selected_columns = ['Diagnosis Age', 'Overall Survival Status']
lung_cancer_data_df_new = lung_cancer_data_df[selected_columns]

# The earlier step that dropped 'Study ID', 'Cancer Type',
# 'Number of Samples Per Patient' and 'Somatic Status' is disabled; none of
# those columns is part of the selection above.

# Drop every row that has an NA value in any of the selected columns
lung_cancer_data_df_cleaned = lung_cancer_data_df_new.dropna(axis=0, how='any')
lung_cancer_data_df_cleaned

Unnamed: 0,Diagnosis Age,Overall Survival Status
159,70.0,0:LIVING
160,67.0,0:LIVING
161,79.0,1:DECEASED
162,68.0,0:LIVING
163,66.0,0:LIVING
...,...,...
1139,75.0,1:DECEASED
1140,63.0,1:DECEASED
1141,71.0,1:DECEASED
1142,68.0,0:LIVING


In [143]:
# Split the cleaned data into labels (y) and features (X)
label_column = 'Overall Survival Status'

# y: the label Series
y = lung_cancer_data_df_cleaned[label_column]

# X: the cleaned frame with the label column removed
X = lung_cancer_data_df_cleaned.drop(label_column, axis=1)


In [144]:
# Preview the first ten labels in y
y.head(n=10)

159      0:LIVING
160      0:LIVING
161    1:DECEASED
162      0:LIVING
163      0:LIVING
164      0:LIVING
165      0:LIVING
166    1:DECEASED
167    1:DECEASED
168    1:DECEASED
Name: Overall Survival Status, dtype: object

In [145]:
# Preview the first five rows of the feature DataFrame X
X.head(n=5)

Unnamed: 0,Diagnosis Age
159,70.0
160,67.0
161,79.0
162,68.0
163,66.0


In [146]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Split features and labels into train and test sets;
# random_state=1 keeps the split reproducible between runs
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [147]:
# Import the LogisticRegression estimator from scikit-learn
from sklearn.linear_model import LogisticRegression

# Model settings: liblinear solver, max_iter of 1200, random_state of 1
model_settings = {'solver': 'liblinear', 'max_iter': 1200, 'random_state': 1}
classifier = LogisticRegression(**model_settings)

# Fit the model on the training split
classifier.fit(X_train, y_train)

In [148]:
# Predict labels for the test split
test_predictions = classifier.predict(X_test)

# Put predictions next to the actual labels, then renumber the rows from 0
comparison = pd.DataFrame({"Prediction": test_predictions, "Actual": y_test})
results = comparison.reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0:LIVING,0:LIVING
1,0:LIVING,0:LIVING
2,0:LIVING,0:LIVING
3,0:LIVING,0:LIVING
4,0:LIVING,1:DECEASED
5,0:LIVING,0:LIVING
6,0:LIVING,0:LIVING
7,0:LIVING,0:LIVING
8,0:LIVING,0:LIVING
9,0:LIVING,1:DECEASED


In [149]:
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

# Build the confusion matrix from the actual and predicted test labels
test_matrix = confusion_matrix(y_true=y_test, y_pred=test_predictions)

In [150]:
# Print the confusion matrix for the model
print(test_matrix)

# Create and save the testing classification report.
# zero_division=0 reports 0 for a class the model never predicts (the same
# values the default setting prints) without raising UndefinedMetricWarning.
testing_report = classification_report(y_test, test_predictions, zero_division=0)

# Print the testing classification report
print(testing_report)

[[165   0]
 [ 74   0]]
              precision    recall  f1-score   support

    0:LIVING       0.69      1.00      0.82       165
  1:DECEASED       0.00      0.00      0.00        74

    accuracy                           0.69       239
   macro avg       0.35      0.50      0.41       239
weighted avg       0.48      0.69      0.56       239



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
