In [None]:
# Dependencies
import pandas as pd
import numpy as np

# import psycopg2
from sqlalchemy import create_engine
import statistics

In [None]:
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans

In [None]:
# example of a dummy variable encoding
from numpy import asarray
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

## cleaning data

## mortality data

### trying hot encode

In [7]:
# Import file
file = '../Data/Breast_cancer_bySex_Race_mortalityData.txt'
df = pd.read_csv(file, encoding="ISO-8859-1", sep='\t')
display(df)

Unnamed: 0,Notes,Leading Cancer Sites,Leading Cancer Sites Code,State,State Code,Year,Year Code,Race,Race Code,Sex,Sex Code,Deaths,Population,Age-Adjusted Rate
0,,Breast,26000.0,Alabama,1.0,2000.0,2000.0,Black or African American,2054-5,Female,F,166.0,627439.0,29.7
1,,Breast,26000.0,Alabama,1.0,2000.0,2000.0,White,2106-3,Female,F,524.0,1643761.0,25.9
2,,Breast,26000.0,Alabama,1.0,2001.0,2001.0,Black or African American,2054-5,Female,F,219.0,630901.0,38.7
3,,Breast,26000.0,Alabama,1.0,2001.0,2001.0,White,2106-3,Female,F,528.0,1645541.0,25.8
4,,Breast,26000.0,Alabama,1.0,2002.0,2002.0,Black or African American,2054-5,Female,F,167.0,633374.0,29.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016,data set. Death rates may differ slightly from...,,,,,,,,,,,,,
2017,"4. For the 2005 year, the Census Bureau estima...",,,,,,,,,,,,,
2018,Texas due to Hurricanes Katrina and Rita. CDC ...,,,,,,,,,,,,,
2019,"these states, nor are these counts included in...",,,,,,,,,,,,,


In [8]:
# Identify incomplete rows
df.count()

Notes                          54
Leading Cancer Sites         1967
Leading Cancer Sites Code    1967
State                        1967
State Code                   1967
Year                         1967
Year Code                    1967
Race                         1967
Race Code                    1967
Sex                          1967
Sex Code                     1967
Deaths                       1967
Population                   1967
Age-Adjusted Rate            1967
dtype: int64

In [9]:
# Drop all columns without any information
df_dropped = df.drop(columns=['Notes','Leading Cancer Sites Code','State Code','Race Code','Year Code','Sex','Age-Adjusted Rate','Year','Population'],axis=1)
df_dropped = df_dropped.dropna(how='any')
display(df_dropped)

Unnamed: 0,Leading Cancer Sites,State,Race,Sex Code,Deaths
0,Breast,Alabama,Black or African American,F,166.0
1,Breast,Alabama,White,F,524.0
2,Breast,Alabama,Black or African American,F,219.0
3,Breast,Alabama,White,F,528.0
4,Breast,Alabama,Black or African American,F,167.0
...,...,...,...,...,...
1962,Breast,Wyoming,White,F,50.0
1963,Breast,Wyoming,White,F,68.0
1964,Breast,Wyoming,White,F,68.0
1965,Breast,Wyoming,White,F,56.0


In [10]:
# Drop all columns without any information
# df_dropped['Year'] = df_dropped['Year'].apply(np.int32)
df_dropped['Deaths'] = df_dropped['Deaths'].apply(np.int32)
# df_dropped['Population'] = df_dropped['Population'].apply(np.int64)
display(df_dropped)

Unnamed: 0,Leading Cancer Sites,State,Race,Sex Code,Deaths
0,Breast,Alabama,Black or African American,F,166
1,Breast,Alabama,White,F,524
2,Breast,Alabama,Black or African American,F,219
3,Breast,Alabama,White,F,528
4,Breast,Alabama,Black or African American,F,167
...,...,...,...,...,...
1962,Breast,Wyoming,White,F,50
1963,Breast,Wyoming,White,F,68
1964,Breast,Wyoming,White,F,68
1965,Breast,Wyoming,White,F,56


In [11]:
df_dropped.count()

Leading Cancer Sites    1967
State                   1967
Race                    1967
Sex Code                1967
Deaths                  1967
dtype: int64

In [12]:
# retrieve the array of data
dataset = df_dropped.values

In [13]:
# separate into input and output columns
X = dataset[:, :-1].astype(str)
y = dataset[:, -1].astype(str)

In [14]:
# one hot encode input variables
onehot_encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')
X = onehot_encoder.fit_transform(X)

In [15]:
# ordinal encode target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [16]:
# summarize the transformed data
print('Input', X.shape)
print(X[:10, :])

Input (1967, 58)
[[1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 1. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 1. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 1. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 1. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 1. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.

In [17]:
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [None]:
# OneHotEncoder(handle_unknown='ignore')

In [18]:
# one-hot encode input variables
onehot_encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')
onehot_encoder.fit(X_train)
X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)

In [19]:
# ordinal encode target variable
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

In [23]:
label_encoder.classes_
label_encoder.classes_ = np.append(label_encoder.classes_, "new_value")

In [28]:
cat_columns = list(df_dropped)

In [31]:
encoders = {'cat_columns'}
for i in cat_columns:
    encoders[i] = LabelEncoder()
    df[i] = encoders[i].fit_transform(df[i])
    
for i in cat_columns:
    df1[i] = encoders[i].transform(df1[i])

TypeError: 'set' object does not support item assignment

In [24]:
# label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
# define the model
model = LogisticRegression()