In [27]:
# Import library
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [62]:
# Read the CSV file into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer="COVID-19BehaviorData_CAN_USA.csv")

In [63]:
# Summarize the freshly loaded frame: one row per column with its non-null
# count and dtype, plus the overall row/column totals
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28825 entries, 0 to 28824
Data columns (total 79 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Index               28825 non-null  int64  
 1   RecordNo            28825 non-null  object 
 2   endtime             28825 non-null  object 
 3   qweek               28825 non-null  object 
 4   i1_health           28825 non-null  int64  
 5   i2_health           28825 non-null  int64  
 6   i7a_health          28825 non-null  int64  
 7   i3_health           28825 non-null  object 
 8   i4_health           28825 non-null  object 
 9   i5_health_1         28825 non-null  object 
 10  i5_health_2         28825 non-null  object 
 11  i5_health_3         28825 non-null  object 
 12  i5_health_4         28825 non-null  object 
 13  i5_health_5         28825 non-null  object 
 14  i5_health_99        28825 non-null  object 
 15  i5a_health          28825 non-null  object 
 16  i6_h

In [64]:
# NOTE(review): in the visible notebook df_final is first assigned in the
# "Filter out only the patients with confirmed results" cell further down, so on
# a fresh kernel (Restart & Run All) this display raises NameError. Consider
# removing this cell or moving it below that filter.
df_final

Unnamed: 0,i1_health,i2_health,i7a_health,i3_health,i4_health,i5_health_1,i5_health_2,i5_health_3,i5_health_4,i5_health_5,...,d1_health_13,d1_health_98,d1_health_99,weight,gender,age,region_state,household_size,household_children,employment_status
103,,,,,,,,,,,...,,,,,,,,,,
110,,,,,,,,,,,...,,,,,,,,,,
127,,,,,,,,,,,...,,,,,,,,,,
128,,,,,,,,,,,...,,,,,,,,,,
134,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28789,,,,,,,,,,,...,,,,,,,,,,
28805,,,,,,,,,,,...,,,,,,,,,,
28811,,,,,,,,,,,...,,,,,,,,,,
28814,,,,,,,,,,,...,,,,,,,,,,


In [65]:
# Exclude non-related features.
# `df` is reassigned from itself, so without errors="ignore" a second run of this
# cell would raise KeyError (the columns are already gone). With it, the cell is
# idempotent and can be re-executed safely.
df = df.drop(columns=["Index", "RecordNo", "endtime", "qweek"], errors="ignore")

In [75]:
# Display the frame after the Index / RecordNo / endtime / qweek columns were dropped
df

Unnamed: 0,i1_health,i2_health,i7a_health,i3_health,i4_health,i5_health_1,i5_health_2,i5_health_3,i5_health_4,i5_health_5,...,d1_health_13,d1_health_98,d1_health_99,weight,gender,age,region_state,household_size,household_children,employment_status
0,0,1,1,"No, I have not","No, they have not",No,No,No,No,No,...,No,No,No,1.036370,Male,73,British Columbia / Colombie Britanique,1,0,Retired
1,0,6,0,"No, I have not","No, they have not",Yes,No,No,No,No,...,No,No,Yes,1.016961,Female,37,British Columbia / Colombie Britanique,1,0,Unemployed
2,6,1,0,"No, I have not","No, they have not",No,No,No,No,Yes,...,No,No,No,1.003516,Male,25,Ontario,8 or more,2,Full time employment
3,1,1,1,"No, I have not","No, they have not",No,No,No,No,No,...,No,No,No,1.002949,Male,66,Ontario,2,0,Retired
4,1,3,1,"No, I have not","No, they have not",No,No,No,No,No,...,No,No,Yes,1.028216,Female,25,British Columbia / Colombie Britanique,2,0,Unemployed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28820,2,1,1,"No, I have not","No, they have not",No,No,No,No,No,...,No,No,Yes,1.335790,Female,27,Oklahoma,3,2,Full time employment
28821,1,1,0,"No, I have not","No, they have not",No,No,No,No,No,...,No,No,No,1.413918,Female,74,California,1,0,Retired
28822,4,0,0,"No, I have not","No, they have not",No,No,No,No,No,...,No,No,Yes,1.044810,Female,26,Rhode Island,4,2,Full time employment
28823,1,0,1,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,2.919237,Male,22,Mississippi,5,0,Full time employment


In [109]:
# Keep only the rows whose own test result is definite (negative or positive);
# every other i3_health answer is filtered out.
df_final = df[
    df["i3_health"].isin(
        ["Yes, and I tested negative", "Yes, and I tested positive"]
    )
]
df_final

Unnamed: 0,i1_health,i2_health,i7a_health,i3_health,i4_health,i5_health_1,i5_health_2,i5_health_3,i5_health_4,i5_health_5,...,d1_health_13,d1_health_98,d1_health_99,weight,gender,age,region_state,household_size,household_children,employment_status
103,1,1,2,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,1.007474,Male,54,Quebec / Québec,Don't know,1,Full time employment
110,2,5,2,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,Yes,No,...,No,No,No,1.012842,Male,34,Quebec / Québec,1,1,Unemployed
127,3,15,3,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,0.992531,Male,36,Ontario,4,4,Full time employment
128,10,10,1,"Yes, and I tested positive","No, they have not",No,No,No,No,No,...,No,No,No,0.994496,Female,64,Ontario,2,0,Not working
134,2,10,1,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,0.984166,Female,36,Ontario,3,1,Full time employment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28789,2,2,2,"Yes, and I tested negative","Yes, and they tested negative",Yes,No,No,No,Yes,...,No,No,No,0.683871,Male,26,Washington,5,3,Full time student
28805,5,4,2,"Yes, and I tested positive","Yes, and they tested positive",No,Yes,No,No,No,...,No,No,No,0.566747,Male,20,New York,6,3,Full time employment
28811,0,2,2,"Yes, and I tested negative","Yes, and they tested negative",Yes,No,No,Yes,No,...,No,No,No,0.744511,Female,23,Ohio,4,2,Full time employment
28814,1,2,2,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,1.302322,Male,18,New York,5,2,Part time employment


In [110]:
# Re-display the filtered frame (same expression and same rows as the end of the
# previous cell) — candidate for removal.
df_final

Unnamed: 0,i1_health,i2_health,i7a_health,i3_health,i4_health,i5_health_1,i5_health_2,i5_health_3,i5_health_4,i5_health_5,...,d1_health_13,d1_health_98,d1_health_99,weight,gender,age,region_state,household_size,household_children,employment_status
103,1,1,2,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,1.007474,Male,54,Quebec / Québec,Don't know,1,Full time employment
110,2,5,2,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,Yes,No,...,No,No,No,1.012842,Male,34,Quebec / Québec,1,1,Unemployed
127,3,15,3,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,0.992531,Male,36,Ontario,4,4,Full time employment
128,10,10,1,"Yes, and I tested positive","No, they have not",No,No,No,No,No,...,No,No,No,0.994496,Female,64,Ontario,2,0,Not working
134,2,10,1,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,0.984166,Female,36,Ontario,3,1,Full time employment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28789,2,2,2,"Yes, and I tested negative","Yes, and they tested negative",Yes,No,No,No,Yes,...,No,No,No,0.683871,Male,26,Washington,5,3,Full time student
28805,5,4,2,"Yes, and I tested positive","Yes, and they tested positive",No,Yes,No,No,No,...,No,No,No,0.566747,Male,20,New York,6,3,Full time employment
28811,0,2,2,"Yes, and I tested negative","Yes, and they tested negative",Yes,No,No,Yes,No,...,No,No,No,0.744511,Female,23,Ohio,4,2,Full time employment
28814,1,2,2,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,1.302322,Male,18,New York,5,2,Part time employment


In [111]:
# NOTE(review): enc_dict is created in the next cell ("Data cleaning"); on a
# fresh kernel this display runs before the definition and raises NameError.
enc_dict

{'Yes': 1, 'No': 0}

In [112]:
# Data cleaning: lookup table that turns "Yes"/"No" answers into 1/0.
# Blank " " answers are not mapped here (that entry was left disabled).
enc_dict = {"Yes": 1, "No": 0}

In [113]:
# Column names of the filtered frame, in order; the encoding loop iterates over them
df_col = list(df_final.columns)

In [114]:
# Column dtypes and non-null counts for the filtered rows
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1795 entries, 103 to 28823
Data columns (total 75 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   i1_health           1795 non-null   int64  
 1   i2_health           1795 non-null   int64  
 2   i7a_health          1795 non-null   int64  
 3   i3_health           1795 non-null   object 
 4   i4_health           1795 non-null   object 
 5   i5_health_1         1795 non-null   object 
 6   i5_health_2         1795 non-null   object 
 7   i5_health_3         1795 non-null   object 
 8   i5_health_4         1795 non-null   object 
 9   i5_health_5         1795 non-null   object 
 10  i5_health_99        1795 non-null   object 
 11  i5a_health          1795 non-null   object 
 12  i6_health           1795 non-null   object 
 13  i7b_health          1795 non-null   object 
 14  i8_health           1795 non-null   object 
 15  i9_health           1795 non-null   object 
 16  i10

In [115]:
# Dry run of the Yes/No encoding on a single column: answers found in enc_dict
# are swapped for their code, anything else passes through unchanged. The result
# is only tallied here, not stored.
df_final['i5_health_1'].apply(lambda answer: enc_dict.get(answer, answer)).value_counts()

0    1489
1     247
       59
Name: i5_health_1, dtype: int64

In [116]:
# Confirm df_final itself is unchanged: the encoding preview above was not
# assigned back, so the Yes/No text is still present.
df_final

Unnamed: 0,i1_health,i2_health,i7a_health,i3_health,i4_health,i5_health_1,i5_health_2,i5_health_3,i5_health_4,i5_health_5,...,d1_health_13,d1_health_98,d1_health_99,weight,gender,age,region_state,household_size,household_children,employment_status
103,1,1,2,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,1.007474,Male,54,Quebec / Québec,Don't know,1,Full time employment
110,2,5,2,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,Yes,No,...,No,No,No,1.012842,Male,34,Quebec / Québec,1,1,Unemployed
127,3,15,3,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,0.992531,Male,36,Ontario,4,4,Full time employment
128,10,10,1,"Yes, and I tested positive","No, they have not",No,No,No,No,No,...,No,No,No,0.994496,Female,64,Ontario,2,0,Not working
134,2,10,1,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,0.984166,Female,36,Ontario,3,1,Full time employment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28789,2,2,2,"Yes, and I tested negative","Yes, and they tested negative",Yes,No,No,No,Yes,...,No,No,No,0.683871,Male,26,Washington,5,3,Full time student
28805,5,4,2,"Yes, and I tested positive","Yes, and they tested positive",No,Yes,No,No,No,...,No,No,No,0.566747,Male,20,New York,6,3,Full time employment
28811,0,2,2,"Yes, and I tested negative","Yes, and they tested negative",Yes,No,No,Yes,No,...,No,No,No,0.744511,Female,23,Ohio,4,2,Full time employment
28814,1,2,2,"Yes, and I tested negative","Yes, and they tested negative",No,No,No,No,No,...,No,No,Yes,1.302322,Male,18,New York,5,2,Part time employment


In [122]:
# Renumber the filtered rows from 0 (the old index is discarded) and start the
# cleaned frame from scratch so the encoding loop can fill it column by column.
df_final = df_final.reset_index(drop=True)
df_clean = pd.DataFrame()

In [118]:
# Same single-column encoding preview, shown row by row this time
df_final["i5_health_1"].apply(lambda answer: enc_dict.get(answer, answer))

0       0
1       0
2       0
3       0
4       0
       ..
1790    1
1791    0
1792    1
1793    0
1794    0
Name: i5_health_1, Length: 1795, dtype: object

In [120]:
# Scratch check: store the encoded column in df_clean under the name 'test'
# and display the frame
df_clean['test'] = df_final["i5_health_1"].apply(lambda answer: enc_dict.get(answer, answer))
df_clean

Unnamed: 0,test
0,0
1,0
2,0
3,0
4,0
...,...
1790,1
1791,0
1792,1
1793,0


In [123]:
# Encode every column into df_clean: "Yes"/"No" become 1/0 via enc_dict, all
# other values are copied over unchanged. Each column name is printed as a
# progress trace.
for col in df_col:
    print(col)
    df_clean[col] = df_final[col].apply(lambda answer: enc_dict.get(answer, answer))

i1_health
i2_health
i7a_health
i3_health
i4_health
i5_health_1
i5_health_2
i5_health_3
i5_health_4
i5_health_5
i5_health_99
i5a_health
i6_health
i7b_health
i8_health
i9_health
i10_health
i11_health
i12_health_1
i12_health_2
i12_health_3
i12_health_4
i12_health_5
i12_health_6
i12_health_7
i12_health_8
i12_health_9
i12_health_10
i12_health_11
i12_health_12
i12_health_13
i12_health_14
i12_health_15
i12_health_16
i12_health_17
i12_health_18
i12_health_19
i12_health_20
i13_health
i14_health_1
i14_health_2
i14_health_3
i14_health_4
i14_health_5
i14_health_6
i14_health_7
i14_health_8
i14_health_9
i14_health_10
i14_health_96
i14_health_98
i14_health_99
i14_health_other
d1_health_1
d1_health_2
d1_health_3
d1_health_4
d1_health_5
d1_health_6
d1_health_7
d1_health_8
d1_health_9
d1_health_10
d1_health_11
d1_health_12
d1_health_13
d1_health_98
d1_health_99
weight
gender
age
region_state
household_size
household_children
employment_status


In [154]:
# Inspect a column that is not Yes/No: it holds frequency labels
# ("Always" ... "Not at all"), which enc_dict leaves untouched.
df_clean['i12_health_2'].value_counts()

Always        1067
Frequently     406
Sometimes      193
Rarely          87
Not at all      42
Name: i12_health_2, dtype: int64

In [None]:
# Columns to revisit: 'i3_health', 'i4_health', 'i6_health', 'i10_health', 'i11_health', 'i12_health_1', ...
# Columns whose values are 1, 0 or 'not sure': 'i5a_health', 'i8_health', 'i9_health'

In [125]:
# dtypes after the Yes/No encoding; blank " " answers are still strings at this
# point, so they are counted as non-null here.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1795 entries, 0 to 1794
Data columns (total 75 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   i1_health           1795 non-null   int64  
 1   i2_health           1795 non-null   int64  
 2   i7a_health          1795 non-null   int64  
 3   i3_health           1795 non-null   object 
 4   i4_health           1795 non-null   object 
 5   i5_health_1         1795 non-null   object 
 6   i5_health_2         1795 non-null   object 
 7   i5_health_3         1795 non-null   object 
 8   i5_health_4         1795 non-null   object 
 9   i5_health_5         1795 non-null   object 
 10  i5_health_99        1795 non-null   object 
 11  i5a_health          1795 non-null   object 
 12  i6_health           1795 non-null   object 
 13  i7b_health          1795 non-null   object 
 14  i8_health           1795 non-null   object 
 15  i9_health           1795 non-null   object 
 16  i10_he

In [130]:
# Treat single-space answers as missing so that .info() reports the real
# non-null counts per column
df_clean = df_clean.replace(to_replace=" ", value=np.nan)
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1795 entries, 0 to 1794
Data columns (total 75 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   i1_health           1795 non-null   int64  
 1   i2_health           1795 non-null   int64  
 2   i7a_health          1795 non-null   int64  
 3   i3_health           1795 non-null   object 
 4   i4_health           1701 non-null   object 
 5   i5_health_1         1736 non-null   float64
 6   i5_health_2         1736 non-null   float64
 7   i5_health_3         1736 non-null   float64
 8   i5_health_4         1736 non-null   float64
 9   i5_health_5         1736 non-null   float64
 10  i5_health_99        1736 non-null   float64
 11  i5a_health          669 non-null    object 
 12  i6_health           669 non-null    object 
 13  i7b_health          574 non-null    float64
 14  i8_health           578 non-null    object 
 15  i9_health           1657 non-null   object 
 16  i10_he

# Problems to handle

## 1. How to proceed with the data cleaning (1-yes, 2-no, 3-not sure...?)

## 2. What about other columns with different text value (one-hot encoding?)

## 3. Are the selection of features and target ok?

## 4. What is the threshold for determining overfitting for KNN regression? (PCA needed?)


In [None]:
# Determine the columns for X and the column for y.
# Both are built from df_clean (the encoded frame) rather than df_final:
# df_final still holds raw text answers, which StandardScaler cannot fit and
# which cannot serve as a regression target.

# Target: the respondent's own test result as a number (1 = positive, 0 = negative).
y = df_clean["i3_health"].map(
    {"Yes, and I tested positive": 1, "Yes, and I tested negative": 0}
)

# Features: numeric columns only, and only those without missing values, because
# KNeighborsRegressor rejects NaN.
# TODO: one-hot encode the remaining text columns and impute missing values
# instead of dropping them (see "Problems to handle" above).
X = (
    df_clean
    .drop(columns=["i3_health"])
    .select_dtypes(include="number")
    .dropna(axis="columns")
)

In [131]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Standardize the features: learn the scaling parameters from the training
# split only, then apply them to that same split
scaler = StandardScaler()
scaler.fit(X_train)
rescaledX_train = scaler.transform(X_train)

In [156]:
# Candidate neighbor counts for the search: k = 1 .. 24
k_values = np.arange(1, 25, 1)

In [None]:
# Hyper-parameter grid for the search: every candidate k is tried as n_neighbors
param_grid = {"n_neighbors": k_values}

In [None]:
# Create the KNN regressor (n_neighbors is chosen later by the grid search) and
# a 10-fold shuffled splitter with a fixed seed so the folds are repeatable.
# Fix: the class name was misspelled "KNeightborsRegressor", which raised NameError.
model = KNeighborsRegressor()
kfd = KFold(n_splits = 10, shuffle = True, random_state = 1)

In [None]:
# Exhaustive search over param_grid, scored by negated MSE (higher is better)
# and cross-validated with the folds from kfd
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kfd,
)

In [None]:
# Run the grid search on the scaled training data. fit() returns the fitted
# search object itself, so grid_result is another name for grid.
grid.fit(rescaledX_train, y_train)
grid_result = grid

In [None]:
# Report the best cross-validated score and the parameters that produced it
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

In [None]:
# Pull the per-candidate CV statistics out of the search results and list them:
# mean test score, its standard deviation, and the parameter setting
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for avg, spread, setting in zip(means, stds, params):
    print("%f (%f) with: %r" % (avg, spread, setting))

In [155]:
# Plot the mean CV score against k to see where it peaks
fig, ax = plt.subplots()
ax.plot(k_values, means)
ax.set_xlabel("Number of Neighbors K")
ax.set_ylabel("Negative Mean Squared Error")
plt.show()

In [None]:
# With the optimal value for K known, refit the scaler on the training features
# and rescale them.
# Fix: "StandardScale" was a typo for StandardScaler and raised NameError.
scaler = StandardScaler().fit(X_train)

rescaledX_train = scaler.transform(X_train)

In [None]:
# Build the final model with the best number of neighbors found by the grid
# search. This replaces the undefined placeholder TBD, which raised NameError.
model = KNeighborsRegressor(n_neighbors = grid_result.best_params_["n_neighbors"])

In [None]:
# Train the final KNN model on the scaled training features and their targets.
model.fit(rescaledX_train, y_train)

In [None]:
# Scale the test features with the scaler that was fitted on the training data
# (transform only — the scaler is not refit on the test split).
rescaledX_test = scaler.transform(X_test)

In [None]:
# Predict the target for the scaled test rows using the trained model
estimates = model.predict(rescaledX_test)

In [None]:
# Check how good the model is on the held-out split: mean squared error between
# the true test targets and the predictions (lower is better).
mean_squared_error(y_test, estimates)