In [150]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('K Nearest Neighbors').getOrCreate()
import pandas as pd

In [151]:
# Making a list of missing value types
missing_values = ["N/A", "Unknown"]

#read csv 
df_p = pd.read_csv('healthcare-dataset-stroke-data.csv', na_values = missing_values)
df4 = df_p[['id','gender','age','hypertension','heart_disease','ever_married','work_type','Residence_type','avg_glucose_level','stroke']]


In [152]:
df_p

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [153]:
#check for missing values
df_p.isna().any()


id                   False
gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                   True
smoking_status        True
stroke               False
dtype: bool

In [154]:
#check how many missing values
df_p.isna().sum()

id                      0
gender                  0
age                     0
hypertension            0
heart_disease           0
ever_married            0
work_type               0
Residence_type          0
avg_glucose_level       0
bmi                   201
smoking_status       1544
stroke                  0
dtype: int64

In [155]:
#drop bmi column
df_p.drop(['bmi'], axis='columns', inplace=True)

In [156]:
df_p

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,formerly smoked,0


In [157]:
#change text values to numerical values
# convert each category into a binary data column by assigning a 1 or 0
from sklearn.impute import KNNImputer
cat_variables = df_p[['gender','ever_married','work_type','Residence_type','smoking_status']]

#The “gender_Female” column is dropped since the “drop_first” parameter is set as True. Similarly, there are only 4 columns for “work_type” because the 5 one has been dropped (work_type_Govt_job)
cat_dummies = pd.get_dummies(cat_variables, drop_first=True)
cat_dummies.head()

Unnamed: 0,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_never smoked,smoking_status_smokes
0,1,0,1,0,1,0,0,1,0,0
1,0,0,1,0,0,1,0,0,1,0
2,1,0,1,0,1,0,0,0,1,0
3,0,0,1,0,1,0,0,1,0,1
4,0,0,1,0,0,1,0,0,1,0


In [158]:
#drop the original ('gender','ever_married','work_type','Residence_type','smoking_status') columns from the data frame and add the dummy variables.

df_p = df_p.drop(['gender','ever_married','work_type','Residence_type','smoking_status'], axis='columns', inplace=True)
df_p = pd.concat([df_p, cat_dummies], axis='columns')
df_p.head()

Unnamed: 0,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_never smoked,smoking_status_smokes
0,1,0,1,0,1,0,0,1,0,0
1,0,0,1,0,0,1,0,0,1,0
2,1,0,1,0,1,0,0,0,1,0
3,0,0,1,0,1,0,0,1,0,1
4,0,0,1,0,0,1,0,0,1,0


In [159]:
#normalize data : scale our variables to have values between 0 and 1.

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_p = pd.DataFrame(scaler.fit_transform(df_p), columns = df_p.columns)
df_p.head()

Unnamed: 0,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_never smoked,smoking_status_smokes
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [160]:
#setting the parameter ‘n_neighbors’=5
# the missing values will be replaced by the mean value of 5 nearest neighbors measured by Euclidean distance

from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
df_p = pd.DataFrame(imputer.fit_transform(df_p),columns = df_p.columns)

In [161]:
df_p.isna().any()

gender_Male                    False
gender_Other                   False
ever_married_Yes               False
work_type_Never_worked         False
work_type_Private              False
work_type_Self-employed        False
work_type_children             False
Residence_type_Urban           False
smoking_status_never smoked    False
smoking_status_smokes          False
dtype: bool

In [162]:
#check if our dataframe has missing values 
#checking if they have been imputed as the means of k-Nearest Neighbor values)

df_p.isna().sum()

gender_Male                    0
gender_Other                   0
ever_married_Yes               0
work_type_Never_worked         0
work_type_Private              0
work_type_Self-employed        0
work_type_children             0
Residence_type_Urban           0
smoking_status_never smoked    0
smoking_status_smokes          0
dtype: int64

In [163]:
df_p

Unnamed: 0,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_never smoked,smoking_status_smokes
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
5105,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
5106,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
5107,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5108,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [164]:
#taking only the columns from smoking_status
df_smoking = df_p[['smoking_status_never smoked','smoking_status_smokes']]
df_smoking.insert(loc=0, column='smoking_status', value='0.0')

In [165]:
df_smoking

Unnamed: 0,smoking_status,smoking_status_never smoked,smoking_status_smokes
0,0.0,0.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0
...,...,...,...
5105,0.0,1.0,0.0
5106,0.0,1.0,0.0
5107,0.0,1.0,0.0
5108,0.0,0.0,0.0


In [166]:
#creating a dataframe with the column of smoking_status like the column in the given dataset
def get_smoking(row):
      for c in df_smoking.columns:
            if row[c]==1:
                return c
df_smoking = df_smoking.apply(get_smoking, axis=1)
df_smoking = pd.DataFrame(df_smoking, columns =['smoking_status'])
df_smoking['smoking_status'] = df_smoking['smoking_status'].map({'smoking_status_never smoked':'never smoked', 'smoking_status_smokes':'smokes'})
df_smoking = df_smoking.fillna('formerly smoked')
df_smoking

Unnamed: 0,smoking_status
0,formerly smoked
1,never smoked
2,never smoked
3,smokes
4,never smoked
...,...
5105,never smoked
5106,never smoked
5107,never smoked
5108,formerly smoked


In [167]:
#drom the columns bmi and smoking status and insert the new column smoking_status
#df4.drop(['bmi','smoking_status'], axis='columns', inplace=True)
df4['smoking_status'] = df_smoking['smoking_status'].values
df4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['smoking_status'] = df_smoking['smoking_status'].values


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,stroke,smoking_status
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,1,formerly smoked
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,1,never smoked
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,1,never smoked
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,1,smokes
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,1,never smoked
...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,0,never smoked
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,0,never smoked
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,0,never smoked
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,0,formerly smoked


In [168]:
#the final df (re order the columns) 
df4 = df4[['id', 'gender', 'age','hypertension','heart_disease','ever_married','work_type','Residence_type','avg_glucose_level','smoking_status','stroke']]


In [169]:
df4

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,formerly smoked,0
