### Import libraries and load data

In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier


# Read the labelled training set and the unlabelled test features from the
# notebook's working directory.
train_data = pd.read_csv("ML101_train_dataset.csv")
test_data = pd.read_csv("ML101_dataset_test_feature.csv")

# Columns the regression models are asked to predict.
target_features = ['Systolic BP', 'Diastolic BP']

# Split the training columns by dtype in one pass over `train_data.dtypes`.
# Columns whose dtype is neither int64/float64 nor object end up in neither list.
numerical_cols = [
    name for name, dtype in train_data.dtypes.items()
    if dtype in ['int64', 'float64']
]
categorical_cols = [
    name for name, dtype in train_data.dtypes.items()
    if dtype == "object"
]


### Fill NaN values using K-NN imputation

In [2]:

# Minimum number of non-missing numeric values a row must have to be kept.
MIN_NON_NULL_VALUES = 9

# Work on the numeric columns only. Drop rows with fewer than
# MIN_NON_NULL_VALUES non-missing entries, then drop rows where either
# regression target is missing. Chaining (instead of `inplace=True`) keeps
# the cell idempotent and leaves `train_data` untouched.
data = (
    train_data[numerical_cols]
    .dropna(thresh=MIN_NON_NULL_VALUES)
    .dropna(subset=target_features)
)

# Fill the remaining gaps from the 5 nearest rows.
imputer = KNNImputer(n_neighbors=5)
imputer.fit(data)

# `transform` returns a bare ndarray, so the frame has to be rebuilt.
# Re-attach the original row labels as well as the column names: without
# `index=data.index` the rows would be renumbered 0..n-1, and any later
# index-aligned join against `train_data` (e.g. attaching a label column)
# would silently pair values with the wrong rows, because rows were dropped above.
data = pd.DataFrame(
    imputer.transform(data),
    columns=data.columns,
    index=data.index,
)


### Save the imputed data for later use

In [3]:
# Keep a snapshot of the imputed frame. `.copy()` matters here: a plain
# assignment would only create a second name for the same object, so any
# later in-place change to `data` (added columns, dropped rows) would also
# change `na_filled_data`.
na_filled_data = data.copy()


### Perform linear regression

In [5]:
from sklearn.linear_model import LinearRegression

# Features are every column of the imputed frame except the two targets.
x_train = data.drop(columns=target_features)
x_train_cols = list(x_train.columns)
y_train = data[target_features]
# The test features are used as loaded (no imputation is applied to them here).
x_test = test_data[x_train_cols]


def _fit_and_predict(target_name):
    """Fit a LinearRegression on one target column and predict for ``x_test``.

    Returns a ``(fitted_model, predictions)`` pair.
    """
    model = LinearRegression()
    model.fit(x_train, y_train[target_name])
    return model, model.predict(x_test)


# Systolic BP
lr, sys_pred = _fit_and_predict('Systolic BP')
print(sys_pred)

# Diastolic BP -- after this line `lr` refers to the diastolic model.
lr, dia_pred = _fit_and_predict('Diastolic BP')
print(dia_pred)

[184.6198455  236.20606093 184.39996347 ... 182.66436659 240.36077657
 218.73457401]
[ 84.1455178  106.91911572  76.97858708 ...  81.23055905 109.15988902
 103.09938601]


### Save Predictions to CSV file

In [6]:
# One row per test sample, one column per predicted target.
predictions = pd.DataFrame({'Systolic BP': sys_pred, 'Diastolic BP': dia_pred})

# Label the rows with a 1-based index named 'ID'; it is written out as the
# first column of the CSV.
predictions.index = pd.RangeIndex(1, len(predictions) + 1, name='ID')

# Write the result to the working directory.
predictions.to_csv('kaggle.csv')

### Perform Classification of LifeStyle

In [11]:
# Build the classification training frame as a NEW object instead of editing
# `data` in place. Adding a string column to `data` and dropping rows from it
# would change what every other cell (and any other name bound to the same
# frame) sees, and would make the regression cell fail on re-run because
# 'LifeStyle' would be picked up as a feature column.
#
# NOTE(review): the label Series is aligned to `data` by index label, so this
# is only correct if `data` still carries train_data's original row labels.
# If the index was renumbered after rows were dropped, the labels are attached
# to the wrong rows -- confirm upstream.
lifestyle_train = (
    data
    .assign(LifeStyle=train_data['LifeStyle'])
    # rows without a label cannot be used to train the classifier
    .dropna(subset=['LifeStyle'])
)

X_train = lifestyle_train[x_train_cols]
Y_train = lifestyle_train["LifeStyle"]
# NOTE(review): predictions are produced for the imputed *training* rows in
# `na_filled_data`, not for `test_data` -- confirm this is the intended input.
X_test = na_filled_data[x_train_cols]

# Fixed random_state keeps the forest reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
rf.fit(X_train, Y_train)

rf_pred = rf.predict(X_test)
print(rf_pred)

predictions = pd.DataFrame({'LifeStyle': rf_pred})

# Label the rows with a 1-based index named 'ID'; it is written out as the
# first column of the CSV.
predictions.index = pd.RangeIndex(1, len(predictions) + 1, name='ID')

# save to csv
predictions.to_csv('ml101.csv')


['Bad' 'Bad' 'Bad' ... 'Bad' 'Bad' 'Bad']
