In [453]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import category_encoders as ce 
pd.set_option('future.no_silent_downcasting', True)

# TRAINING PART
# IMPORT CSV
hearts = pd.read_csv('train_heart.csv', sep=',')
# FILTER VALUES
chol_median = hearts.loc[hearts['Cholesterol'] != 0, 'Cholesterol'].median()
hearts['Cholesterol'].replace(0, chol_median, inplace=True)

resting_median = hearts.loc[hearts['RestingBP'] != 0, 'RestingBP'].median()
hearts['RestingBP'].replace(0, resting_median, inplace=True)

# # DROP UNNECESSARY COLS AND FIND y
X = hearts.drop(['id', 'HeartDisease'], axis=1)
y = hearts['HeartDisease']


# # ENCODE X
encoder = ce.OrdinalEncoder(cols=X.columns)
X_encoded = encoder.fit_transform(X)


# # DATA SPLICING (TEST AND TRAIN)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3)

# # FIT MODEL
clf = RandomForestClassifier(n_estimators=1000, max_leaf_nodes=4)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# # Calculating the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
#update

# TESTING PART
# IMPORT CSV
test = pd.read_csv('test_heart.csv', sep=',')
#FILTER VALUES
chol_median = test.loc[test['Cholesterol'] != 0, 'Cholesterol'].median()
test['Cholesterol'].replace(0, chol_median, inplace=True)

resting_median = test.loc[test['RestingBP'] != 0, 'RestingBP'].median()
test['RestingBP'].replace(0, resting_median, inplace=True)

X_new = test.drop(['id'], axis=1)

# # ENCODE X
X_new_encoded = encoder.transform(X_new)


# # Calculating the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy in test: {accuracy}')

# # PREDICT AND PUT INTO PANDAS DATAFRAME
predictions = clf.predict(X_new_encoded)
id_to_prediction_df = pd.DataFrame({
    'id': test['id'],
    'HeartDisease': predictions
})

# # OUTPUT VIA CSV\a
file_name = './submissiontest.csv'
id_to_prediction_df.to_csv(file_name, index=False)

print(f"File saved as {file_name}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hearts['Cholesterol'].replace(0, chol_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hearts['RestingBP'].replace(0, resting_median, inplace=True)


In [None]:
print(X_encoded)

     Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0      1    1              1          1            1          1           1   
1      2    1              1          2            2          2           2   
2      3    1              1          3            3          2           1   
3      3    1              2          3            4          2           3   
4      4    1              1          4            5          1           1   
..   ...  ...            ...        ...          ...        ...         ...   
637    9    1              2          4          108          2           1   
638    6    1              1         16            3          1           3   
639    9    2              3         10          160          2           3   
640   30    1              4         24           83          1           2   
641   30    1              1         39            3          2           3   

     MaxHR  ExerciseAngina  Oldpeak  ST_Slope  
0  