In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.ensemble import RandomForestClassifier as clf

In [6]:
from sklearn.ensemble import RandomForestRegressor

In [7]:
# Getting Data Ready
# 1- Split data into features and label (define dependent and independent variables)
# 2- Fill missing values or data
# 3- Converting data types

In [8]:
data = pd.read_csv('Heart Attack Data.csv')
data

Unnamed: 0,age,sex,total_cholesterol,ldl,hdl,systolic_bp,diastolic_bp,smoking,diabetes,heart_attack
0,57,1,229.463642,175.879129,39.225687,124.070127,91.378780,0,0,0
1,58,1,186.464120,128.984916,34.950968,95.492552,64.355040,1,0,0
2,37,1,251.300719,152.347592,45.913288,99.519335,64.953147,0,1,0
3,55,1,192.058908,116.803684,67.208925,122.460002,73.821382,0,0,0
4,53,1,151.203448,107.017396,60.693838,123.022257,81.121946,0,1,0
...,...,...,...,...,...,...,...,...,...,...
995,65,1,195.336429,149.070951,43.914928,132.878440,86.246414,0,0,0
996,60,1,192.342928,134.357395,53.380714,145.149535,91.069141,0,0,0
997,70,0,174.179319,125.900047,50.406918,133.022878,85.851330,0,0,0
998,48,0,189.715685,152.388737,40.700912,113.527965,86.368294,0,0,0


In [9]:
# 1st variable
x = data.drop('diabetes', axis = 1)
x

Unnamed: 0,age,sex,total_cholesterol,ldl,hdl,systolic_bp,diastolic_bp,smoking,heart_attack
0,57,1,229.463642,175.879129,39.225687,124.070127,91.378780,0,0
1,58,1,186.464120,128.984916,34.950968,95.492552,64.355040,1,0
2,37,1,251.300719,152.347592,45.913288,99.519335,64.953147,0,0
3,55,1,192.058908,116.803684,67.208925,122.460002,73.821382,0,0
4,53,1,151.203448,107.017396,60.693838,123.022257,81.121946,0,0
...,...,...,...,...,...,...,...,...,...
995,65,1,195.336429,149.070951,43.914928,132.878440,86.246414,0,0
996,60,1,192.342928,134.357395,53.380714,145.149535,91.069141,0,0
997,70,0,174.179319,125.900047,50.406918,133.022878,85.851330,0,0
998,48,0,189.715685,152.388737,40.700912,113.527965,86.368294,0,0


In [10]:
# 2nd variable
y = data['diabetes']
y

0      0
1      0
2      1
3      0
4      1
      ..
995    0
996    0
997    0
998    0
999    0
Name: diabetes, Length: 1000, dtype: int64

In [11]:
# Split the data

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25)

In [13]:
 x_train.shape, y_train.shape, x_test.shape, y_test.shape    # Starting 25% of data (in this starting 250 rows) is use for testing data

((750, 9), (750,), (250, 9), (250,))

In [14]:
len(data)          # Total Data 1000 = Train data 750 + Test data 250

1000

In [15]:
# Converting Data type (we need int data type to perform operations on it)

In [16]:
test_data = pd.read_csv('Covid Data.csv')
test_data

Unnamed: 0,Country,Other names,ISO 3166-1 alpha-3 CODE,Population,Continent,Total Cases,Total Deaths,Death percentage
0,Afghanistan,Afghanistan,AFG,40462186,Asia,177827,7671,4.313743
1,Albania,Albania,ALB,2872296,Europe,273870,3492,1.275058
2,Algeria,Algeria,DZA,45236699,Africa,265691,6874,2.587216
3,Andorra,Andorra,AND,77481,Europe,40024,153,0.382271
4,Angola,Angola,AGO,34654212,Africa,99194,1900,1.915438
...,...,...,...,...,...,...,...,...
220,Wallis and Futuna,Wallis and Futuna Islands,WLF,10894,Oceania,454,7,1.541850
221,Western Sahara,Western Sahara,ESHÂ,623031,Africa,10,1,10.000000
222,Yemen,Yemen,YEM,30975258,Asia,11806,2143,18.151787
223,Zambia,Zambia,ZMB,19284482,Africa,317076,3967,1.251120


In [17]:
test_data.dtypes

Country                     object
Other names                 object
ISO 3166-1 alpha-3 CODE     object
Population                   int64
Continent                   object
Total Cases                  int64
Total Deaths                 int64
Death percentage           float64
dtype: object

In [18]:
test_data['Death percentage'] = test_data['Death percentage'].round().astype(int)        # convert the float to int

In [19]:
test_data.dtypes       # Now the data type of Death percentage is int

Country                    object
Other names                object
ISO 3166-1 alpha-3 CODE    object
Population                  int64
Continent                  object
Total Cases                 int64
Total Deaths                int64
Death percentage            int64
dtype: object

In [20]:
x = test_data.drop('Death percentage', axis = 1)
y = test_data['Death percentage']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [21]:
covid_model = RandomForestRegressor()
# covid_model.fit(x_train, y_train)
# covid_model.score(x_test, y_test)   # could not convert string to float: 'Venezuela' >>> Shows this error

In [22]:
# We need integer data to perform action on it. 
# Transfromation Method

In [23]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [24]:
# 1) Hottie Method
string_col = ['Country', 'Other names', 'ISO 3166-1 alpha-3 CODE', 'Continent']
hottie = OneHotEncoder()

transformer = ColumnTransformer([('One Hottie', hottie, string_col )], remainder = 'passthrough')
transform_x = transformer.fit_transform(x)
pd.DataFrame(transform_x)

Unnamed: 0,0
0,<Compressed Sparse Row sparse matrix of dtype ...
1,<Compressed Sparse Row sparse matrix of dtype ...
2,<Compressed Sparse Row sparse matrix of dtype ...
3,<Compressed Sparse Row sparse matrix of dtype ...
4,<Compressed Sparse Row sparse matrix of dtype ...
...,...
220,<Compressed Sparse Row sparse matrix of dtype ...
221,<Compressed Sparse Row sparse matrix of dtype ...
222,<Compressed Sparse Row sparse matrix of dtype ...
223,<Compressed Sparse Row sparse matrix of dtype ...


In [25]:
# 2) Transformation Method
transform_data = pd.get_dummies(test_data[['Country', 'Other names', 'ISO 3166-1 alpha-3 CODE', 'Continent']])
transform_data

Unnamed: 0,Country_Afghanistan,Country_Albania,Country_Algeria,Country_Andorra,Country_Angola,Country_Anguilla,Country_Antigua and Barbuda,Country_Argentina,Country_Armenia,Country_Aruba,...,ISO 3166-1 alpha-3 CODE_YEM,ISO 3166-1 alpha-3 CODE_ZAF,ISO 3166-1 alpha-3 CODE_ZMB,ISO 3166-1 alpha-3 CODE_ZWE,Continent_Africa,Continent_Asia,Continent_Europe,Continent_Latin America and the Caribbean,Continent_Northern America,Continent_Oceania
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
221,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
222,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,True,False,False,False,False
223,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,True,False,False,False,False,False


In [26]:
# ChatGPT METHOD
# Assuming your DataFrame is named 'x'
# Sample columns that are of object type and need conversion
object_columns = ['Country', 'Other names', 'ISO 3166-1 alpha-3 CODE', 'Continent']

# Attempt to convert object columns to numeric types (int)
# Handle errors where conversion might fail for non-numeric values (e.g., due to strings like 'N/A' or 'unknown')
for column in object_columns:
    try:
        x[column] = pd.to_numeric(x[column], errors='coerce')  # This will convert non-numeric values to NaN
    except Exception as e:
        print(f"Error converting {column}: {e}")

# After coercing, handle missing values (NaN) if necessary
x = x.fillna(0)  # Example: replace NaN values with 0

# Convert all remaining object columns (should be now numbers) to integers
x = x.astype({col: 'int64' for col in object_columns if x[col].dtype != 'int64'})

# Print the updated DataFrame with correct datatypes
print(x.dtypes)  # To check the types after conversion
print(x)  # To see the updated DataFrame


Country                    int64
Other names                int64
ISO 3166-1 alpha-3 CODE    int64
Population                 int64
Continent                  int64
Total Cases                int64
Total Deaths               int64
dtype: object
     Country  Other names  ISO 3166-1 alpha-3 CODE  Population  Continent  \
0          0            0                        0    40462186          0   
1          0            0                        0     2872296          0   
2          0            0                        0    45236699          0   
3          0            0                        0       77481          0   
4          0            0                        0    34654212          0   
..       ...          ...                      ...         ...        ...   
220        0            0                        0       10894          0   
221        0            0                        0      623031          0   
222        0            0                        0    30975258

In [27]:
# Fit the model again
x_train, x_test, y_train, y_test = train_test_split(transform_x, y, test_size = 0.3)
covid_model.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
covid_model.score(x_test, y_test)

0.13049973901098877

In [29]:
# Handling the missing values

In [72]:
# 1- Using Pandas method

In [47]:
missing_data = pd.read_csv('Missing Heart Attack Data.csv')
missing_data

Unnamed: 0,age,sex,chol,Idl,hdl,bp,smoking,diabetes,heart_attack
0,57,1,229.463642,175.879129,39.225687,91.37878,0.0,0.0,0
1,58,1,186.46412,128.984916,34.950968,64.35504,1.0,0.0,0
2,37,1,,152.347592,45.913288,64.953147,0.0,1.0,0
3,55,1,192.058908,116.803684,67.208925,73.821382,0.0,,0
4,53,1,151.203449,107.017396,60.693838,81.121946,0.0,1.0,0
5,39,1,,153.880809,31.208614,79.589069,0.0,0.0,0
6,65,0,174.615666,114.029408,,85.529955,0.0,0.0,0
7,33,0,242.919402,147.951375,54.439475,77.331714,0.0,0.0,0
8,49,0,95.804359,83.304875,60.758929,77.630529,1.0,0.0,0
9,55,0,181.360943,106.011782,50.576747,87.588781,0.0,0.0,0


In [48]:
missing_data.isna().sum()      # give the information about how much values are missing in each column

age             0
sex             0
chol            4
Idl             5
hdl             2
bp              2
smoking         1
diabetes        2
heart_attack    0
dtype: int64

In [49]:
missing_data.dtypes

age               int64
sex               int64
chol            float64
Idl             float64
hdl             float64
bp              float64
smoking         float64
diabetes        float64
heart_attack      int64
dtype: object

In [86]:
# Fill the missing value
# 'chol' column
missing_data['chol'] = missing_data['chol'].fillna(0.0)     # fill the missing value with the mean of values

In [66]:
# 'Idl' column
missing_data['Idl'] = missing_data['Idl'].fillna(0.0)  

In [67]:
# 'hdl' column
missing_data['hdl'] =  missing_data['hdl'].fillna(0.1)  

In [68]:
# 'bp' column
missing_data['bp'] = missing_data['bp'].fillna(0.2)  

In [69]:
# 'smoking' column
missing_data['smoking'] = missing_data['smoking'].fillna(0.2)

In [70]:
# 'diabetes' column
missing_data['diabetes'] = missing_data['diabetes'].fillna(0.0)

In [71]:
missing_data.isna().sum() 

age             0
sex             0
chol            0
Idl             0
hdl             0
bp              0
smoking         0
diabetes        0
heart_attack    0
dtype: int64

In [94]:
# 2- Using Scilearn Method

In [95]:
missing_values = pd.read_csv('Missing Heart Attack Data.csv')
missing_values

Unnamed: 0,age,sex,chol,Idl,hdl,bp,smoking,diabetes,heart_attack
0,57,1,229.463642,175.879129,39.225687,91.37878,0.0,0.0,0
1,58,1,186.46412,128.984916,34.950968,64.35504,1.0,0.0,0
2,37,1,,152.347592,45.913288,64.953147,0.0,1.0,0
3,55,1,192.058908,116.803684,67.208925,73.821382,0.0,,0
4,53,1,151.203449,107.017396,60.693838,81.121946,0.0,1.0,0
5,39,1,,153.880809,31.208614,79.589069,0.0,0.0,0
6,65,0,174.615666,114.029408,,85.529955,0.0,0.0,0
7,33,0,242.919402,147.951375,54.439475,77.331714,0.0,0.0,0
8,49,0,95.804359,83.304875,60.758929,77.630529,1.0,0.0,0
9,55,0,181.360943,106.011782,50.576747,87.588781,0.0,0.0,0


In [96]:
missing_values.isna().sum()

age             0
sex             0
chol            4
Idl             5
hdl             2
bp              2
smoking         1
diabetes        2
heart_attack    0
dtype: int64

In [97]:
# drop the NaN value rows
missing_values = missing_values.dropna(subset = ['hdl'])   # It drops the NaN rows
missing_values.isna().sum()

age             0
sex             0
chol            4
Idl             5
hdl             0
bp              2
smoking         1
diabetes        2
heart_attack    0
dtype: int64

In [105]:
# Imputation

In [106]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Create the imputer
float_imputer = SimpleImputer(strategy='constant', fill_value=0.1)

# Define the features to impute
float_features = ['chol', 'Idl', 'hdl', 'bp', 'smoking', 'diabetes']

# Create the ColumnTransformer - NOTE: transformers must be in a list of tuples
imputer = ColumnTransformer([
    ('float_imputer', float_imputer, float_features)  # This is a tuple inside a list
])

# Fit and transform the data
filled_values = imputer.fit_transform(missing_values)
filled_values

array([[2.29463642e+02, 1.75879129e+02, 3.92256871e+01, 9.13787803e+01,
        0.00000000e+00, 0.00000000e+00],
       [1.86464120e+02, 1.28984916e+02, 3.49509676e+01, 6.43550402e+01,
        1.00000000e+00, 0.00000000e+00],
       [1.00000000e-01, 1.52347592e+02, 4.59132877e+01, 6.49531470e+01,
        0.00000000e+00, 1.00000000e+00],
       [1.92058908e+02, 1.16803684e+02, 6.72089250e+01, 7.38213818e+01,
        0.00000000e+00, 1.00000000e-01],
       [1.51203449e+02, 1.07017396e+02, 6.06938379e+01, 8.11219458e+01,
        0.00000000e+00, 1.00000000e+00],
       [1.00000000e-01, 1.53880809e+02, 3.12086140e+01, 7.95890690e+01,
        0.00000000e+00, 0.00000000e+00],
       [2.42919402e+02, 1.47951375e+02, 5.44394748e+01, 7.73317143e+01,
        0.00000000e+00, 0.00000000e+00],
       [9.58043591e+01, 8.33048749e+01, 6.07589289e+01, 7.76305293e+01,
        1.00000000e+00, 0.00000000e+00],
       [1.81360943e+02, 1.06011782e+02, 5.05767466e+01, 8.75887813e+01,
        0.00000000e+00, 

In [107]:
orgainzed_data = pd.DataFrame(filled_values, columns= ['chol', 'Idl', 'hdl', 'bp', 'smoking', 'diabetes'])
orgainzed_data

Unnamed: 0,chol,Idl,hdl,bp,smoking,diabetes
0,229.463642,175.879129,39.225687,91.37878,0.0,0.0
1,186.46412,128.984916,34.950968,64.35504,1.0,0.0
2,0.1,152.347592,45.913288,64.953147,0.0,1.0
3,192.058908,116.803684,67.208925,73.821382,0.0,0.1
4,151.203449,107.017396,60.693838,81.121946,0.0,1.0
5,0.1,153.880809,31.208614,79.589069,0.0,0.0
6,242.919402,147.951375,54.439475,77.331714,0.0,0.0
7,95.804359,83.304875,60.758929,77.630529,1.0,0.0
8,181.360943,106.011782,50.576747,87.588781,0.0,0.0
9,172.488842,108.373823,51.203626,69.573816,0.0,0.1
