Importing Libraries & Loading Dataset

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Load the dataset
dataset_path = '/content/anemia dataset - Sheet2(1).csv'
df = pd.read_csv(dataset_path)

# Display the first few rows of the dataset
print(df.head())


        UHID    LABID  AGE GENDER   OLD   NEW  DURATION
0   30303481  3291615   45      M  7.81   8.9        32
1   30303481  3291615   45      M  8.90   9.4        15
2  302248884  3291559   47      M  7.20   8.1        28
3  302248884  3291559   47      M  8.10   9.2        32
4   30193371  3291238   61      M  9.20  10.0        22


Data Preprocessing

In [2]:
# Convert 'GENDER' to numerical representation (M: 0, F: 1)
df['GENDER'] = df['GENDER'].map({'M': 0, 'F': 1})

# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values
df = df.dropna()

print(df.head())

UHID        0
LABID       0
AGE         0
GENDER      2
OLD         0
NEW         0
DURATION    0
dtype: int64
        UHID    LABID  AGE  GENDER   OLD   NEW  DURATION
0   30303481  3291615   45     0.0  7.81   8.9        32
1   30303481  3291615   45     0.0  8.90   9.4        15
2  302248884  3291559   47     0.0  7.20   8.1        28
3  302248884  3291559   47     0.0  8.10   9.2        32
4   30193371  3291238   61     0.0  9.20  10.0        22


In [3]:
# Display data types of each column
print(df.dtypes)

UHID          int64
LABID         int64
AGE           int64
GENDER      float64
OLD         float64
NEW         float64
DURATION      int64
dtype: object


Categorical Variable Handling

In [4]:
# Convert 'GENDER' to one-hot encoding
df = pd.get_dummies(df, columns=['GENDER'], drop_first=True)

Scaling Features

In [None]:
# from sklearn.preprocessing import StandardScaler

# # scale 'AGE', 'OLD', 'NEW', and 'DURATION'
# columns_to_scale = ['AGE', 'OLD', 'NEW', 'DURATION']

# scaler = StandardScaler()
# df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])


Outlier Detection and Removal

In [None]:
# from scipy.stats import zscore
# # Remove outliers using z-score
# z_scores = np.abs(zscore(df[columns_to_scale]))
# df_no_outliers = df[(z_scores < 3).all(axis=1)]


Handling Missing Values

In [None]:
# Impute missing values with mean or median
df.fillna(df.mean(), inplace=True)


Final Check and Summary

In [5]:
# Check for any remaining missing values
print(df.isnull().sum())

# Display summary
print(df.info())


UHID          0
LABID         0
AGE           0
OLD           0
NEW           0
DURATION      0
GENDER_1.0    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 341 entries, 0 to 342
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   UHID        341 non-null    int64  
 1   LABID       341 non-null    int64  
 2   AGE         341 non-null    int64  
 3   OLD         341 non-null    float64
 4   NEW         341 non-null    float64
 5   DURATION    341 non-null    int64  
 6   GENDER_1.0  341 non-null    bool   
dtypes: bool(1), float64(2), int64(4)
memory usage: 19.0 KB
None


Feature Engineering

In [6]:
# Identify non-numeric values in the 'NEW' column
non_numeric_new = df['NEW'][pd.to_numeric(df['NEW'], errors='coerce').isnull()]

# Display the rows with non-numeric 'NEW' values
print("Rows with non-numeric 'NEW' values:")
print(df.loc[non_numeric_new.index])

# Convert 'NEW' to numeric, replacing non-numeric values with NaN
df['NEW'] = pd.to_numeric(df['NEW'], errors='coerce')

# Check for missing values after conversion
print(df.isnull().sum())

# Calculate the rate of change in hemoglobin levels
df['HEMO_CHANGE_RATE'] = (df['NEW'] - df['OLD']) / df['DURATION']

# Display the dataset with the new feature
print(df.head())


Rows with non-numeric 'NEW' values:
Empty DataFrame
Columns: [UHID, LABID, AGE, OLD, NEW, DURATION, GENDER_1.0]
Index: []
UHID          0
LABID         0
AGE           0
OLD           0
NEW           0
DURATION      0
GENDER_1.0    0
dtype: int64
        UHID    LABID  AGE   OLD   NEW  DURATION  GENDER_1.0  HEMO_CHANGE_RATE
0   30303481  3291615   45  7.81   8.9        32       False          0.034063
1   30303481  3291615   45  8.90   9.4        15       False          0.033333
2  302248884  3291559   47  7.20   8.1        28       False          0.032143
3  302248884  3291559   47  8.10   9.2        32       False          0.034375
4   30193371  3291238   61  9.20  10.0        22       False          0.036364


Exploratory Data Analysis (EDA)

Machine Learning Model

Random forest

In [None]:
# # Randomized Search for Hyperparameter Tuning
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# X = df.drop(['HEMO_CHANGE_RATE'], axis=1)
# y = df['HEMO_CHANGE_RATE']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# param_dist = {'max_depth': [5, 10, 15, 20], 'min_samples_split': [2, 5, 10], 'n_estimators': [50, 100, 150, 200]}
# random_search = RandomizedSearchCV(RandomForestRegressor(), param_distributions=param_dist, n_iter=10, cv=5)
# random_search.fit(X, y)

# # Best Hyperparameters
# best_params_rf = random_search.best_params_

# # Model with Best Hyperparameters
# rf_model = RandomForestRegressor(**best_params_rf)
# rf_model.fit(X, y)

# # Evaluation
# rf_score = rf_model.score(X_test, y_test)
# print(f'Model Score (Random Forest): {rf_score}')


Linear Regression

In [7]:
# Initialize Linear Regression model
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X = df.drop(['NEW'], axis=1)
y = df['NEW']
lr_model = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the model
lr_model.fit(X_train, y_train)

# Make predictions
y_pred = lr_model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
#mse = mean_squared_error(y_test, y_pred)
#mae = mean_absolute_error(y_test, y_pred)

print("Linear Regression Model Evaluation:")
print(f"R^2 Score: {r2:.4f}")
#print(f"Mean Squared Error: {mse:.4f}")
#print(f"Mean Absolute Error: {mae:.4f}")

# Combine predictions with input features and true labels
#results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Exporting results to Excel
#excel_file_path = 'linear_regression_prediction_results.xlsx'
#results_df.to_excel(excel_file_path, index=False)

#print("Prediction results exported to Excel successfully.")

Linear Regression Model Evaluation:
R^2 Score: 0.9979


gradient boosting

In [None]:
# from sklearn.model_selection import train_test_split
# #from sklearn.linear_model import LinearRegression, Ridge, Lasso
# #from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
# from sklearn.metrics import r2_score, mean_squared_error

# # Assuming 'HEMO_CHANGE_RATE' is your target variable
# X = df.drop(['HEMO_CHANGE_RATE'], axis=1)
# y = df['HEMO_CHANGE_RATE']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# models = {
#     #'Decision Tree': DecisionTreeRegressor(),
#     #'Random Forest': RandomForestRegressor(),
#     'Gradient Boosting': GradientBoostingRegressor(),
#     #'Extra Trees': ExtraTreesRegressor()  # Add Extra Trees Regressor
# }

# # Train and evaluate each model
# for name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     r2 = r2_score(y_test, y_pred)
#     #mse = mean_squared_error(y_test, y_pred)

#     print(f"Model: {name}")
#     print(f"R^2 Score: {r2:.4f}")
#     #print(f"Mean Squared Error: {mse:.4f}")





Descision Tree

In [None]:
# from sklearn.model_selection import train_test_split

# from sklearn.tree import DecisionTreeRegressor

# from sklearn.metrics import r2_score, mean_squared_error

# # Assuming 'HEMO_CHANGE_RATE' is your target variable
# X = df.drop(['HEMO_CHANGE_RATE'], axis=1)
# y = df['HEMO_CHANGE_RATE']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# models = {
#     'Decision Tree': DecisionTreeRegressor(),

# }

# # Train and evaluate each model
# for name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     r2 = r2_score(y_test, y_pred)


#     print(f"Model: {name}")
#     print(f"R^2 Score: {r2:.4f}")



Extra TREES

In [None]:
# from sklearn.model_selection import train_test_split


# from sklearn.ensemble import  ExtraTreesRegressor
# from sklearn.metrics import r2_score, mean_squared_error

# # Assuming 'HEMO_CHANGE_RATE' is your target variable
# X = df.drop(['HEMO_CHANGE_RATE'], axis=1)
# y = df['HEMO_CHANGE_RATE']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# models = {

#     'Extra Trees': ExtraTreesRegressor()  # Add Extra Trees Regressor
# }

# # Train and evaluate each model
# for name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     r2 = r2_score(y_test, y_pred)


#     print(f"Model: {name}")
#     print(f"R^2 Score: {r2:.4f}")



model1

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
# from sklearn.metrics import r2_score, mean_squared_error

# # Assuming 'HEMO_CHANGE_RATE' is your target variable
# X = df.drop(['HEMO_CHANGE_RATE'], axis=1)
# y = df['HEMO_CHANGE_RATE']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# models = {
#     'Decision Tree': DecisionTreeRegressor(),
#     'Random Forest': RandomForestRegressor(),
#     'Gradient Boosting': GradientBoostingRegressor(),
#     'Extra Trees': ExtraTreesRegressor()  # Add Extra Trees Regressor
# }

# # Train and evaluate each model
# for name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     r2 = r2_score(y_test, y_pred)
#     mse = mean_squared_error(y_test, y_pred)

#     print(f"Model: {name}")
#     print(f"R^2 Score: {r2:.4f}")
#     print(f"Mean Squared Error: {mse:.4f}")


model2

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
# from sklearn.metrics import r2_score, mean_squared_error

# # Assuming 'HEMO_CHANGE_RATE' is your target variable
# X = df.drop(['NEW'], axis=1)
# y = df['NEW']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# models = {
#     'Linear Regression': LinearRegression(),
#     'Ridge Regression': Ridge(),
#     'Lasso Regression': Lasso(),
#     'Decision Tree': DecisionTreeRegressor(),
#     'Random Forest': RandomForestRegressor(),
#     'Gradient Boosting': GradientBoostingRegressor(),
#     'Extra Trees': ExtraTreesRegressor()  # Add Extra Trees Regressor
# }

# # Train and evaluate each model
# for name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     r2 = r2_score(y_test, y_pred)
#     mse = mean_squared_error(y_test, y_pred)

#     print(f"Model: {name}")
#     print(f"R^2 Score: {r2:.4f}")
#     print(f"Mean Squared Error: {mse:.4f}")

Model: Linear Regression
R^2 Score: 0.9979
Mean Squared Error: 0.0037
Model: Ridge Regression
R^2 Score: 0.9880
Mean Squared Error: 0.0215
Model: Lasso Regression
R^2 Score: 0.6219
Mean Squared Error: 0.6763
Model: Decision Tree
R^2 Score: 0.9690
Mean Squared Error: 0.0554


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Model: Random Forest
R^2 Score: 0.9883
Mean Squared Error: 0.0208
Model: Gradient Boosting
R^2 Score: 0.9892
Mean Squared Error: 0.0193
Model: Extra Trees
R^2 Score: 0.9934
Mean Squared Error: 0.0118


In [8]:
import joblib

# Save the model
joblib.dump(lr_model, 'trained_model_nlr1.pkl')


['trained_model_nlr1.pkl']

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
model_file_path = '/content/trained_model_nlr1.pkl'

In [11]:
joblib.dump(lr_model, model_file_path)

['/content/trained_model_nlr1.pkl']