## Importing Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import pickle

## Loading the Dataset

In [2]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("final_data1.csv")

# Data Analysis

### Displaying top 5 rows

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Name,Gender,Age,Height_cm,Weight_kg,BMI,Obesity_Class,Vitamin D
0,Person1,1,38,170.8511,60.023285,20.562892,1,50.998751
1,Person2,0,38,186.016225,94.263091,27.242065,2,57.555479
2,Person3,1,44,150.005719,96.332186,42.81104,3,2.057119
3,Person4,0,14,165.116629,80.368788,29.478528,2,40.41191
4,Person5,1,44,157.337795,54.155006,21.87623,1,42.221793


### Displaying bottom 5 rows

In [4]:
# Preview the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,Name,Gender,Age,Height_cm,Weight_kg,BMI,Obesity_Class,Vitamin D
1495,Person1496,0,49,174.691233,108.532336,35.564518,3,31.408961
1496,Person1497,0,11,198.195888,64.990285,16.54471,0,91.671418
1497,Person1498,0,27,182.247084,80.767737,24.317378,1,37.358266
1498,Person1499,1,24,174.215789,61.793388,20.359494,1,48.444741
1499,Person1500,0,68,198.384762,87.036105,22.11479,1,25.464402


### Displaying the number of rows and columns

In [5]:
# (number of rows, number of columns) of the loaded frame
df.shape

(1500, 8)

In [6]:
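# Gender is already integer-encoded (0/1) in this CSV, so the mapping below is intentionally left disabled.
#df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})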

### Displaying the data information

In [7]:
# Per-column dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           1500 non-null   object 
 1   Gender         1500 non-null   int64  
 2   Age            1500 non-null   int64  
 3   Height_cm      1500 non-null   float64
 4   Weight_kg      1500 non-null   float64
 5   BMI            1500 non-null   float64
 6   Obesity_Class  1500 non-null   int64  
 7   Vitamin D      1500 non-null   float64
dtypes: float64(4), int64(3), object(1)
memory usage: 93.9+ KB


### Displaying Column Names

In [8]:
# Column labels of the loaded frame
df.columns

Index(['Name', 'Gender', 'Age', 'Height_cm', 'Weight_kg', 'BMI',
       'Obesity_Class', 'Vitamin D'],
      dtype='object')

### Displaying the Basic Statistics of Numeric Columns

In [9]:
# Summary statistics; with default arguments only the numeric columns are
# summarised. Gender and Obesity_Class are integer-coded categories, so their
# mean/std are of limited meaning.
# NOTE(review): the minimum of 'Vitamin D' in the output is negative — confirm
# whether such rows are valid before modelling.
df.describe()

Unnamed: 0,Gender,Age,Height_cm,Weight_kg,BMI,Obesity_Class,Vitamin D
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,0.484,35.497333,175.153488,84.802305,28.203942,1.882,39.893597
std,0.499911,20.209955,14.516165,19.970876,8.134329,1.072463,19.643454
min,0.0,1.0,150.005719,50.072305,12.89407,0.0,-23.725754
25%,0.0,18.75,162.612629,67.536386,22.089594,1.0,26.90439
50%,0.0,36.0,175.376028,85.084444,27.480795,2.0,41.072871
75%,1.0,53.0,187.722242,101.936199,33.39277,3.0,53.187011
max,1.0,70.0,199.926025,119.991305,52.720153,3.0,114.192964


### Displaying the Basic Statistics of Categorical Data

In [10]:
# Gender is an integer-coded category: list its distinct codes, then how many
# rows carry each code.
gender_codes = df['Gender']

print("Displaying the Categories in the Gender Column:")
display(gender_codes.unique())

print("\nDisplaying each Category count in the Column:")
display(gender_codes.value_counts())

Displaying the Categories in the Gender Column:


array([1, 0], dtype=int64)


Displaying each Category count in the Column:


Gender
0    774
1    726
Name: count, dtype: int64

In [11]:
# Obesity_Class is an integer-coded category: list its distinct codes, then
# how many rows carry each code.
obesity_codes = df['Obesity_Class']

print("Displaying the Categories in the Obesity_Class Column:")
display(obesity_codes.unique())

print("\nDisplaying each Category count in the Column:")
display(obesity_codes.value_counts())

Displaying the Categories in the Obesity_Class Column:


array([1, 2, 3, 0], dtype=int64)


Displaying each Category count in the Column:


Obesity_Class
3    597
1    400
2    316
0    187
Name: count, dtype: int64

### Displaying null values information

##### Checking whether each column contains null values

In [12]:
# Per column: True if the column holds at least one missing value.
df.isna().any()

Name             False
Gender           False
Age              False
Height_cm        False
Weight_kg        False
BMI              False
Obesity_Class    False
Vitamin D        False
dtype: bool

##### Counting the null values in each column

In [13]:
# Per column: number of missing values.
df.isna().sum()

Name             0
Gender           0
Age              0
Height_cm        0
Weight_kg        0
BMI              0
Obesity_Class    0
Vitamin D        0
dtype: int64

##### Checking whether the dataset contains duplicate rows

In [14]:
# True if at least one row is an exact duplicate of an earlier row
df.duplicated().any()

False

##### Counting the duplicate rows in the dataset

In [15]:
# Number of rows that exactly duplicate an earlier row
df.duplicated().sum()

0

### Dropping unnecessary columns

In [16]:
# 'Name' is a per-row label (Person1, Person2, ...) rather than a predictor,
# so it is removed before modelling.
# errors='ignore' keeps this cell idempotent: re-running it after 'Name' has
# already been dropped is a no-op instead of raising KeyError.
df = df.drop(columns=['Name'], errors='ignore')

### Selecting Features and Target Variable

In [17]:
# Target is the 'Vitamin D' column; the features are every remaining column.
y = df['Vitamin D']
X = df.drop(columns=['Vitamin D'])

### Splitting Data into Train and Test Sets

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of every partition as a quick sanity check.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}:", split_data.shape)

Shape of X_train: (1200, 6)
Shape of X_test: (300, 6)
Shape of y_train: (1200,)
Shape of y_test: (300,)


# Applying Regression Models

In [19]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

## Random Forest

In [20]:
from sklearn.ensemble import RandomForestRegressor

# Train a 100-tree random forest on the training split; the fixed random_state
# keeps the fitted model (and the metrics below) reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out test split. r2_score, mean_squared_error,
# mean_absolute_error, np and pickle are already imported by earlier cells,
# so they are not imported again here.
y_pred = rf.predict(X_test)

# Calculate metrics
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of MSE, i.e. in the same units as the target
mae = mean_absolute_error(y_test, y_pred)

# Displaying the metrics
print(f"R-squared (R²): {r2}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")

# Persist the fitted model to disk so it can be reloaded without retraining.
with open('rf_model.sav', 'wb') as model_file:
    pickle.dump(rf, model_file)

R-squared (R²): 0.7382756624353206
Mean Squared Error (MSE): 110.91225341775392
Root Mean Squared Error (RMSE): 10.53148866104664
Mean Absolute Error (MAE): 8.364280465431092


In [21]:
# Feature columns of X, in the order used to fit the model
X.columns

Index(['Gender', 'Age', 'Height_cm', 'Weight_kg', 'BMI', 'Obesity_Class'], dtype='object')

In [22]:
import pickle

import pandas as pd

# Cut-offs applied to the predicted value when assigning a status label.
# NOTE(review): these look like ng/mL thresholds — confirm they match the
# units of the 'Vitamin D' training column.
DEFICIENT_BELOW = 20
INSUFFICIENT_BELOW = 30
SUFFICIENT_MAX = 100


def classify_vitamin_d(level):
    """Return the status label for a predicted Vitamin D value.

    Parameters
    ----------
    level : float
        Predicted Vitamin D value.

    Returns
    -------
    str
        "Deficient" for values below 20, "Insufficient" for values from 20
        up to (not including) 30, "Sufficient" for values from 30 up to and
        including 100, and "Upper Safety Limit" for anything above 100.
    """
    if level < DEFICIENT_BELOW:
        return "Deficient"
    if level < INSUFFICIENT_BELOW:
        return "Insufficient"
    if level <= SUFFICIENT_MAX:
        return "Sufficient"
    return "Upper Safety Limit"


def prompt_value(prompt, cast, is_valid, hint):
    """Ask the user for a value until it parses and passes validation.

    Parameters
    ----------
    prompt : str
        Text shown to the user.
    cast : callable
        Converts the typed text (e.g. ``int`` or ``float``); a ``ValueError``
        from it is treated as invalid input.
    is_valid : callable
        Predicate applied to the converted value.
    hint : str
        Explanation printed when the input is rejected.

    Returns
    -------
    The converted, validated value.
    """
    while True:
        reply = input(prompt)
        try:
            value = cast(reply)
        except ValueError:
            print(f"Invalid input: {hint}")
            continue
        if is_valid(value):
            return value
        print(f"Invalid input: {hint}")


def _is_positive_finite(value):
    """True for finite numbers greater than zero (rejects NaN and infinity)."""
    return 0 < value < float("inf")


# In a notebook kernel (and when run as a script) __name__ is "__main__", so
# this section runs whenever the cell is executed. The guard only keeps the
# prompts from firing if the code is exported to a module and imported.
if __name__ == "__main__":
    # NOTE(security): pickle.load can execute arbitrary code from the file.
    # Only load 'rf_model.sav' when it was produced by this notebook or comes
    # from a trusted source.
    with open('rf_model.sav', 'rb') as file:
        rf = pickle.load(file)

    Gender = prompt_value(
        "Enter Gender 0 or 1: ", int, lambda v: v in (0, 1), "enter 0 or 1"
    )
    Age = prompt_value(
        "Enter age: ", int, lambda v: v >= 0, "enter a non-negative whole number"
    )
    Height_cm = prompt_value(
        "Enter Height in cm: ", float, _is_positive_finite, "enter a positive number"
    )
    Weight_kg = prompt_value(
        "Enter Weight in kg: ", float, _is_positive_finite, "enter a positive number"
    )
    BMI = prompt_value(
        "Enter BMI: ", float, _is_positive_finite, "enter a positive number"
    )
    Obesity_Class = prompt_value(
        "Enter Obesity_Class 0,1,2,3: ",
        int,
        lambda v: v in (0, 1, 2, 3),
        "enter 0, 1, 2 or 3",
    )

    # Single-row frame whose columns match the training features in name and order.
    d = {'Gender':[Gender],'Age':[Age],'Height_cm':[Height_cm],'Weight_kg':[Weight_kg],'BMI':[BMI],'Obesity_Class':[Obesity_Class]}
    fea = pd.DataFrame(d)

    # Final predictions
    final_predictions = rf.predict(fea)
    print("\nPredicted Vitamin D:",final_predictions[0])
    print(classify_vitamin_d(final_predictions[0]))


Predicted Vitamin D: 9.97774066580895
Deficient
