In [106]:
import pandas as pd
import numpy as np
df= pd.read_csv("ML101_train_dataset.csv")
print(df.shape)
df_G= df.dropna(subset=['Gender'])
df_L = df.dropna(subset=['LifeStyle'])
df_G_L = df.dropna(subset=['LifeStyle'])
df_G_L=df_G_L.dropna(subset=['Gender'])

(100000, 12)


In [107]:
def preprocessing(df_G_L: pd.DataFrame):

    empty_counts = df_G_L.isna().sum(axis=1)
    rows_to_drop = empty_counts[empty_counts >= 3].index
    df_G_L = df_G_L.drop(rows_to_drop)
    df_G_L = df_G_L.reset_index(drop=True)
    df_G_L['Systolic BP'] = df_G_L['Systolic BP'].abs()
    df_G_L['Diastolic BP'] = df_G_L['Diastolic BP'].abs()
    # print(df_G_L.shape)
    df_G_L = df_G_L.interpolate()
    # print(df_G_L)
    def remove_outliers_iqr(data: pd.DataFrame, column):
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

    columns=["Average Daily Steps","Hours of Sleep","Caloric Intake","Age","Height","Weight","Cholesterol level","Blood Sugar level","Systolic BP","Diastolic BP"]
    for column in columns:
        remove_outliers_iqr(df_G_L,column)
    df_G_L.interpolate()
    return df_G_L

preprocessed_data=preprocessing(df_G_L)

# gender_mapping = {'Female': 0, 'Male': 1}
# preprocessed_data['Gender'] = preprocessed_data['Gender'].replace(gender_mapping)

preprocessed_data.to_csv('PPDATA.csv', index=False)


In [108]:
from scipy.stats import multivariate_normal
import math
from numpy import linalg as la
class BayesClassifier:
    def __init__(self, train_data: pd.DataFrame) -> None:
        '''Create a Bayes Classifier using train_data'''
        self.groupedData = {g: gdf.reset_index(drop=True) for g, gdf in train_data.groupby('LifeStyle')}
        self.nd = len(train_data.iloc[1, :]) - 1
        # print(self.groupedData)
        self.samplesize = len(train_data)
        # self.__balanceClasses()
        self.__createclasses()
    
    def __balanceClasses(self):
        # Find the minimum number of data points in a class
        min_class_size = min(len(class_data) for class_data in self.groupedData.values())
        
        # Downsample larger classes to match the size of the smallest class
        for cls in self.groupedData:
            self.groupedData[cls] = self.groupedData[cls].sample(n=min_class_size)

    def __createclasses(self) -> None:
        self.classes = {} 
        for grp in self.groupedData:
            grp_df = self.groupedData[grp].drop('LifeStyle', axis=1)
            self.classes[grp] = (len(grp_df), np.array(grp_df.mean()), np.array(grp_df.cov()))
    
    def __gaussian(self, mean: np.array, cov: np.array, x: np.array) -> float:
        num = math.exp(-0.5 * (x - mean).T.dot(la.inv(cov)).dot(x - mean))
        den = (((2 * math.pi) ** self.nd) * (la.det(cov))) ** 0.5
        return num/den
    
    def __gaussian2(self, mean: np.array, cov: np.array, x: np.array) -> float:
        mvn = multivariate_normal(mean, cov)
        return mvn.pdf(x)
        

    def classify(self, x) -> str:
        c_lp = []
        evidence = 0
        n = self.samplesize
        for clas in self.classes:
            l, m, cov = self.classes[clas]
            # likelihood = self.__gaussian(m, cov, x)
            likelihood = self.__gaussian2(m, cov, x)
            evidence += likelihood
            c_lp.append((clas, (l/n) * likelihood))
        
        c_pp = [(cl, lp/evidence) for cl, lp in c_lp]
        return max(c_pp, key= lambda x: x[1])[0]

In [109]:
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.metrics import mean_squared_error, r2_score

groupedData = {g: gdf.reset_index(drop=True) for g, gdf in preprocessed_data.groupby('Gender')}
mdata = groupedData['Male'].drop(['Gender'], axis=1)
fmdata = groupedData['Female'].drop(['Gender'], axis=1)

BC_M = BayesClassifier(mdata.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 10]])
BC_F = BayesClassifier(fmdata.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 10]])

# XM = mdata.iloc[:, :10]
# YM = mdata.iloc[:, 10]
# xmtrain, xmtest, ymtrain, ymtest = train_test_split(XM, YM, test_size=0.2, random_state=42)

# XFM = fmdata.iloc[:, :10]
# YFM = fmdata.iloc[:, 10]
# xfmtrain, xfmtest, yfmtrain, yfmtest = train_test_split(XFM, YFM, test_size=0.2, random_state=42)

# # X = preprocessed_data.iloc[:, :11]
# # Y = preprocessed_data.iloc[:, 11]
# # xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
# # grpbytarget = {g: gdf.reset_index(drop=True) for g, gdf in mdata.groupby('LifeStyle')}
# # test_size = 0.2
# # bad = grpbytarget['Bad']
# # good = grpbytarget['Good']
# # great = grpbytarget['Great']
# # avg = grpbytarget['Average']
# # bad_train, bad_test = train_test_split(bad, test_size=test_size)
# # good_train, good_test = train_test_split(good, test_size=test_size)
# # great_train, great_test = train_test_split(great, test_size=test_size)
# # avg_train, avg_test = train_test_split(avg, test_size=test_size)

# # mtraindata = pd.concat([bad_train, good_train, great_train, avg_train], axis=0)
# # mtestdata = pd.concat([bad_test, good_test, great_test, avg_test], axis=0)
# # mxtest = mtestdata.iloc[:, :10]
# # mytest = mtestdata.iloc[:, 10]

# # grpbytarget = {g: gdf.reset_index(drop=True) for g, gdf in fmdata.groupby('LifeStyle')}
# # test_size = 0.2
# # bad = grpbytarget['Bad']
# # good = grpbytarget['Good']
# # great = grpbytarget['Great']
# # avg = grpbytarget['Average']
# # bad_train, bad_test = train_test_split(bad, test_size=test_size)
# # good_train, good_test = train_test_split(good, test_size=test_size)
# # great_train, great_test = train_test_split(great, test_size=test_size)
# # avg_train, avg_test = train_test_split(avg, test_size=test_size)

# # fmtraindata = pd.concat([bad_train, good_train, great_train, avg_train], axis=0)
# # fmtestdata = pd.concat([bad_test, good_test, great_test, avg_test], axis=0)
# # fmxtest = fmtestdata.iloc[:, :10]
# # fmytest = fmtestdata.iloc[:, 10]
 

# BC_M = BayesClassifier(pd.concat([xmtrain, ymtrain], axis=1))
# BC_FM = BayesClassifier(pd.concat([xfmtrain, yfmtrain], axis=1))
# # BC = BayesClassifier(pd.concat([xtrain, ytrain], axis=1))

# # print(BC.classes)

# classifications_m = [BC_M.classify(np.array(x)[1:]) for x in xmtest.itertuples()]
# classifications_fm = [BC_FM.classify(np.array(x)[1:]) for x in xfmtest.itertuples()]
# # classifications = [BC.classify(np.array(x)[1:]) for x in xtest.itertuples()]
# corr_m = list(ymtest)
# corr_fm = list(yfmtest)
# # corr = list(ytest)

# acrr = 0
# for i in range(len(corr_m)):
#     if corr_m[i] == classifications_m[i]: acrr += 1

# print(acrr)

# for i in range(len(corr_fm)):
#     if corr_fm[i] == classifications_fm[i]: acrr += 1

# print(acrr)

# acrr /= (len(corr_fm) + len(corr_m))
# print(acrr)

# # newacrr = 0
# # for i in range(len(corr)):
# #     if corr[i] == classifications[i]: newacrr += 1

# # newacrr /= len(corr)
# # print(newacrr)







In [110]:
testdata = pd.read_csv('ML101_dataset_test_feature.csv')

groupedData = {g: gdf for g, gdf in testdata.groupby('Gender')}
mdata = groupedData['Male'].drop(['Gender'], axis=1)
fmdata = groupedData['Female'].drop(['Gender'], axis=1)

classM = [BC_M.classify(np.array(x)[1:]) for x in mdata.itertuples()]
classF = [BC_F.classify(np.array(x)[1:]) for x in fmdata.itertuples()]

classM_df = pd.DataFrame({'LifeStyle': classM}, index=mdata.index)
classF_df = pd.DataFrame({'LifeStyle': classF}, index=fmdata.index)



In [111]:
# print(classM_df)
# print(classF_df)

combined_predictions = pd.concat([classM_df, classF_df], axis=0).sort_index()

combined_predictions.to_csv('Pred.csv')

# print(combined_predictions)

     LifeStyle
0          Bad
17         Bad
19         Bad
22         Bad
24     Average
...        ...
4989       Bad
4991       Bad
4993       Bad
4994       Bad
4998       Bad

[2494 rows x 1 columns]
     LifeStyle
1          Bad
2      Average
3          Bad
4          Bad
5          Bad
...        ...
4992       Bad
4995       Bad
4996   Average
4997       Bad
4999       Bad

[2506 rows x 1 columns]
     LifeStyle
0          Bad
1          Bad
2      Average
3          Bad
4          Bad
...        ...
4995       Bad
4996   Average
4997       Bad
4998       Bad
4999       Bad

[5000 rows x 1 columns]


In [112]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
import math

male_df = preprocessed_data[preprocessed_data['Gender'] == 'Male']
female_df = preprocessed_data[preprocessed_data['Gender'] == 'Female']
male_df=male_df.drop("Gender",axis=1)
female_df=female_df.drop("Gender",axis=1)

def regression(preprocessed_data):
    X = preprocessed_data[["Average Daily Steps","Hours of Sleep","Caloric Intake","Age","Height","Weight","Cholesterol level","Blood Sugar level"]]
    y = preprocessed_data[["Systolic BP","Diastolic BP"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    multioutput_model = MultiOutputRegressor(model)
    multioutput_model.fit(X_train, y_train)
    y_pred = multioutput_model.predict(X_test)
    mse1 = math.sqrt(mean_squared_error(y_test['Systolic BP'], y_pred[:, 0]))
    mse2 = math.sqrt(mean_squared_error(y_test['Diastolic BP'], y_pred[:, 1]))
    print(mse1,mse2)

    r2_1 = r2_score(y_test['Systolic BP'], y_pred[:, 0])
    r2_2 = r2_score(y_test['Diastolic BP'], y_pred[:, 1])
    print(r2_1,r2_2)
    

regression(male_df)
regression(female_df)




66.36383394308966 40.67656162354888
0.11742916749155341 0.06743296408614297
68.2005220600172 41.12989653045316
0.12538610561844543 0.07332030436581094


In [113]:

preprocessed_data['Gender'] = preprocessed_data['Gender'].replace({'Male': 0, 'Female': 1})
data = pd.read_csv('ML101_dataset_test_feature.csv')
data=data[["Average Daily Steps","Hours of Sleep","Caloric Intake","Age","Height",'Gender', "Weight","Cholesterol level","Blood Sugar level"]]
# print(X)
data['Gender'] = data['Gender'].replace({'Male': 0, 'Female': 1})
# print(data)
X = preprocessed_data[["Average Daily Steps","Hours of Sleep","Caloric Intake","Age","Height",'Gender', "Weight","Cholesterol level","Blood Sugar level"]]
# print(X)
y = preprocessed_data[["Systolic BP","Diastolic BP"]]
model = LinearRegression()
multioutput_model = MultiOutputRegressor(model) 
multioutput_model.fit(X,y)
prediction = multioutput_model.predict(data)
print(prediction.shape)
list=[]

prediction_df = pd.DataFrame(prediction, columns=["Predicted Systolic BP", "Predicted Diastolic BP"])
prediction_df['ID'] = range(len(prediction_df))
prediction_df.to_csv('predicted_results.csv', index=False)


(5000, 2)
