In [21]:
import pandas as ps
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
import numpy as np

In [2]:
df = ps.read_csv("calories.csv")
df = df.drop('User_ID',axis='columns')
df.head()

Unnamed: 0,Calories
0,231.0
1,66.0
2,26.0
3,71.0
4,35.0


In [3]:
ds = ps.read_csv("exercise.csv")
ds.head()

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8


In [4]:
com = ps.concat([df,ds],axis='columns')
com.to_csv("Merged.csv",index=False)
com.head()

Unnamed: 0,Calories,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,231.0,14733363,male,68,190.0,94.0,29.0,105.0,40.8
1,66.0,14861698,female,20,166.0,60.0,14.0,94.0,40.3
2,26.0,11179863,male,69,179.0,79.0,5.0,88.0,38.7
3,71.0,16180408,female,34,179.0,71.0,13.0,100.0,40.5
4,35.0,17771927,female,27,154.0,58.0,10.0,81.0,39.8


In [5]:
com.isnull().sum()

Calories      0
User_ID       0
Gender        0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
dtype: int64

In [6]:
com.describe()

Unnamed: 0,Calories,User_ID,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,89.539533,14977360.0,42.7898,174.465133,74.966867,15.5306,95.518533,40.025453
std,62.456978,2872851.0,16.980264,14.258114,15.035657,8.319203,9.583328,0.77923
min,1.0,10001160.0,20.0,123.0,36.0,1.0,67.0,37.1
25%,35.0,12474190.0,28.0,164.0,63.0,8.0,88.0,39.6
50%,79.0,14997280.0,39.0,175.0,74.0,16.0,96.0,40.2
75%,138.0,17449280.0,56.0,185.0,87.0,23.0,103.0,40.6
max,314.0,19999650.0,79.0,222.0,132.0,30.0,128.0,41.5


#Outlier Removel

In [7]:
upper = com.Calories.mean()+3*com.Calories.std()
upper

276.91046720114866

In [8]:
lower = com.Calories.mean()-3*com.Calories.std()
lower

-97.83140053448201

In [9]:
com_outlier = com[(com.Calories>lower)&(com.Calories<upper)]
com_outlier.shape

(14993, 9)

In [10]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
com['Gender'] = label.fit_transform(com['Gender'])
com.head()#Male(1) and Female(0)

Unnamed: 0,Calories,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,231.0,14733363,1,68,190.0,94.0,29.0,105.0,40.8
1,66.0,14861698,0,20,166.0,60.0,14.0,94.0,40.3
2,26.0,11179863,1,69,179.0,79.0,5.0,88.0,38.7
3,71.0,16180408,0,34,179.0,71.0,13.0,100.0,40.5
4,35.0,17771927,0,27,154.0,58.0,10.0,81.0,39.8


In [11]:
com.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Calories    15000 non-null  float64
 1   User_ID     15000 non-null  int64  
 2   Gender      15000 non-null  int32  
 3   Age         15000 non-null  int64  
 4   Height      15000 non-null  float64
 5   Weight      15000 non-null  float64
 6   Duration    15000 non-null  float64
 7   Heart_Rate  15000 non-null  float64
 8   Body_Temp   15000 non-null  float64
dtypes: float64(6), int32(1), int64(2)
memory usage: 996.2 KB


In [12]:
from sklearn.model_selection import train_test_split
x = com.drop(['User_ID','Calories'],axis='columns')
y = com.Calories
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [13]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(x_train,y_train)
reg.score(x_test,y_test)

0.966502924182346

In [14]:
from sklearn.model_selection import cross_val_score
cross_val_score(LinearRegression(),x,y,cv=5)

array([0.96712832, 0.96658977, 0.96769213, 0.96828562, 0.96606908])

In [30]:
max_features = [0.3, 0.4, 0.5]
models = {
    'linear_regression_params' : {'model': LinearRegression(),'para':{'fit_intercept': [True,False],'copy_X': [False,True]}
},

'decision_tree_regression_params' : {'model':DecisionTreeRegressor(),'para':{
    'criterion': ['poisson','friedman_mse'],
    'max_depth': [None,1,2],
    'min_samples_split': [2,3],
    'min_samples_leaf': [1,2,3,4,5]
}
},

'lasso_regression_params' : {'model':Lasso(max_iter=10000),'para':{
    'alpha': [1.0,1.5,2.0,2.5],
    'copy_X': [True,False],
    'selection':['cyclic', 'random'],
    'random_state':[None,1,2]
}
},

'random_forest_regression_params' : {'model':RandomForestRegressor(),'para':{
    'n_estimators': [10,20,30,40],
    'criterion': ['poisson','squared_error'],
    'max_depth': [None,1,2,3],
    'min_samples_split': [2,3,4,5],
    'max_features': max_features
}
}
}

In [34]:
scores=[]
cv = ShuffleSplit(n_splits=5,test_size=0.20,random_state=0)
for model_name,model_para in models.items():
    gri = GridSearchCV(model_para['model'],model_para['para'],cv=cv,error_score='raise',return_train_score=False)
    gri.fit(x,y)
    scores.append({
        'model':model_name,
        'best_score':gri.best_score_,
        'best_para': gri.best_params_
    })
    print(gri.best_params_,gri.best_score_)


{'copy_X': False, 'fit_intercept': True} 0.9674331361229852
{'criterion': 'poisson', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2} 0.9933212229461791
{'alpha': 1.0, 'copy_X': True, 'random_state': 1, 'selection': 'random'} 0.964859278949918
{'criterion': 'squared_error', 'max_depth': None, 'max_features': 0.5, 'min_samples_split': 2, 'n_estimators': 40} 0.9972602914513542


In [35]:
data = ps.DataFrame(scores,columns=['model','best_score','best_para'])    
data

Unnamed: 0,model,best_score,best_para
0,linear_regression_params,0.967433,"{'copy_X': False, 'fit_intercept': True}"
1,decision_tree_regression_params,0.993321,"{'criterion': 'poisson', 'max_depth': None, 'm..."
2,lasso_regression_params,0.964859,"{'alpha': 1.0, 'copy_X': True, 'random_state':..."
3,random_forest_regression_params,0.99726,"{'criterion': 'squared_error', 'max_depth': No..."


In [36]:
reg = RandomForestRegressor(criterion='squared_error',max_depth=None,min_samples_split=3,n_estimators=40)
reg.fit(x,y)
reg.score(x_test,y_test)

0.9996426342815695

In [24]:
import pickle
with open('calories_predict.pickle','wb') as f:
    pickle.dump(reg,f)