In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler



### Taking input of different files into different dataframes and combining them

In [None]:
# Read the table that holds the "Calories" column (the regression target used later).
dataframe1 = pd.read_csv(filepath_or_buffer='./calories.csv')
# Preview the first five rows to sanity-check the load.
dataframe1.head(n=5)

In [None]:
# Read the table of per-user exercise measurements (the model inputs).
dataframe2 = pd.read_csv(filepath_or_buffer='./exercise.csv')
# Preview the first five rows to sanity-check the load.
dataframe2.head(n=5)

#### Final dataframe is "dataframe"

In [None]:
# Attach the target column to the feature table. Column assignment aligns on the
# row index, so this assumes both CSVs list the same rows in the same order --
# TODO confirm (joining on a shared key would be safer if one exists).
calories_column = dataframe1["Calories"]
dataframe2["Calories"] = calories_column
# Work on an independent copy so later edits do not touch dataframe2.
dataframe = dataframe2.copy()
dataframe

### Dropping columns that are not needed, e.g. User_ID

In [None]:
# Inspect the frame, remove the identifier column, then inspect again.
print(f"shape: {dataframe.shape}")
dataframe.info()
# User_ID is dropped because a user's ID has no relation to the calories being predicted.
# errors="ignore" keeps this cell re-runnable once the column is already gone
# (the plain drop raised KeyError on a second execution).
dataframe = dataframe.drop(columns=["User_ID"], errors="ignore")
dataframe.info()


Checking whether the dataframe has any null values

In [None]:
# Count the missing values in each column and print the per-column totals.
null_counts = dataframe.isna().sum()
print(null_counts)

### Correlation matrix of all features

In [None]:
# Pairwise correlation of the numeric columns.
# The original referenced `dataframe.corr` without calling it, which only shows
# the bound method. Gender is still text at this point, so the matrix is
# restricted to numeric columns to avoid a conversion error.
dataframe.select_dtypes(include="number").corr()

In [None]:
# styled_df = dataframe.style.background_gradient(cmap='coolwarm')
# styled_df

In [None]:
# Visual check of how Height relates to Weight.
fig, ax = plt.subplots()
sb.scatterplot(data=dataframe, x='Height', y='Weight', ax=ax)
plt.show()

## plot between all features and calories

In [None]:
# Scatter every candidate feature against the Calories target.
listoffeatures = list(dataframe.columns)
# Skip the first column and the last one; the last is presumably the Calories
# target appended when the frames were combined -- verify the column order.
Feat = listoffeatures[1:-1]

# One fixed-seed subsample is shared by all panels so the figures are
# reproducible and comparable; min() guards against frames under 1000 rows,
# where sample(1000) would raise.
plot_sample = dataframe.sample(n=min(1000, len(dataframe)), random_state=42)
for col in Feat:
    sb.scatterplot(x=col, y='Calories', data=plot_sample)
    plt.show()


In [None]:
# features = dataframe.select_dtypes(include='float').columns

# plt.subplots(figsize=(15, 10))
# for i, col in enumerate(features):
#     plt.subplot(2, 3, i + 1)
#     sb.distplot(dataframe[col])
# plt.tight_layout()
# plt.show()

## All Pair plots

In [None]:
# Grid of pairwise plots across the columns of the working frame.
sb.pairplot(data=dataframe)

In [None]:
# Display the working frame as it stands before Gender is converted to numeric codes.
dataframe

### changing the categorical values to numerical values using categorical.codes

In [None]:
# Replace the Gender labels with integer category codes (categories are ordered
# by sorted label; missing values would become -1).
gender_as_category = dataframe["Gender"].astype("category")
dataframe["Gender"] = gender_as_category.cat.codes
dataframe

## HEATMAP of correlation

In [None]:
# Boolean mask: True where two columns correlate above 0.9.
# Only strong positive correlation is flagged; strong negative values are not.
high_corr_mask = dataframe.corr() > 0.9
plt.figure(figsize=(8, 8))
sb.heatmap(high_corr_mask, annot=True, cbar=True)
plt.show()

### As we can see
####             1.   Height and Weight are highly correlated
####             2.   Body_Temp and Duration are highly correlated
### Dropping Weight and Duration should therefore not change the predictions much, because Height and Body_Temp already carry most of the information in Weight and Duration (they are highly correlated).
### (The drop was skipped because the dataset has only a few features.)
### We tested the models both with and without these two columns and kept the better of the two.


In [None]:
# Weight and Duration are kept; the drop below is intentionally left disabled.
# dataframe.drop(['Weight', 'Duration'], axis=1, inplace=True)
# Display the frame that feeds the feature/target split.
dataframe

## creating X and y

In [None]:
# Target vector: the Calories column.
y = dataframe["Calories"]
# Feature matrix: every remaining column.
X = dataframe.drop(columns=["Calories"])

## train and test split 80:20

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Learn the scaling statistics from the training rows only, then apply the same
# transform to both partitions so no test information leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## LinearRegression

In [None]:
from sklearn.metrics import mean_squared_error

# Ordinary least-squares baseline fitted on the scaled training features.
model_linear = LinearRegression().fit(X_train, y_train)
model_linear

In [None]:
# Mean squared error of the linear model on the data it was trained on.
trainprediction_linear = model_linear.predict(X_train)
linear_train_mse = mean_squared_error(y_train, trainprediction_linear)
print(f"Training Error: {linear_train_mse}")

In [None]:
# Mean squared error of the linear model on the held-out rows.
testprediction_linear = model_linear.predict(X_test)
linear_test_mse = mean_squared_error(y_test, testprediction_linear)
print(f"Testing Error: {linear_test_mse}")

In [None]:
# Explicit imports replace the wildcard so it is clear where each metric comes
# from and the notebook namespace is not flooded with unrelated names.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Test-set metrics for the linear model.
print("For Linear model:")
mse = mean_squared_error(y_test, testprediction_linear)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, testprediction_linear)
r2 = r2_score(y_test, testprediction_linear)
r2Linear = r2  # kept for the final R2 comparison table
print("mse: ", mse)
print("rmse: ",rmse)
print("mae: ",mae)
print("r2: ",r2)




In [None]:
# Predicted-vs-true scatter for the linear model on the test set.
# The true values go on x and the predictions on y so the axis labels match the
# plotted data (the original had them swapped). The title shows the computed
# R^2 instead of a typed-in figure that can go stale.
linear_r2_for_title = r2_score(y_test, testprediction_linear)
plt.figure(figsize=(12, 12))
plt.title(f"Predicted vs. True Output For Linear Model (R2 = {linear_r2_for_title:.3f})")
plt.scatter(y_test, testprediction_linear)
plt.xlabel("y_true Values for test input")
plt.ylabel("y_predicted Values for test input")
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()

## Lasso Regression

In [None]:
# L1-regularised linear model with scikit-learn's default settings.
model_lasso = Lasso().fit(X_train, y_train)
model_lasso


In [None]:
# Mean squared error of the Lasso model on the training rows.
trainprediction_lasso = model_lasso.predict(X_train)
lasso_train_mse = mean_squared_error(y_train, trainprediction_lasso)
print(f"Training Error: {lasso_train_mse}")


In [None]:
# Mean squared error of the Lasso model on the held-out rows.
testprediction_lasso = model_lasso.predict(X_test)
lasso_test_mse = mean_squared_error(y_test, testprediction_lasso)
print(f"Testing Error: {lasso_test_mse}")

In [None]:
# Test-set metrics for the Lasso model.
print("For Lasso model:")
mse = mean_squared_error(y_test, testprediction_lasso)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, testprediction_lasso)
# R^2 must be scored on the Lasso predictions; the original passed the linear
# model's predictions here, so it reported the linear score a second time.
r2 = r2_score(y_test, testprediction_lasso)
r2_lasso = r2  # kept for the final R2 comparison table
print("mse: ", mse)
print("rmse: ",rmse)
print("mae: ",mae)
print("r2: ",r2)

In [None]:
# Predicted-vs-true scatter for the Lasso model on the test set.
# True values on x, predictions on y, so the labels match the data (they were
# swapped before); the title reports the computed R^2 rather than a fixed number.
lasso_r2_for_title = r2_score(y_test, testprediction_lasso)
plt.figure(figsize=(12, 12))
plt.title(f"Predicted vs. True Output For Lasso Model (R2 = {lasso_r2_for_title:.3f})")
plt.scatter(y_test, testprediction_lasso)
plt.xlabel("y_true Values for test input")
plt.ylabel("y_predicted Values for test input")
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()

## RIDGE

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised baseline with a small penalty (alpha = 0.1).
ridge_model = Ridge(alpha=0.1).fit(X_train, y_train)
ridge_model


In [None]:
# Predictions of the baseline Ridge model on both partitions.
trainprediction_ridge = ridge_model.predict(X_train)
testprediction_ridge = ridge_model.predict(X_test)

# mse1 is the training error, mse the test error.
mse1 = mean_squared_error(y_train, trainprediction_ridge)
mse = mean_squared_error(y_test, testprediction_ridge)
r2_ridge = r2_score(y_test, testprediction_ridge)  # kept for the final comparison

print(f"Mean Squared Error for train: {mse1:.4f}")
print(f"Mean Squared Error for test: {mse:.4f}")
print(f"Model Coefficients: {ridge_model.coef_}")
print(f"Intercept: {ridge_model.intercept_}")

## Tuning parameter in ridge

In [None]:
# Pick the Ridge penalty by 5-fold cross-validation on the TRAINING data.
# Two fixes over the original loop:
#   * it scored candidates on the test set, which leaks test information into
#     model selection and makes the later test metrics optimistic;
#   * the variable named "mse" actually held the mean ABSOLUTE error.
rangeofpar = [0.10, 0.20, 0.30, 0.40, 0.50, 1, 1.2, 1.4, 2, 5, 11, 20, 20.5, 20.6, 20.8, 21, 22]
bestPar = None
# Name kept for backward compatibility; it now holds the best cross-validated MSE.
BestMseForTest = float("inf")
for ele in rangeofpar:
    # scikit-learn returns negated MSE ("greater is better"); flip the sign back.
    fold_scores = cross_val_score(
        Ridge(alpha=ele), X_train, y_train, cv=5, scoring='neg_mean_squared_error'
    )
    mse = -fold_scores.mean()
    if mse < BestMseForTest:
        bestPar = ele
        BestMseForTest = mse
print(bestPar)


## Training model with best parameter found

In [None]:
# Refit Ridge with the penalty selected by the tuning loop. Using bestPar
# (instead of a typed-in 20.8) keeps this cell in sync with the search result
# whenever the candidate list or the data changes.
ridge_model = Ridge(alpha=bestPar)
ridge_model.fit(X_train, y_train)

In [None]:
# Tuned Ridge predictions: y_pred1 for the training rows, y_pred for the test rows.
y_pred1 = ridge_model.predict(X_train)
y_pred = ridge_model.predict(X_test)

In [None]:
# Errors and fitted parameters of the tuned Ridge model.
r2_tunedRidge = r2_score(y_test, y_pred)
mse1 = mean_squared_error(y_train, y_pred1)  # training error
mse = mean_squared_error(y_test, y_pred)     # test error
print(f"Mean Squared Error for train: {mse1:.4f}")
print(f"Mean Squared Error for test: {mse:.4f}")
print(f"Model Coefficients: {ridge_model.coef_}")
print(f"Intercept: {ridge_model.intercept_}")
print(r2_tunedRidge)

In [None]:
# Predicted-vs-true scatter for the tuned Ridge model on the test set.
# True values on x, predictions on y, so the labels match the data (they were
# swapped before). The title reads the penalty and R^2 from the fitted model and
# its predictions instead of hard-coding them.
ridge_r2_for_title = r2_score(y_test, y_pred)
plt.figure(figsize=(12, 12))
plt.title(
    f"Predicted vs. True Output For Ridge Model with parameter {ridge_model.alpha} "
    f"(R2 = {ridge_r2_for_title:.3f})"
)
plt.scatter(y_test, y_pred)
plt.xlabel("y_true Values for test input")
plt.ylabel("y_predicted Values for test input")
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()

### RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline forest: 100 trees, fixed seed for repeatable results.
model_regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
model_regressor

In [None]:
# Predictions of the baseline forest on both partitions.
testprediction_random = model_regressor.predict(X_test)
trainprediction_random = model_regressor.predict(X_train)

# mse1 is the training error, mse the test error.
mse1 = mean_squared_error(y_train, trainprediction_random)
mse = mean_squared_error(y_test, testprediction_random)
print(f"Mean Squared Error for train: {mse1:.4f}")
print(f"Mean Squared Error for test: {mse:.4f}")

r2 = r2_score(y_test, testprediction_random)
r2_rfr = r2  # kept for the final R2 comparison table
print("r2:", r2)

## Using GridSearch to find best parameters for the random forest regressor 
*(We chose the random forest regressor because it was the most accurate of all the models we tested, so here we try to improve its accuracy a little further.)*

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold search over forest size and tree-shape parameters.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Seed the base estimator: without it every candidate is trained with fresh
# randomness, so the chosen "best" parameters can change from run to run.
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
# best_score_ is negated MSE; flip the sign to report a plain MSE.
print("Best Score:", -grid_search.best_score_)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build the tuned forest from the parameters the grid search selected, rather
# than from typed-in values that silently go stale when the search is re-run.
model_regressor = RandomForestRegressor(random_state=42, **grid_search.best_params_)
model_regressor.fit(X_train, y_train)

In [None]:
# Tuned forest predictions on the test and training rows.
testprediction_random = model_regressor.predict(X_test)
trainprediction_random = model_regressor.predict(X_train)

In [None]:
# Errors of the tuned forest; r2_gs feeds the final comparison table.
r2_gs = r2_score(y_test, testprediction_random)
mse1 = mean_squared_error(y_train, trainprediction_random)  # training error
mse = mean_squared_error(y_test, testprediction_random)     # test error
print(f"Mean Squared Error for train: {mse1:.4f}")
print(f"Mean Squared Error for test: {mse:.4f}")
print(r2_gs)
# Author's note from an earlier run: R2 was about 0.99827 with a test MSE of
# about 1.1295 on the roughly 15k-row dataset -- re-run to confirm these figures.

In [None]:
# Predicted-vs-true scatter for the tuned random forest on the test set.
# True values on x, predictions on y, so the labels match the data (they were
# swapped before); the title reports the computed R^2 rather than a fixed number.
forest_r2_for_title = r2_score(y_test, testprediction_random)
plt.figure(figsize=(12, 12))
plt.title(f"Predicted vs. True Output For RandomForestRegression (R2 = {forest_r2_for_title:.4f})")
plt.scatter(y_test, testprediction_random)
plt.xlabel("y_true Values for test input")
plt.ylabel("y_predicted Values for test input")
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()

## RandomForestRegressor with 5-fold Evaluation

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, cross_validate

# 5-fold cross-validation of a 100-tree forest on the full, unscaled data.
# The original computed the fold scores and then ignored them, refitting the
# previously tuned model on the single train/test split, so the "5-fold" R^2
# merely repeated the tuned-forest score. The metrics below come from the folds.
model = RandomForestRegressor(n_estimators=100, random_state=42)
cv_results = cross_validate(
    model, X, y, cv=5,
    scoring=('neg_mean_squared_error', 'r2'),
    return_train_score=True,
)
# scikit-learn reports negated MSE ("greater is better"); flip the sign back.
scores = cv_results['test_neg_mean_squared_error']
mse = -scores.mean()
mse1 = -cv_results['train_neg_mean_squared_error'].mean()
r2_5f = cv_results['test_r2'].mean()  # mean held-out R^2 across the five folds
print(f"Mean Squared Error for train: {mse1:.4f}")
print(f"Mean Squared Error for test: {mse:.4f}")
print(r2_5f)

### Training the same model configuration 10 times and averaging the predictions (averaging reduces variance only when the runs differ, e.g. by random seed)

In [None]:
# Average the test predictions of 10 forests that share hyperparameters but use
# different seeds. The original reused random_state=42 in every iteration, so
# all ten models were identical and the "average" equalled a single model.
n_runs = 10
run_predictions = []
for i in range(n_runs):
    model_regressor = RandomForestRegressor(n_estimators=200, random_state=42 + i, max_depth=20, min_samples_leaf=1, min_samples_split=2)
    model_regressor.fit(X_train, y_train)
    testprediction_random = model_regressor.predict(X_test)
    run_predictions.append(testprediction_random)
# Element-wise mean over the runs; one averaged prediction per test row.
aggr = np.mean(run_predictions, axis=0)

r2_mul = r2_score(y_test, aggr)
mse = mean_squared_error(y_test, aggr)
print(f"Mean Squared Error for test: {mse:.4f}")
print(r2_mul)

# True values on x, averaged predictions on y, so the labels match the data
# (they were swapped in the original plot).
plt.figure(figsize=(12, 12))
plt.title("Predicted vs. True Output For RandomForestRegression (multiple times)")
plt.scatter(y_test, aggr)
plt.xlabel("y_true Values for test input")
plt.ylabel("y_predicted Values for test input")
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()
    
# aggr.shape

### Ensemble Learning using multiple RandomForestRegression models (3 models)

In [None]:
# Three forests with identical hyperparameters but DIFFERENT seeds. With the
# same seed (as in the original) the three members are exact copies and
# averaging them cannot differ from a single model.
ensemble_seeds = (42, 43, 44)
model1, model2, model3 = (
    RandomForestRegressor(n_estimators=250, random_state=seed, max_depth=20, min_samples_leaf=1, min_samples_split=2)
    for seed in ensemble_seeds
)
for ensemble_member in (model1, model2, model3):
    ensemble_member.fit(X_train, y_train)


In [None]:
# Per-member predictions on both partitions.
testprediction1=model1.predict(X_test)
testprediction2=model2.predict(X_test)
testprediction3=model3.predict(X_test)
trainprediction1=model1.predict(X_train)
trainprediction2=model2.predict(X_train)
trainprediction3=model3.predict(X_train)

# Ensemble output: unweighted mean of the three members.
testpredictionAggr=(testprediction1+testprediction2+testprediction3)/3
trainpredictionAggr=(trainprediction1+trainprediction2+trainprediction3)/3

r2_El=r2_score(y_test, testpredictionAggr)
mse = mean_squared_error(y_test, testpredictionAggr)
print(f"Mean Squared Error for test: {mse:.4f}")
print(r2_El)

# The title was copy-pasted from the repeated-runs section; it now names the
# ensemble. True values go on x and predictions on y so the labels match the
# plotted data (they were swapped before).
plt.figure(figsize=(17, 17))
plt.title("Predicted vs. True Output For RandomForestRegression (ensemble of 3 models)")
plt.scatter(y_test, testpredictionAggr)
plt.xlabel("y_true Values for test input")
plt.ylabel("y_predicted Values for test input")
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.show()



## R2 Score comparison

In [None]:
# Test R^2 of every model, keyed by a display label. A single mapping keeps each
# label next to its score, so the two cannot drift out of order the way parallel
# lists can. The tuned Ridge score was computed earlier but was missing from the
# original table; it is included here.
r2_by_model = {
    "Linear": r2Linear,
    "Lasso": r2_lasso,
    "Ridge": r2_ridge,
    "TunedRidge": r2_tunedRidge,
    "RandomForestRegression": r2_rfr,
    "BestparameterRFR": r2_gs,
    "SameModelMultiple": r2_mul,
    "5-Fold": r2_5f,
    "Ensemble": r2_El,
}
# The original list names are kept for any later cell that reads them.
r2arr = list(r2_by_model.values())
lis = list(r2_by_model.keys())
for model_label, model_r2 in r2_by_model.items():
    print(f"{model_label}: {model_r2}")