In [1]:
# Activity 2
# Multiple Linear Regression

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('E:heart_cleveland_upload.csv')

# Prepare the features and target variables
X = data.drop('age', axis=1)
y = data['age']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the multiple linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the training and testing sets
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

# Calculate the mean squared error
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

print("Mean Squared Error (Training Set):", train_mse)
print("Mean Squared Error (Testing Set):", test_mse)





Mean Squared Error (Training Set): 54.144609187935615
Mean Squared Error (Testing Set): 59.356214788266065


In [None]:
 We calculate the mean squared error (MSE) 
 by comparing the actual target values with the predicted values. 
 The MSE measures the average squared difference 
 between the predicted and actual values. 
 The MSE for the training and 
 testing sets is printed to the console.

In [3]:
# Multicolinearity

import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load the dataset
data = pd.read_csv('E:heart_cleveland_upload.csv')

# Prepare the features and target variables
X = data.drop('age', axis=1)

# Calculate VIF for each independent variable
vif = pd.DataFrame()
vif["Feature"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif)



      Feature        VIF
0         sex   3.802781
1          cp   7.321667
2    trestbps  43.176799
3        chol  23.895659
4         fbs   1.260130
5     restecg   2.186745
6     thalach  36.474242
7       exang   2.073729
8     oldpeak   3.225743
9       slope   3.117972
10         ca   2.130740
11       thal   2.791270
12  condition   3.987740


In [None]:

We then calculate the VIF for each independent variable 
using the variance_inflation_factor function 
from the statsmodels.stats.outliers_influence module.
The VIF is calculated by providing the feature values (X.values)
and the index of the current variable in the loop (i) 
to the variance_inflation_factor function.
The results are stored in a DataFrame (vif), 
where each row represents an independent variable
and its corresponding VIF.
Finally, we print the DataFrame to display
the VIF values for each independent variable.

In [4]:
# Model Selection

import pandas as pd
import numpy as np
import statsmodels.api as sm

# Load the dataset
data = pd.read_csv('E:heart_cleveland_upload.csv')

# Prepare the features and target variables
X = data.drop('age', axis=1)
y = data['age']

# Add a constant term to the features
X = sm.add_constant(X)

# Perform forward selection
selected_features = []
p_values = []
num_features = X.shape[1] - 1  # Exclude the constant term

for _ in range(num_features):
    best_pvalue = np.inf
    best_feature = None
    
    for feature in X.columns:
        if feature not in selected_features:
            model = sm.OLS(y, X[selected_features + [feature]]).fit()
            pvalue = model.pvalues[feature]
            
            if pvalue < best_pvalue:
                best_pvalue = pvalue
                best_feature = feature
    
    if best_feature is not None:
        selected_features.append(best_feature)
        p_values.append(best_pvalue)

# Print selected features and their p-values
result = pd.DataFrame({"Feature": selected_features, "P-value": p_values})
print(result)




      Feature        P-value
0       const  6.409317e-235
1     thalach   1.670125e-12
2    trestbps   2.259571e-07
3          ca   1.373979e-06
4        chol   3.686522e-03
5       exang   5.588038e-02
6         sex   1.013356e-01
7         fbs   2.791351e-01
8     restecg   3.466834e-01
9          cp   4.388094e-01
10  condition   6.297261e-01
11      slope   8.174603e-01
12       thal   8.314490e-01


In [None]:
we perform forward selection to select the best features.
In each iteration, we iterate through the available features 
and fit a model with the selected features plus one additional feature.
We compute the p-value for the additional feature 
and keep track of the feature with the lowest p-value.
The feature with the lowest p-value is added to 
the selected features list. 
This process is repeated until all features have been evaluated.
Finally, we print the selected features and
their corresponding p-values in a DataFrame (result).