In [146]:
import numpy as np
import pandas as pd

In [147]:
# Raw observations, one row per experiment.  Column 3 is a constant 1 and
# column 4 is the only floating-point column.
df = pd.DataFrame(
    data=[
        [25,  4, 10, 1,  3.2],
        [30,  4, 20, 1,  5.8],
        [35,  6, 25, 1,  7.1],
        [40,  6, 30, 1,  8.5],
        [45,  8, 40, 1, 10.2],
        [50,  8, 45, 1, 11.3],
        [55, 10, 50, 1, 12.0],
    ]
)

df


Unnamed: 0,0,1,2,3,4
0,25,4,10,1,3.2
1,30,4,20,1,5.8
2,35,6,25,1,7.1
3,40,6,30,1,8.5
4,45,8,40,1,10.2
5,50,8,45,1,11.3
6,55,10,50,1,12.0


In [148]:
# Split the frame: the first four columns form the design matrix `x`,
# the fifth column is the target `y`.  Copies keep `df` itself untouched.
x = df.iloc[:, 0:4].copy()
y = df.iloc[:, 4].copy()

print(x.shape)
print(y.size)

(7, 4)
7


In [149]:
## Model to fit: y = a1*x1 + a2*x2 + a3*x3 + a4, where a4 is the intercept (carried by the constant column of ones in x)
def get_RSS(y, coeff, x):
    """Return the residual sum of squares of the linear model ``x @ coeff``.

    Parameters
    ----------
    y : array-like of length n (1-D, column vector, Series or DataFrame)
        Observed target values.
    coeff : array-like of length k (1-D or column vector)
        Model weights, one per column of ``x``.
    x : array-like of shape (n, k)
        Design matrix (ndarray, nested list or DataFrame).

    Returns
    -------
    float
        ``sum((y - x @ coeff) ** 2)``.
    """
    design = np.atleast_2d(np.asarray(x, dtype=float))
    # Force column vectors so the subtraction below can never broadcast
    # an (n,) array against an (n, 1) array into an (n, n) matrix.
    target = np.asarray(y, dtype=float).reshape(-1, 1)
    weights = np.asarray(coeff, dtype=float).reshape(-1, 1)

    residual = target - design @ weights
    return float(np.sum(residual ** 2))

def get_Residual(y, coeff, x):
    """Return the residual ``y - x @ coeff`` of a linear model.

    Parameters
    ----------
    y : Series, DataFrame, list or ndarray
        Observed targets.  pandas objects and 1-D inputs are turned into a
        column vector; 2-D arrays are used as given.
    coeff : array-like
        Model weights.  A 1-D input is turned into a column vector.
    x : DataFrame or array-like of shape (n, k)
        Design matrix.

    Returns
    -------
    numpy.ndarray
        Residuals, shape (n, 1) for a single-output model.
    """
    design = np.asarray(x)

    target = np.asarray(y)
    # A 1-D target minus an (n, 1) prediction would silently broadcast to an
    # (n, n) matrix, so 1-D (and pandas) targets become column vectors.
    if isinstance(y, (pd.Series, pd.DataFrame)) or target.ndim == 1:
        target = target.reshape(-1, 1)

    weights = np.asarray(coeff)
    if weights.ndim == 1:
        weights = weights.reshape(-1, 1)

    return target - design @ weights

In [150]:
def find_coeff(x, y, max_iter=10000, tol=1e-6):
    """Fit linear least-squares weights with Gauss-Newton iterations.

    Starting from all-zero weights, each iteration solves the normal
    equations ``(XᵀX) delta = Xᵀ r`` for the current residual ``r`` and adds
    ``delta`` to the weights.

    Parameters
    ----------
    x : array-like of shape (n, k)
        Design matrix.  Include a column of ones to fit an intercept.
    y : array-like of length n
        Target values; flattened into a column vector.
    max_iter : int
        Upper bound on the number of iterations.
    tol : float
        Iteration stops once the Euclidean norm of an update drops below
        this value.

    Returns
    -------
    numpy.ndarray of shape (k, 1)
        Fitted weights.  If ``XᵀX`` is singular a message is printed and the
        weights reached so far (all zeros on the first pass) are returned.
    """
    X = np.asarray(x)
    Y = np.asarray(y).reshape(-1, 1)

    coeff = np.zeros((X.shape[1], 1))
    # The model is linear in the weights, so XᵀX does not change between
    # iterations; build it once instead of on every pass.
    XtX = X.T @ X

    for _ in range(max_iter):
        residual = Y - X @ coeff
        try:
            # Solving the system avoids forming an explicit inverse.
            delta = np.linalg.solve(XtX, X.T @ residual)
        except np.linalg.LinAlgError:
            print("Matrix inversion failed — XᵗX may be singular.")
            break

        coeff = coeff + delta

        # Stop as soon as the update is negligible rather than always
        # burning through max_iter passes.
        if np.linalg.norm(delta) < tol:
            break

    return coeff




# find_coeff(x, y)

# def find_coeff(x, y, max_iter=1000, tol=1e-6, lr=0.001):
#     coeff_old = np.zeros((x.shape[1], 1))
#     X = np.asarray(x)
#     Y = np.asarray(y).reshape(-1, 1)
    
#     for itr in range(max_iter):
#         try:
#             r = Y - X @ coeff_old
#             JTJ_inv = np.linalg.pinv(X.T @ X)
#             delta = JTJ_inv @ X.T @ r

#             coeff_new = coeff_old - lr * delta

#             if np.linalg.norm(delta) < tol:
#                 print(f"Converged at iteration {itr}")
#                 break

#             coeff_old = coeff_new

#         except np.linalg.LinAlgError:
#             print("Matrix inversion failed — XᵗX may be singular.")
#             break

#     return coeff_old


# Fit the weights on the full design matrix and show them.
coeff_final = find_coeff(x=x, y=y)
print(coeff_final)

[[ 0.03636364]
 [-0.30909091]
 [ 0.23818182]
 [ 1.39090909]]


In [151]:
print(coeff_final)

def predict(x, coeff=None):
    """Return the linear-model prediction ``x.dot(coeff)``.

    Parameters
    ----------
    x : numpy.ndarray or pandas object
        One sample (1-D) or a batch of samples (2-D); anything exposing
        ``.dot``.
    coeff : array-like, optional
        Model weights.  When omitted, the module-level ``coeff_final`` is
        used.

    Returns
    -------
    Whatever ``x.dot(coeff)`` returns for the given input types.
    """
    if coeff is None:
        # Resolve the fitted weights at call time.  A default of
        # ``coeff=coeff_final`` would be frozen when the function is defined
        # and go stale after a re-fit.
        coeff = coeff_final
    return x.dot(coeff)

predict(np.array([50, 8, 45, 1]))

[[ 0.03636364]
 [-0.30909091]
 [ 0.23818182]
 [ 1.39090909]]


array([11.45454545])

In [152]:
from sklearn.feature_selection import f_regression



# f_regression needs a 1-D target: flatten a Series, DataFrame or column vector.
y = np.asarray(y).ravel()

# Compute F-statistics and p-values (one univariate test per column of x)
F_values, p_values = f_regression(x, y)

# Make a nice table
if isinstance(x, pd.DataFrame):
    feature_names = x.columns
else:
    feature_names = [f"Feature_{i}" for i in range(x.shape[1])]

# Rank the features by F-value (descending)
results = (
    pd.DataFrame({
        'Feature': feature_names,
        'F_value': F_values,
        'p_value': p_values
    })
    .sort_values(by='F_value', ascending=False)
    .reset_index(drop=True)
)

print(results)
# Select the column before the row: `results.iloc[0]` builds a mixed-dtype row
# Series, which upcasts an integer feature label to float (printed as "2.0").
print("\nMost important feature:", results['Feature'].iloc[0])

# NOTE: this conclusion is hard-coded; it only holds while the top-ranked
# feature printed above is column 2 (Ce).
print("Therefore the feature that is most important is : Equilibrium concentration (Ce)")

## A high F-test value suggests that the particular feature is highly correlated with the output.
## p-value -> probability of seeing such an F-value by mere chance
# In this question the F-value for feature Ce (column 2) is 488.76, the highest among all features, and the corresponding p-value is very low.
## The high F-value therefore shows that Ce is the most significant feature for determining adsorption capacity, and the probability of getting such a high F-value by mere chance is very low,
## which means it did not happen by mere chance; equilibrium concentration is therefore the most significant feature for predicting the adsorption capacity.


   Feature     F_value   p_value
0        2  488.768777  0.000004
1        0  200.666748  0.000032
2        1   39.599869  0.001490
3        3    0.000000  1.000000

Most important feature: 2.0
Therefore the feature that is most important is : Equilibrium concentration (Ce)


In [154]:
# for verifying

from sklearn.linear_model import LinearRegression
# print(x)

# Cross-check the hand-rolled fit against scikit-learn.  LinearRegression
# fits its own intercept, so the constant column (index 3) is dropped here.
x_ = x.iloc[:, :3].copy()
print(x_)

model_ = LinearRegression()
print(x_.shape, y.shape)
model_.fit(x_, y)
print(model_.coef_, model_.intercept_)

# Fail loudly if the two fits disagree instead of relying on eyeballing the
# printed numbers: the first three weights are the slopes, the fourth the intercept.
_reference = np.ravel(coeff_final)
np.testing.assert_allclose(np.ravel(model_.coef_), _reference[:3], rtol=1e-6, atol=1e-8)
np.testing.assert_allclose(model_.intercept_, _reference[3], rtol=1e-6, atol=1e-8)

sample = np.array([50, 8, 45]).reshape(1, -1)
prediction = model_.predict(sample)
print(prediction)

    0   1   2
0  25   4  10
1  30   4  20
2  35   6  25
3  40   6  30
4  45   8  40
5  50   8  45
6  55  10  50
(7, 3) (7,)
[ 0.03636364 -0.30909091  0.23818182] 1.3909090909090809
[11.45454545]
