In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix # confusion matrix
from sklearn.model_selection import learning_curve

# We can override the default matplotlib styles with those of Seaborn
import seaborn as sns
sns.set()

In [2]:
# Load the prepared modelling table. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="model_dataset.csv")

In [3]:
# Keep `df` as the untouched raw load and work on an independent deep copy.
model_data = df.copy(deep=True)

In [4]:
# Remove the "Unnamed: 0" column (presumably an index that was written out when
# the CSV was saved -- confirm against the export step).
#
# The frame is rebuilt from `df` with a non-mutating drop instead of
# `inplace=True`: the in-place version raised KeyError when the cell was run a
# second time because the column had already been removed. This form gives the
# same result on every execution and still fails loudly if the raw file ever
# stops containing the column.
model_data = df.drop(columns=["Unnamed: 0"])

In [5]:
# Preview the cleaned frame (target column first, then the predictors).
model_data

Unnamed: 0,charges,smoker_log,age_log,children_log,bmi_log
0,16884.92,0.693147,2.995732,0.000000,3.363842
1,1725.55,0.000000,2.944439,0.693147,3.548755
2,4449.46,0.000000,3.367296,1.386294,3.526361
3,21984.47,0.000000,3.526361,0.000000,3.165475
4,3866.86,0.000000,3.496508,0.000000,3.397189
...,...,...,...,...,...
1321,10600.55,0.000000,3.931826,1.386294,3.464798
1322,2205.98,0.000000,2.944439,0.000000,3.494080
1323,1629.83,0.000000,2.944439,0.000000,3.633631
1324,2007.94,0.000000,3.091042,0.000000,3.288402


In [6]:
# Split the frame by column position: column 0 is the regression target and
# every column to its right is a predictor.
# The target is picked with a list ([0]) on purpose so it stays a one-column
# DataFrame of shape (n_rows, 1) rather than collapsing to a Series.
y = model_data.iloc[:, [0]]
X = model_data.iloc[:, 1:]

In [19]:
# Preview the predictor matrix.
X

Unnamed: 0,smoker_log,age_log,children_log,bmi_log
0,0.693147,2.995732,0.000000,3.363842
1,0.000000,2.944439,0.693147,3.548755
2,0.000000,3.367296,1.386294,3.526361
3,0.000000,3.526361,0.000000,3.165475
4,0.000000,3.496508,0.000000,3.397189
...,...,...,...,...
1321,0.000000,3.931826,1.386294,3.464798
1322,0.000000,2.944439,0.000000,3.494080
1323,0.000000,2.944439,0.000000,3.633631
1324,0.000000,3.091042,0.000000,3.288402


In [20]:
# Independent deep copy of the predictors; the scaling and selection cells read
# from this copy rather than from `X`.
X_copy = X.copy(deep=True)

In [21]:
# Preview the copied predictor matrix (should match `X`).
X_copy 

Unnamed: 0,smoker_log,age_log,children_log,bmi_log
0,0.693147,2.995732,0.000000,3.363842
1,0.000000,2.944439,0.693147,3.548755
2,0.000000,3.367296,1.386294,3.526361
3,0.000000,3.526361,0.000000,3.165475
4,0.000000,3.496508,0.000000,3.397189
...,...,...,...,...
1321,0.000000,3.931826,1.386294,3.464798
1322,0.000000,2.944439,0.000000,3.494080
1323,0.000000,2.944439,0.000000,3.633631
1324,0.000000,3.091042,0.000000,3.288402


In [22]:
# Preview the target; it is a one-column DataFrame, not a Series.
y

Unnamed: 0,charges
0,16884.92
1,1725.55
2,4449.46
3,21984.47
4,3866.86
...,...
1321,10600.55
1322,2205.98
1323,1629.83
1324,2007.94


In [7]:
# Sanity check: (n_rows, n_predictors).
X.shape

(1326, 4)

In [8]:
# Sanity check: the row count must match X; the second dimension is 1 because
# the target was selected as a one-column DataFrame.
y.shape

(1326, 1)

## Feature Standardization

In [25]:
from sklearn.preprocessing import StandardScaler

# Standardizer with its default settings spelled out: centre each feature on
# its mean and scale it to unit variance, working on a copy of the input.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [26]:
# Learn the per-feature mean and standard deviation from the predictors.
# NOTE(review): the scaler is fitted on every row. If a train/test split is
# performed later (train_test_split is imported but not used in this part of
# the notebook), fit on the training rows only so test-set statistics do not
# leak into the preprocessing -- confirm against the rest of the notebook.
scaler.fit(X_copy)

In [27]:
# Apply the fitted statistics with 'transform()'. The result is stored under a
# new name, so `X_copy` itself still holds the unscaled values.
x_scaled = scaler.transform(X_copy)

In [28]:
# Inspect the standardized predictors.
x_scaled

array([[ 1.99623706, -1.66514591, -1.04373951, -0.3602442 ],
       [-0.50094251, -1.80057372,  0.19708798,  0.59626764],
       [-0.50094251, -0.68412022,  1.43791548,  0.48042674],
       ...,
       [-0.50094251, -1.80057372, -1.04373951,  1.03531074],
       [-0.50094251, -1.41350192, -1.04373951, -0.75047548],
       [ 1.99623706,  1.32205376, -1.04373951, -0.15495609]],
      shape=(1326, 4))

## Feature Selection

In [29]:
from sklearn.feature_selection import f_regression

# Univariate F-test of each standardized predictor against the target.
# Returns a pair of arrays: (F-statistics, p-values), one entry per feature.
#
# `y` is a one-column DataFrame, but f_regression expects a 1-D target of shape
# (n_samples,). Passing the 2-D frame works only through an implicit conversion
# that emits a DataConversionWarning, so the target is flattened explicitly.
# The computed statistics are unchanged.
f_regression(x_scaled, np.ravel(y))

  y = column_or_1d(y, warn=True)


(array([2124.17963289,  128.19296062,    9.68953042,   50.15223508]),
 array([1.78341958e-277, 1.96977359e-028, 1.89289367e-003, 2.30307909e-012]))

In [31]:
# Keep only the p-values (second element of the (F, p) pair returned by
# f_regression). This call uses the unscaled `X`; the saved output shows the
# same p-values as the standardized run in the cell above.
#
# The target is flattened to 1-D because `y` is a one-column DataFrame and
# f_regression otherwise emits a DataConversionWarning.
p_values = f_regression(X, np.ravel(y))[1]
p_values

  y = column_or_1d(y, warn=True)


array([1.78341958e-277, 1.96977359e-028, 1.89289367e-003, 2.30307909e-012])

In [36]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_regression

# mutual_info_regression adds a small amount of random noise to continuous
# features, so without a fixed seed the scores (and, when scores are close,
# the selected columns) can change from run to run. Binding `random_state`
# makes the selection reproducible under Restart & Run All.
mi_score = partial(mutual_info_regression, random_state=42)

# Keep the 3 predictors with the highest mutual information with the target.
selector = SelectKBest(mi_score, k=3)

# The target is flattened to 1-D: `y` is a one-column DataFrame and fitting on
# it directly emits a DataConversionWarning.
selector.fit(X_copy, np.ravel(y))

# Transform the data (keep only the selected feature columns)
X_new = selector.transform(X_copy)

# Mutual-information score of every input feature, in input-column order
feature_scores = selector.scores_

# Integer positions (into X_copy's columns) of the features that were kept
selected_indices = selector.get_support(indices=True)


  y = column_or_1d(y, warn=True)


In [38]:
# Column positions (into X_copy) of the features kept by SelectKBest.
selected_indices

array([0, 1, 2])

In [None]:
# Boolean mask over the input columns: True where SelectKBest kept the feature.
selected_mask = selector.get_support()

# The same selection expressed as integer column positions, i.e. the indices
# at which the mask is True.
selected_indices = np.flatnonzero(selected_mask)

In [39]:
# Use the real column names when X_copy is a DataFrame; otherwise fall back to
# generic "Feature i" labels, one per column.
feature_names = (
    X_copy.columns
    if isinstance(X_copy, pd.DataFrame)
    else [f"Feature {i}" for i in range(X_copy.shape[1])]
)

# One row per input feature, in the same order as the columns of X_copy, so
# row position i corresponds to column i (and to feature_scores[i]).
scores_df = pd.DataFrame({
    "Feature": feature_names,
    "Score": feature_scores
})

# Full ranking of every feature, best score first.
sorted_scores = scores_df.sort_values(by="Score", ascending=False)

# `selected_indices` are positions in the ORIGINAL column order, so they must
# be applied to the unsorted frame and the result ranked afterwards.
# Indexing the already-sorted frame with them (the previous behaviour) simply
# returned the rows at those positions in the sorted order, which is only
# correct when the selection happens to be [0, 1, ..., k-1].
selected_features = (
    scores_df
    .iloc[selected_indices]
    .sort_values(by="Score", ascending=False)
)
print(selected_features)

        Feature     Score
1       age_log  1.509494
0    smoker_log  0.362204
2  children_log  0.165517


In [None]:
# Draft (disabled): fit a linear regression on all predictors and plot the fit.
# TODO(review): either finish this cell or delete it. As written,
# plt.scatter(X, y) / plt.plot(X, predictions) cannot work because X has four
# columns while y has one; plot one predictor at a time, or plot predicted
# against actual charges instead.
# model = LinearRegression()
# model.fit(X,y)
# model.score(X,y)

# predictions = model.predict(X)

# plt.scatter(X, y)
# plt.plot(X, predictions, c = 'r')