Task 1 Load and split dataset

In [1]:
import pandas as pd
import kagglehub
# Download latest version
path = kagglehub.dataset_download("camnugent/california-housing-prices")
print("Path to dataset files:",path)


Using Colab cache for faster access to the 'california-housing-prices' dataset.
Path to dataset files: /kaggle/input/california-housing-prices


In [2]:
import os
housing_data_path = os.path.join(path,"housing.csv")
df =pd.read_csv(housing_data_path)
df.info()
df.isnull().sum()
df=df.dropna(inplace=False)
df.isnull().sum()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64


In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [16]:
# one hot encoding for ocean proximity
df=pd.get_dummies(df,columns=['ocean_proximity'],drop_first=True)
print('features after encoding')
df.head()

features after encoding


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,False,True,False


In [17]:
# defining features and target value
X=df.drop(columns='median_house_value')
y=df['median_house_value']

In [18]:
import sklearn.model_selection as model_selection
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2, random_state=42)
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (16346, 12)
Test set shape: (4087, 12)


In [28]:
# feature scaling
from sklearn.preprocessing import StandardScaler
x_scaler = StandardScaler()
X_train_scaled = x_scaler.fit_transform(X_train)
X_test_scaled = x_scaler.transform(X_test)


Task2 Regression Task

Baseline Model

In [29]:
from sklearn.linear_model import LinearRegression
linear_model= LinearRegression()
linear_model.fit(X_train_scaled,y_train)

In [31]:
y_train_pred=linear_model.predict(X_train_scaled)
y_test_pred=linear_model.predict(X_test_scaled)

In [32]:
from numpy import test
from sklearn.metrics import mean_squared_error
# calculating MSE on scaled data
train_mse= mean_squared_error(y_train,y_train_pred)
test_mse= mean_squared_error(y_test,y_test_pred)
print(f'MSE on train data: {train_mse:3f}')
print(f'MSE on test data: {test_mse:4f}')

MSE on train data: 4690511174.839978
MSE on test data: 4802173538.604160


Hyperparameter tuning

In [33]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
alpha = {'alpha' : [0.01, 0.1, 1, 10, 100]}

#ridge
ridge = Ridge()
ridge_cv = GridSearchCV(ridge, alpha, cv=5, scoring='neg_mean_squared_error')
ridge_cv.fit(X_train_scaled, y_train)
best_ridge = ridge_cv.best_estimator_
print("Best Ridge alpha:", ridge_cv.best_params_)

#evaluate on test set
ridge_test_pred = best_ridge.predict(X_test_scaled)
ridge_test_mse = mean_squared_error(y_test, ridge_test_pred)
print("Ridge Test MSE:", ridge_test_mse)

#lasso
from sklearn.linear_model import Lasso
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
lasso = Lasso(max_iter = 50000)
lasso_cv = GridSearchCV(lasso, alpha, cv=5, scoring='neg_mean_squared_error')
lasso_cv.fit(X_train_scaled, y_train)
best_lasso = lasso_cv.best_estimator_
print("Best Lasso alpha:", lasso_cv.best_params_)

#evaluate on test set
lasso_test_pred = best_lasso.predict(X_test_scaled)
lasso_test_mse = mean_squared_error(y_test, lasso_test_pred)
print("Lasso Test MSE:", lasso_test_mse)


Best Ridge alpha: {'alpha': 10}
Ridge Test MSE: 4802318242.824235
Best Lasso alpha: {'alpha': 100}
Lasso Test MSE: 4803593426.282537


Regularization Experiments

In [34]:
coef_comparision = pd.DataFrame({'Feature': X.columns, 'Ridge Coeff': best_ridge.coef_, 'Lasso Coeff': best_lasso.coef_})
print(coef_comparision)
#evaluation of both models on training and testing mse
train_mse_ridge = mean_squared_error(y_train, best_ridge.predict(X_train_scaled))
test_mse_ridge = mean_squared_error(y_test, best_ridge.predict(X_test_scaled))
train_mse_lasso = mean_squared_error(y_train, best_lasso.predict(X_train_scaled))
test_mse_lasso = mean_squared_error(y_test, best_lasso.predict(X_test_scaled))
print("MSE on training ridge", train_mse_ridge)
print("MSE on training lasso", train_mse_lasso)
print("MSE on testing ridge", test_mse_ridge)
print("MSE on testing lasso", test_mse_lasso)

                       Feature   Ridge Coeff   Lasso Coeff
0                    longitude -53244.020320 -51459.125059
1                     latitude -53633.952168 -51878.857592
2           housing_median_age  13609.131922  13540.805008
3                  total_rooms -13257.046721 -11588.732400
4               total_bedrooms  42122.318053  40794.913867
5                   population -41003.666957 -40420.490483
6                   households  16739.263802  15886.616760
7                median_income  74439.102523  74065.302494
8       ocean_proximity_INLAND -18561.319500 -19098.830812
9       ocean_proximity_ISLAND   2900.246399   2812.287006
10    ocean_proximity_NEAR BAY  -1919.120335  -1654.694183
11  ocean_proximity_NEAR OCEAN   1128.824631   1167.657011
MSE on training ridge 4690611287.657703
MSE on training lasso 4691633517.3126745
MSE on testing ridge 4802318242.824235
MSE on testing lasso 4803593426.282537


Effect of regularization on the Bias-Variance Tradeoff


Regularization reduces variance by shrinking regression coefficients, thereby preventing overfitting and improving generalization, as observed through lower test MSE. Excessive regularization increases bias and leads to underfitting.

Part 2 Classification Task

Load and split dataset

In [37]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
data = load_breast_cancer()
type(data)
data.keys()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
df.info()
df['target'].value_counts()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,357
0,212


In [38]:
from sklearn.model_selection import train_test_split
X_c, y_c = load_breast_cancer(return_X_y=True)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_c, test_size=0.2, random_state=42)
print (X_train_c.shape)
print (X_test_c.shape)

(455, 30)
(114, 30)


In [39]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_c_scaled = scaler.fit_transform(X_train_c)
X_test_c_scaled = scaler.transform(X_test_c)

Baseline model

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#training the model
log = LogisticRegression(max_iter = 1000)
log.fit(X_train_c_scaled, y_train_c)

feature_names_c = data.feature_names

#making predictions
y_train_c_pred = log.predict(X_train_c_scaled)
y_test_c_pred = log.predict(X_test_c_scaled)

#model coefficients
coef_c = pd.DataFrame({
    'feature': feature_names_c,
    'coefficient': log.coef_[0]}).sort_values(by='coefficient',ascending = False)

coef_c.head()

#computing accuracy
train_accuracy_c = accuracy_score(y_train_c, y_train_c_pred)
print("Training Accuracy:", train_accuracy_c)

test_accuracy_c = accuracy_score(y_test_c, y_test_c_pred)
print("Test Accuracy:", test_accuracy_c)


Training Accuracy: 0.9868131868131869
Test Accuracy: 0.9736842105263158


Hyperparameter tuning

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

param_grid_c = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# ridge
log_ridge_c = LogisticRegression(penalty='l2',solver='liblinear',max_iter=1000)

# GridSearchCV
log_ridge_cv = GridSearchCV(estimator=log_ridge_c,param_grid=param_grid_c,cv=5, scoring='accuracy')
log_ridge_cv.fit(X_train_c_scaled, y_train_c)
best_log_ridge_c = log_ridge_cv.best_estimator_
print("Best ridge parameter:", log_ridge_cv.best_params_)

#lasso
log_lasso_c = LogisticRegression(penalty='l1',solver='liblinear',max_iter=1000)

# GridSearchCV
log_lasso_cv = GridSearchCV(estimator=log_lasso_c,param_grid=param_grid_c,cv=5,scoring='accuracy')
log_lasso_cv.fit(X_train_c_scaled, y_train_c)
best_log_lasso_c = log_lasso_cv.best_estimator_
print("Best Lasso parameter:", log_lasso_cv.best_params_)




Best ridge parameter: {'C': 0.1}
Best Lasso parameter: {'C': 1}


Regularization Experiments

In [42]:
#evaluating on accuracy ridge:
y_test_c_pred_ridge = best_log_ridge_c.predict(X_test_c_scaled)
test_accuracy_ridge_c = accuracy_score(y_test_c, y_test_c_pred_ridge)
print("Ridge Test Accuracy:", test_accuracy_ridge_c)
y_train_c_pred_ridge = best_log_ridge_c.predict(X_train_c_scaled)
train_accuracy_ridge_c = accuracy_score(y_train_c, y_train_c_pred_ridge)
print("Ridge Train Accuracy:", train_accuracy_ridge_c)

#evaluating on accuracy lasso:
y_test_c_pred_lasso = best_log_lasso_c.predict(X_test_c_scaled)
test_accuracy_lasso_c = accuracy_score(y_test_c, y_test_c_pred_lasso)
print("Lasso Test Accuracy:", test_accuracy_lasso_c)
y_train_c_pred_lasso = best_log_lasso_c.predict(X_train_c_scaled)
train_accuracy_lasso_c = accuracy_score(y_train_c, y_train_c_pred_lasso)
print("Lasso Train Accuracy:", train_accuracy_lasso_c)


Ridge Test Accuracy: 0.9912280701754386
Ridge Train Accuracy: 0.9824175824175824
Lasso Test Accuracy: 0.9736842105263158
Lasso Train Accuracy: 0.989010989010989


Effect of Regularization on the Bias-Variance Tradeoff

Regularization reduces variance by constraining model coefficients, preventing overfitting and improving classification accuracy on unseen data. However, overly strong regularization increases bias and may reduce accuracy.