In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [39]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [40]:
train_data.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [41]:
test_data.drop(columns = ['id'],inplace = True)


In [42]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_data['Sex'] = label_encoder.fit_transform(train_data['Sex'])
test_data['Sex'] = label_encoder.fit_transform(test_data['Sex'])


In [44]:
test_data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,2,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005
1,2,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275
2,2,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405
3,2,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235
4,1,0.415,0.325,0.11,0.358,0.1575,0.067,0.105


In [45]:
X = train_data.drop(columns = ['id','Rings'])
y = train_data['Rings']

In [46]:
from sklearn.model_selection import train_test_split
X_train , X_valid , y_train , y_valid = train_test_split(X,y,test_size = 0.33 , random_state = 42)

### Perform standard scaling to increase the accuracy

In [50]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)



#### 1.Lasso Regression
- The Lasso is a linear model that estimates sparse coefficients. It is useful in some contexts due to its tendency to prefer solutions with fewer non-zero coefficients, effectively reducing the number of features upon which the given solution is dependent. For this reason, Lasso and its variants are fundamental to the field of compressed sensing. Under certain conditions, it can recover the exact set of non-zero coefficients (see Compressive sensing: tomography reconstruction with L1 prior (Lasso)).

In [62]:
from sklearn import linear_model
model_1 = linear_model.Lasso(alpha=0.01)
model_1.fit(X_train,y_train)

In [63]:
train_score1 = model_1.score(X_train,y_train)
val_score1 = model_1.score(X_valid , y_valid)
print("Training score:" , train_score1)
print("Validation score:" , val_score1)

Training score: 0.5989265859163799
Validation score: 0.09326674589000694


In [64]:
y_pred_1 = model_1.predict(test_data)



In [55]:
submission.head()

Unnamed: 0,id,Rings
0,90615,10
1,90616,10
2,90617,10
3,90618,10
4,90619,10


In [65]:
from sklearn.metrics import mean_squared_log_error
rmsle_1 = np.sqrt(mean_squared_log_error(y[:60411], np.maximum(y_pred_1, 0)))  # Taking maximum to avoid negative predictions

print(f"Root Mean Squared Log Error: {rmsle_1:.4f}")

Root Mean Squared Log Error: 0.3044


In [66]:
y_pred_1

array([10.15682053, 10.29882707, 10.41149183, ..., 10.61451666,
       10.89341671, 10.16129586])

### Submission data to save

In [67]:
lasso = pd.DataFrame({'id': submission.id, 'Rings': y_pred_1})
lasso.to_csv('lasso.csv', index = False)

In [68]:
lasso.head()

Unnamed: 0,id,Rings
0,90615,10.156821
1,90616,10.298827
2,90617,10.411492
3,90618,10.471027
4,90619,10.13237


### 2. Linear Regression model

In [70]:
from sklearn.linear_model import LinearRegression

# Create a linear regression model
model_2 = LinearRegression()

# Train the model
model_2.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_2 = model_2.predict(test_data)

# Calculate RMSLE
rmsle_2 = np.sqrt(mean_squared_log_error(y[:60411], np.maximum(y_pred_2, 0)))  # Taking maximum to avoid negative predictions

print(f"Root Mean Squared Log Error: {rmsle_2:.4f}")

Root Mean Squared Log Error: 0.3214




In [71]:
train_score2 = model_2.score(X_train,y_train)
val_score2 = model_2.score(X_valid , y_valid)
print("Training score:" , train_score2)
print("Validation score:" , val_score2)

Training score: 0.6006659695414169
Validation score: 0.1007898872781392


In [72]:
submit_2 = pd.DataFrame({'id': submission.id, 'Rings': y_pred_2})
submit_2.to_csv('submit_2.csv', index = False)

### 3. Ridge Regression

In [73]:
model_3 = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
model_3.fit(X_train,y_train)

In [74]:
y_pred_3 = model_3.predict(test_data)



In [75]:
rmsle_3 = np.sqrt(mean_squared_log_error(y[:60411], np.maximum(y_pred_3, 0)))  # Taking maximum to avoid negative predictions

print(f"Root Mean Squared Log Error: {rmsle_3:.4f}")

Root Mean Squared Log Error: 0.3214


In [76]:
train_score3 = model_3.score(X_train,y_train)
val_score3 = model_3.score(X_valid , y_valid)
print("Training score:" , train_score3)
print("Validation score:" , val_score3)

Training score: 0.6006659537976673
Validation score: 0.10083084605140469


In [77]:
submit_3 = pd.DataFrame({'id': submission.id, 'Rings': y_pred_1})
submit_3.to_csv('submit_3.csv', index = False)

#### 4. Ridge Regression

- Ridge regression addresses some of the problems of Ordinary Least Squares by imposing a penalty on the size of the coefficients. The ridge coefficients minimize a penalized residual sum of squares.

In [91]:
from sklearn import linear_model
model_4= linear_model.Ridge(alpha=0.001)
model_4.fit(X_train , y_train)

In [92]:
y_pred_4 = model_4.predict(test_data)
y_pred_4



array([10.83203185, 10.85523655, 10.90061707, ..., 11.31701483,
       11.75925372, 10.40318262])

In [93]:
rmsle_4 = np.sqrt(mean_squared_log_error(y[:60411], np.maximum(y_pred_4, 0)))  # Taking maximum to avoid negative predictions

print(f"Root Mean Squared Log Error: {rmsle_4:.4f}")

Root Mean Squared Log Error: 0.3214


In [94]:
train_score4 = model_4.score(X_train,y_train)
val_score4 = model_4.score(X_valid , y_valid)
print("Training score:" , train_score4)
print("Validation score:" , val_score4)

Training score: 0.6006659695414012
Validation score: 0.10078992849754687


In [95]:
submit_4 = pd.DataFrame({'id': submission.id, 'Rings': y_pred_1})
submit_4.to_csv('submit_4.csv', index = False)

### 5.Gradient Boosting Regressor

In [96]:
from sklearn.ensemble import GradientBoostingRegressor
model_5 = GradientBoostingRegressor(loss='squared_error', learning_rate=0.01, n_estimators=1000,
                                    min_samples_split=5,max_depth=10,alpha=0.09)
model_5.fit(X_train,y_train)



In [18]:
y_pred_5 = model_5.predict(test_data)

In [19]:
train_score5 = model_5.score(X_train,y_train)
val_score5 = model_5.score(X_valid , y_valid)
print("Training score:" , train_score5)
print("Validation score:" , val_score5)

Training score: 0.8411555898176964
Validation score: 0.6581063759048695


In [20]:
Gradient_boost = pd.DataFrame({'id': submission.id, 'Rings': y_pred_5})
Gradient_boost.to_csv('Gradient_boost.csv', index = False)

In [20]:
.shape

(60411, 2)

In [36]:
rmsle_5 = np.sqrt(mean_squared_log_error(y[:60411], np.maximum(y_pred_5, 0)))  # Taking maximum to avoid negative predictions

print(f"Root Mean Squared Log Error: {rmsle_5:.4f}")

Root Mean Squared Log Error: 0.3798


#### 6. XGboost Regressor

In [25]:
import xgboost as xgb


# Initialize XGBoost regressor
model_6 = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.01,max_depth = 7, eta=0.1, 
                           subsample=0.7, colsample_bytree=0.8)

# Train the model
model_6.fit(X_train, y_train)

# Make predictions
y_pred_6 = model_6.predict(test_data)

In [35]:
train_score6 = model_6.score(X_train,y_train)
val_score6 = model_6.score(X_valid , y_valid)
print("Training score:" , train_score6)
print("Validation score:" , val_score6)

Training score: 0.7243829544678793
Validation score: 0.6631722384763301


In [37]:
rmsle_6 = np.sqrt(mean_squared_log_error(y[:60411], np.maximum(y_pred_6, 0)))  # Taking maximum to avoid negative predictions

print(f"Root Mean Squared Log Error: {rmsle_6:.4f}")

Root Mean Squared Log Error: 0.3774


In [26]:
Xgboost = pd.DataFrame({'id': submission.id, 'Rings': y_pred_6})
Xgboost.to_csv('Xgboost.csv', index = False)

In [27]:
Xgboost.head()

Unnamed: 0,id,Rings
0,90615,9.890112
1,90616,9.725107
2,90617,10.152843
3,90618,10.317695
4,90619,7.588178


In [32]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_6, X_train, y_train, cv=5)

In [33]:
scores

array([0.6585794 , 0.66620362, 0.65916533, 0.66987981, 0.66545502])

#### 7. LightGBM

In [10]:
!pip install lightgbm



In [28]:
import lightgbm as lgbm
model_7 =  lgbm.LGBMRegressor(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.01,
                              n_estimators=1000)
model_7.fit(X_train, y_train,eval_set = (X_valid , y_valid),eval_metric = 'l2')
y_pred_7 = model_7.predict(test_data)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001577 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1329
[LightGBM] [Info] Number of data points in the train set: 60712, number of used features: 8
[LightGBM] [Info] Start training from score 9.708064


In [29]:
train_score7 = model_7.score(X_train,y_train)
val_score7 = model_7.score(X_valid , y_valid)
print("Training score:" , train_score7)
print("Validation score:" , val_score7)

Training score: 0.6938602253653425
Validation score: 0.6593409486195073


In [30]:
lightgb = pd.DataFrame({'id': submission.id, 'Rings': y_pred_7})
lightgb.to_csv('lightgb.csv', index = False)

In [31]:
lightgb.head()

Unnamed: 0,id,Rings
0,90615,9.849242
1,90616,9.687024
2,90617,10.148168
3,90618,10.216908
4,90619,7.553012


### 8. Catboost Regressor

In [98]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.3-cp311-cp311-win_amd64.whl (101.1 MB)
   ---------------------------------------- 0.0/101.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.1 MB 1.9 MB/s eta 0:00:53
   ---------------------------------------- 0.1/101.1 MB 1.3 MB/s eta 0:01:18
   ---------------------------------------- 0.2/101.1 MB 1.6 MB/s eta 0:01:02
   ---------------------------------------- 0.3/101.1 MB 1.5 MB/s eta 0:01:06
   ---------------------------------------- 0.3/101.1 MB 1.5 MB/s eta 0:01:08
   ---------------------------------------- 0.5/101.1 MB 1.9 MB/s eta 0:00:53
   ---------------------------------------- 0.7/101.1 MB 2.4 MB/s eta 0:00:43
   ---------------------------------------- 1.0/101.1 MB 3.2 MB/s eta 0:00:32
    --------------------------------------- 1.5/

In [101]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor

In [102]:
model = CatBoostRegressor()
parameters = {'depth': [6,8,10],
              'learning_rate':[0.01 , 0.05 , 0.1],
              'iterations': [200,150]}
grid = GridSearchCV(estimator = model,
                   param_grid = parameters , cv=2,
                   n_jobs = -1)
grid.fit(X_train,y_train)

7 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\catboost\core.py", line 5807, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\catboost\core.py"

0:	learn: 3.0084674	total: 422ms	remaining: 1m 23s
1:	learn: 2.8685084	total: 622ms	remaining: 1m 1s
2:	learn: 2.7435731	total: 657ms	remaining: 43.2s
3:	learn: 2.6359908	total: 693ms	remaining: 34s
4:	learn: 2.5404985	total: 704ms	remaining: 27.5s
5:	learn: 2.4585274	total: 714ms	remaining: 23.1s
6:	learn: 2.3869963	total: 726ms	remaining: 20s
7:	learn: 2.3253698	total: 737ms	remaining: 17.7s
8:	learn: 2.2717957	total: 750ms	remaining: 15.9s
9:	learn: 2.2282199	total: 761ms	remaining: 14.5s
10:	learn: 2.1881551	total: 775ms	remaining: 13.3s
11:	learn: 2.1526975	total: 789ms	remaining: 12.4s
12:	learn: 2.1232141	total: 824ms	remaining: 11.9s
13:	learn: 2.0981090	total: 834ms	remaining: 11.1s
14:	learn: 2.0751027	total: 845ms	remaining: 10.4s
15:	learn: 2.0548890	total: 857ms	remaining: 9.86s
16:	learn: 2.0384562	total: 913ms	remaining: 9.82s
17:	learn: 2.0241102	total: 923ms	remaining: 9.33s
18:	learn: 2.0101246	total: 946ms	remaining: 9.01s
19:	learn: 1.9989180	total: 957ms	remaining:

In [103]:
grid.best_params_

{'depth': 8, 'iterations': 200, 'learning_rate': 0.1}

In [104]:
model_8 = CatBoostRegressor(depth = 8 , iterations = 200 , learning_rate = 0.1)
model_8.fit(X_train,y_train)

0:	learn: 3.0084674	total: 14.6ms	remaining: 2.92s
1:	learn: 2.8685084	total: 26.8ms	remaining: 2.65s
2:	learn: 2.7435731	total: 38.7ms	remaining: 2.54s
3:	learn: 2.6359908	total: 54.1ms	remaining: 2.65s
4:	learn: 2.5404985	total: 66.4ms	remaining: 2.59s
5:	learn: 2.4585274	total: 78.5ms	remaining: 2.54s
6:	learn: 2.3869963	total: 90.8ms	remaining: 2.5s
7:	learn: 2.3253698	total: 102ms	remaining: 2.46s
8:	learn: 2.2717957	total: 114ms	remaining: 2.43s
9:	learn: 2.2282199	total: 126ms	remaining: 2.4s
10:	learn: 2.1881551	total: 140ms	remaining: 2.4s
11:	learn: 2.1526975	total: 151ms	remaining: 2.37s
12:	learn: 2.1232141	total: 163ms	remaining: 2.35s
13:	learn: 2.0981090	total: 173ms	remaining: 2.3s
14:	learn: 2.0751027	total: 184ms	remaining: 2.27s
15:	learn: 2.0548890	total: 196ms	remaining: 2.25s
16:	learn: 2.0384562	total: 207ms	remaining: 2.23s
17:	learn: 2.0241102	total: 219ms	remaining: 2.21s
18:	learn: 2.0101246	total: 231ms	remaining: 2.2s
19:	learn: 1.9989180	total: 242ms	remai

<catboost.core.CatBoostRegressor at 0x21743e45950>

In [106]:
test_data = scaler.fit_transform(test_data)

In [107]:
test_data

array([[ 1.19030653,  1.08472313,  0.74935358, ...,  1.35772434,
         1.42058358,  0.57288709],
       [ 1.19030653,  0.53203837,  0.59545832, ...,  0.67218583,
         0.49722834,  0.37646943],
       [ 1.19030653,  0.36198152,  0.18507096, ...,  0.05520117,
         0.14972906,  0.1107279 ],
       ...,
       [-0.07089855,  0.6170668 ,  0.39026464, ...,  0.25351767,
         0.3036216 ,  0.2994429 ],
       [-1.33210363,  1.21226577,  1.26233779, ...,  1.21082323,
         1.86736838,  1.60889394],
       [-1.33210363, -0.74338801, -0.63570377, ..., -0.92413956,
        -0.77859046, -1.04467008]])

In [108]:
y_pred_8 = model_8.predict(test_data)

In [109]:
y_pred_8

array([ 9.39642707,  9.80348662, 10.12645821, ..., 12.12335518,
       13.18912194,  9.64037322])

In [110]:
train_score8 = model_8.score(X_train,y_train)
val_score8 = model_8.score(X_valid , y_valid)
print("Training score:" , train_score8)
print("Validation score:" , val_score8)

Training score: 0.6888994909994421
Validation score: 0.0992766754052089


In [111]:
catboost = pd.DataFrame({'id': submission.id, 'Rings': y_pred_8})
catboost.to_csv('catboost.csv', index = False)

In [112]:
catboost.head()

Unnamed: 0,id,Rings
0,90615,9.396427
1,90616,9.803487
2,90617,10.126458
3,90618,9.990716
4,90619,7.629881
