In [144]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [76]:
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_ids = test['id']

data.head(5)

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [68]:
print(type(data))

<class 'pandas.core.frame.DataFrame'>


In [69]:
# Check for missing values
missing_values = data.isnull().sum()

# Display the columns with missing values and their counts
print("Missing Values:")
print(missing_values)

Missing Values:
id                0
Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Whole weight.1    0
Whole weight.2    0
Shell weight      0
Rings             0
dtype: int64


In [70]:
Label_encoder = LabelEncoder()

data['Sex_encoded'] = Label_encoder.fit_transform(data['Sex'])
data.drop('Sex', axis=1, inplace=True)

data.head(5)

Unnamed: 0,id,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings,Sex_encoded
0,0,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11,0
1,1,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11,0
2,2,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6,1
3,3,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10,2
4,4,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9,1


In [81]:
Label_encoder = LabelEncoder()

test['Sex_encoded'] = Label_encoder.fit_transform(test['Sex'])
test.drop('Sex', axis=1, inplace=True)

test.head(5)

Unnamed: 0,id,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Sex_encoded
0,90615,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005,2
1,90616,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275,2
2,90617,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405,2
3,90618,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235,2
4,90619,0.415,0.325,0.11,0.358,0.1575,0.067,0.105,1


In [71]:
for category, encoded_value in zip(Label_encoder.classes_, Label_encoder.transform(Label_encoder.classes_)):
    print(f"{category}: {encoded_value}")

F: 0
I: 1
M: 2


In [72]:
y = data['Rings']
X = data.drop('Rings', axis = 1)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

---

In [140]:
# Define the parameter grid
param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False]  # 'positive' is available for handling non-negative predictions
}

# Create the grid search object
grid_search = GridSearchCV(estimator=LinearRegression(), param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)

# Perform grid search on the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)


Best Parameters: {'fit_intercept': True, 'positive': False}
Best Score: 4.014944712623064


In [133]:
classifier = LinearRegression(fit_intercept=True, positive=False).fit(X_train, y_train)

In [134]:
negatives_exist = np.any(y_val < 0)
print(negatives_exist)

False


In [135]:
predictions = classifier.predict(X_val)

# Ensure predictions are non-negative
predictions = np.clip(predictions, 0, None)

# Calculate RMSLE
rmsle = np.sqrt(mean_squared_log_error(np.log1p(predictions), np.log1p(y_val)))
print("RMSLE Score:", rmsle)

RMSLE Score: 0.05198265437947654


In [136]:
submission_preds = classifier.predict(test)

# Ensure submission predictions are non-negative
submission_preds = np.clip(submission_preds, 0, None)

# Round the predictions and convert them to integers
submission_preds_rounded = np.round(submission_preds).astype(int)

In [137]:
df = pd.DataFrame({'id': test_ids.values, 
                   'Rings': submission_preds_rounded
                   })

In [138]:
df.to_csv('submission1.2.csv', index=False)

---