In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn # machine learning functions

# Load the training data; the path is relative to the notebook's working directory.
train_csv_path = '../input/soni2022/train.csv'
df = pd.read_csv(train_csv_path)
df

In [None]:
# Convert categorical data to numerical data: 'M' -> 1, 'I' -> 0, anything else -> -1.
# The mapping is applied to the whole column at once, so it does not depend on the
# frame having a default 0..n-1 index (the previous row loop mixed positional and
# label-based access).
# The integer keys map already-encoded values to themselves, so re-running this cell
# leaves the column unchanged instead of collapsing every row to -1.
sex_codes = {'M': 1, 'I': 0, 1: 1, 0: 0, -1: -1}
# Values missing from the mapping (including NaN) become NaN in .map and are then
# filled with -1, matching the old `else` branch.
df['Sex'] = df['Sex'].map(sex_codes).fillna(-1).astype(int)
df

In [None]:
import seaborn as sns

# 'Sex' is numerically encoded at this point (M -> 1, I -> 0, anything else -> -1).
# Here we notice that for sex, there is a significantly different trend for animals
# of sex 'I' than for animals of sex 'M' or 'F'.
sns.lmplot(
    data=df,
    x="Shell weight",
    y="Rings",
    hue="Sex",
);

In [None]:
# Create a new data column to capture the different correlation that appears between
# shell weight and animals of sex "I" (encoded as 0): it holds the shell weight for
# those rows and 0 for every other row.
# NOTE: despite the column name, the value stored is the shell weight, not the diameter.
df['sex_diam'] = df['Shell weight'].where(df['Sex'] == 0, 0.0)

# The ratio between shell weight and diameter may affect ring count, for example, a
# high shell weight relative to diameter may indicate high growth, or lots of rings.
# Computed column-wise; a zero diameter would yield inf/NaN here rather than raising.
df['sw/d'] = df['Shell weight'] / df['Diameter']

df

In [None]:
# Model inputs: the raw measurements plus the two engineered columns.
features = [
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shucked weight',
    'Viscera weight',
    'Shell weight',
    'sex_diam',
    'sw/d',
]
# Prediction target.
labels = ['Rings']

x = df[features]
# Flatten the single-column label frame into a 1-D array.
y = df[labels].to_numpy().ravel()
from sklearn.feature_selection import mutual_info_regression

# Mutual information tells us how much information each feature provides about the
# target; higher MI means more information gained.
def make_mi_scores(X, y, discrete_features):
    """Return the mutual-information score of every column of X against y.

    Parameters
    ----------
    X : DataFrame of candidate features; its column names become the result index.
    y : target values, forwarded unchanged to mutual_info_regression.
    discrete_features : forwarded unchanged to mutual_info_regression.

    Returns
    -------
    pandas Series named "MI Scores", sorted from highest to lowest score.
    """
    raw_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    return pd.Series(raw_scores, name="MI Scores", index=X.columns).sort_values(ascending=False)


mi_scores = make_mi_scores(x, y, discrete_features='auto')
mi_scores  # show the features with their MI scores, best first

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Hold out part of the labelled data for evaluation. The split is seeded so that
# Restart & Run All reproduces the same partition (and therefore the same score).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

# Overall, the data correlated poorly with number of rings.
# NOTE(review): the original author's comment was cut off at this point; the rationale
# for the chosen hyper-parameters is not recorded.
# NOTE(review): per the scikit-learn docs, learning_rate='invscaling' is only used with
# solver='sgd'; the solver is left at its default here, so confirm this is intended.
neural_net = MLPClassifier(
    max_iter=500,
    learning_rate_init=0.002,
    learning_rate='invscaling',
    hidden_layer_sizes=(100, 50, 100, 25),
    random_state=1,
)
# Fit on the training split only: fitting on all of x, y would let the model see the
# rows in x_test and make any score computed on that split over-optimistic.
# If a final model trained on every labelled row is wanted for the submission, refit
# on (x, y) after the evaluation has been recorded.
neural_net.fit(x_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Get an idea of how our model performs on the x_test / y_test split.
y_pred = neural_net.predict(x_test)
# accuracy_score expects (y_true, y_pred); pass them in that order so the roles are
# unambiguous (the value itself is the fraction of exactly matching labels).
score = accuracy_score(y_test, y_pred)
print(score)

In [None]:
# After training, we follow the same process for testing
test_df = pd.read_csv('../input/soni2022/test.csv')

# Encode 'Sex' exactly as for the training data: 'M' -> 1, 'I' -> 0, anything else -> -1.
# (This cell previously mapped 'F' to 0, which made code 0 mean something different at
# prediction time than it did during training and corrupted the 'sex_diam' feature.)
test_sex_codes = {'M': 1, 'I': 0}
test_df['Sex'] = test_df['Sex'].map(test_sex_codes).fillna(-1).astype(int)

# Shell weight for rows of sex 'I' (code 0), 0 for every other row.
test_df['sex_diam'] = test_df['Shell weight'].where(test_df['Sex'] == 0, 0.0)

# Ratio of shell weight to diameter, computed column-wise.
test_df['sw/d'] = test_df['Shell weight'] / test_df['Diameter']


In [None]:
# Predict ring counts for the test rows using the same feature columns as in training.
x_test_df = test_df[features]
y_pred_test = neural_net.predict(x_test_df)

# Assemble the submission table (id + predicted Rings) with a fresh 0..n-1 index.
y_pred_df = pd.DataFrame({'id': test_df['id'], 'Rings': y_pred_test}).reset_index(drop=True)

# Write the submission file without the index column.
y_pred_df.to_csv('submission.csv', index=False)

In [None]:
# Show the submission table using the notebook's rich DataFrame display
# (a bare last expression) instead of a plain-text print.
y_pred_df