In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

# Load the fish measurements; 'Weight' is the regression target and every
# other column (including the categorical 'Species') is a feature.
df = pd.read_csv('fish_train.csv')

y = df['Weight']
X = df.drop(columns=['Weight'])

# Hold out 20% of the rows. The split is stratified on Species and uses a
# fixed seed so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=X['Species'],
    test_size=0.2,
    random_state=85,
)

# Mean of 'Width' over the training rows, rounded to three decimals.
mean_width = round(X_train['Width'].mean(), 3)

# Absolute pairwise correlations between all training columns except Species.
numeric_cols = X_train.drop(columns=['Species']).columns
corr = X_train[numeric_cols].corr().abs()

# Hide the diagonal and the upper triangle so every pair is listed only once.
mask = np.triu(np.ones_like(corr, dtype=bool))
corr_vals = corr.where(~mask)

# Rank the remaining pairs from strongest to weakest and take the top one.
ranked_pairs = corr_vals.unstack().sort_values(ascending=False).dropna()
col1, col2 = ranked_pairs.index[0]

# Third member: among the other columns, the one most correlated with col1
# (its correlation with col2 is not taken into account).
third = (
    corr.loc[col1]
    .drop([col1, col2])
    .sort_values(ascending=False)
    .index[0]
)

# Alphabetically ordered list of the three selected column names.
triple = sorted([col1, col2, third])

# Fit a 3-component PCA on the selected columns, using training rows only.
pca = PCA(n_components=3).fit(X_train[triple])

# Fraction of the variance explained by the first principal component.
expl_pc1 = round(pca.explained_variance_ratio_[0], 3)

# Replace the three selected columns with their projection onto the first
# principal component, stored as a new 'Lengths' column. The test rows are
# projected with the PCA fitted on the training rows.
X_train_pca = (
    X_train
    .assign(Lengths=pca.transform(X_train[triple])[:, 0])
    .drop(columns=triple)
)
X_test_pca = (
    X_test
    .assign(Lengths=pca.transform(X_test[triple])[:, 0])
    .drop(columns=triple)
)

# Linear regression on every remaining column except the categorical Species.
train_no_species = X_train_pca.drop(columns=['Species'])
test_no_species = X_test_pca.drop(columns=['Species'])

model_base = LinearRegression().fit(train_no_species, y_train)
r2_len = round(r2_score(y_test, model_base.predict(test_no_species)), 3)

# The three length columns are excluded from this model, so they are removed
# up front instead of being cubed and then thrown away.
dropped = ['Length1', 'Length2', 'Length3']

X_train_cube = X_train.copy().drop(dropped, axis=1)
X_test_cube = X_test.copy().drop(dropped, axis=1)

# Cube the remaining size measurements in both splits.
for col in ['Width', 'Height']:
    if col in X_train_cube.columns:
        X_train_cube[col] = X_train_cube[col] ** 3
        X_test_cube[col] = X_test_cube[col] ** 3

# Mean of the cubed 'Width' over the training rows, rounded to three decimals.
mean_width_cube = round(X_train_cube['Width'].mean(), 3)

# One-hot encode the categorical column(s), keeping every level.
# The test frame is encoded separately, so it is re-aligned to the training
# columns: a level absent from the test rows becomes an all-zero column, a
# level unseen in training is discarded, and the column order is forced to
# match what the model was fitted on.
X_train_cube = pd.get_dummies(X_train_cube, drop_first=False)
X_test_cube = pd.get_dummies(X_test_cube, drop_first=False).reindex(
    columns=X_train_cube.columns, fill_value=0
)

model_cube = LinearRegression().fit(X_train_cube, y_train)
r2_cube = round(r2_score(y_test, model_cube.predict(X_test_cube)), 3)

# Same data without the three length columns; 'Weight' remains the target.
# NOTE: this rebinds the notebook-level names X and y.
df2 = df.drop(columns=['Length1', 'Length2', 'Length3'])
X = df2.drop('Weight', axis=1)
y = df2['Weight']

# Same split parameters as before (20% test, seed 85, stratified on Species).
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, test_size=0.2, random_state=85, stratify=X['Species']
)

# --- Variant 1: one-hot encoding that keeps every level -------------------
# The test frame is encoded on its own and then re-aligned to the training
# columns, so a level missing from (or extra in) the test rows cannot shift
# or change the set of columns handed to the fitted model.
X_train2_oh = pd.get_dummies(X_train2, drop_first=False)
X_test2_oh = pd.get_dummies(X_test2, drop_first=False).reindex(
    columns=X_train2_oh.columns, fill_value=0
)

model_cat = LinearRegression().fit(X_train2_oh, y_train2)
r2_cat = round(r2_score(y_test2, model_cat.predict(X_test2_oh)), 3)

# --- Variant 2: one-hot encoding with the first level dropped -------------
# The baseline (dropped) level is decided by the training frame only. The
# test frame is encoded with all levels and then reduced to the training
# columns; calling drop_first=True on the test frame independently could
# drop a different level if the first one is absent from the test rows.
X_train2_oh2 = pd.get_dummies(X_train2, drop_first=True)
X_test2_oh2 = pd.get_dummies(X_test2, drop_first=False).reindex(
    columns=X_train2_oh2.columns, fill_value=0
)

model_cat2 = LinearRegression().fit(X_train2_oh2, y_train2)
r2_cat2 = round(r2_score(y_test2, model_cat2.predict(X_test2_oh2)), 3)

# Cell output: training mean of Width, the selected column triple, variance
# share of the first principal component, R^2 of the 'Lengths' model, training
# mean of cubed Width, and the R^2 scores of the cubed / one-hot models.
(mean_width, triple, expl_pc1, r2_len,
 mean_width_cube, r2_cube, r2_cat, r2_cat2)

(np.float64(4.555),
 ['Length1', 'Length2', 'Length3'],
 np.float64(0.996),
 0.825,
 np.float64(130.894),
 0.921,
 0.853,
 0.853)

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Baseline: linear regression on every column except the target 'Weight'
# and the categorical 'Species'.
df = pd.read_csv('fish_train.csv')

y = df['Weight']
X = df.drop(columns=['Weight', 'Species'])

# Species is excluded from the features but is still used to stratify the
# 20% hold-out split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=df['Species'],
    test_size=0.2,
    random_state=85,
)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# R^2 on the held-out rows, rounded to three decimals and shown as cell output.
r2 = round(r2_score(y_test, y_pred), 3)
r2


0.833

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

# Reload the data and recreate the stratified 80/20 split (seed 85).
df = pd.read_csv("fish_train.csv")

y = df['Weight']
X = df.drop(columns=['Weight'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=X['Species'],
    test_size=0.2,
    random_state=85,
)

# Absolute correlations between the non-Species training columns; only the
# strict lower triangle is kept so each pair is ranked once.
numeric = X_train.drop(columns=['Species'])
corr = numeric.corr().abs()
mask = np.triu(np.ones_like(corr, dtype=bool))
ranked_pairs = corr.where(~mask).unstack().sort_values(ascending=False).dropna()
top = ranked_pairs.index[0]

# Strongest pair, plus the remaining column most correlated with its first member.
c1, c2 = top
c3 = corr[c1].drop([c1, c2]).sort_values(ascending=False).index[0]
cols = [c1, c2, c3]

# PCA fitted on the training rows of the three selected columns.
pca = PCA(n_components=3)
pca.fit(X_train[cols])

# Swap the three columns for their first principal component ('Lengths').
X_train2 = (
    X_train
    .assign(Lengths=pca.transform(X_train[cols])[:, 0])
    .drop(columns=cols)
)
X_test2 = (
    X_test
    .assign(Lengths=pca.transform(X_test[cols])[:, 0])
    .drop(columns=cols)
)

# Cube the three size features in both splits.
cube_cols = ["Width", "Height", "Lengths"]
for frame in (X_train2, X_test2):
    frame[cube_cols] = frame[cube_cols] ** 3

# Linear regression on the cubed features, Species excluded.
train_no_species = X_train2.drop(columns=['Species'])
test_no_species = X_test2.drop(columns=['Species'])

model = LinearRegression().fit(train_no_species, y_train)
r2_cube = round(r2_score(y_test, model.predict(test_no_species)), 3)
r2_cube


0.932

In [8]:
# One-hot encode Species (all levels kept) on top of the cubed features.
# NOTE: X_train2 / X_test2 / y_train / y_test are taken from the notebook
# namespace. The names X_train2 / X_test2 are also bound by the In [6] cell
# to different frames, so this cell must be run after the cell that builds
# the cubed 'Lengths' feature.
X_train_oh = pd.get_dummies(X_train2, drop_first=False)

# Encode the test rows separately, then align them to the training columns:
# missing levels become all-zero columns, unseen levels are discarded, and
# the column order matches the fitted model.
X_test_oh = pd.get_dummies(X_test2, drop_first=False).reindex(
    columns=X_train_oh.columns, fill_value=0
)

model_oh = LinearRegression().fit(X_train_oh, y_train)
r2_oh = round(r2_score(y_test, model_oh.predict(X_test_oh)), 3)
r2_oh


0.951

In [9]:
# One-hot encode Species with the first level dropped (used as the baseline).
# NOTE: X_train2 / X_test2 / y_train / y_test are taken from the notebook
# namespace; the In [6] cell binds X_train2 / X_test2 to different frames,
# so run this after the cell that builds the cubed 'Lengths' feature.
X_train_oh2 = pd.get_dummies(X_train2, drop_first=True)

# The baseline level is determined by the training frame only. The test rows
# are encoded with every level and then reduced to the training columns, so
# an independently chosen "first" level in the test frame cannot misalign
# the features.
X_test_oh2 = pd.get_dummies(X_test2, drop_first=False).reindex(
    columns=X_train_oh2.columns, fill_value=0
)

model_oh2 = LinearRegression().fit(X_train_oh2, y_train)
r2_oh2 = round(r2_score(y_test, model_oh2.predict(X_test_oh2)), 3)
r2_oh2


0.951