In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Google Sheets CSV-export endpoint (tqx=out:csv, sheet "users") that holds the dataset.
sheet_csv_url = 'https://docs.google.com/spreadsheets/d/1jfU2oFSfhX1ywUbqETExDJuztO95r3h6pbWAm7xpwNY/gviz/tq?tqx=out:csv&sheet=users'
df = pd.read_csv(sheet_csv_url)


In [3]:
# Show the loaded frame; the notebook renders a truncated view (first and last rows).
# NOTE(review): `df.head()` would be a lighter preview if the tail rows are not needed.
df

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [4]:
# Report the dataset dimensions. Every column except the `rings` target counts as a
# feature, hence the "- 1". The shape is unpacked once instead of being evaluated as a
# bare expression (which displayed nothing because it was not the cell's last line).
n_rows, n_cols = df.shape
print(f'The rows represent {n_rows} observations, and the columns represent {n_cols - 1} features and 1 target variable.')

The rows represent 4177 observations, and the columns represent 8 features and 1 target variable.


In [5]:
# Count rows that exactly repeat an earlier row, then report the total.
n_duplicate_rows = df.duplicated().sum()
print(f'There are {n_duplicate_rows} duplicate rows.')

There are 0 duplicate rows.


In [6]:
# Inspect column dtypes and non-null counts. The output shows all 9 columns fully
# populated, with `sex` as the only object-typed (non-numeric) column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [7]:
# Summary statistics for the numeric columns only (the object-typed `sex` column is left out).
# NOTE(review): the output shows `height` with a min of 0.0 and a max of 1.13, far above
# its 75% quantile of 0.165 -- worth checking for invalid or outlier rows.
df.describe(include="number")

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [8]:
# Target vector: the `rings` column. Feature matrix: every other column.
y = df['rings']
X = df.drop(columns=['rings'])
# Preview the feature matrix.
X.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [9]:
# Preview the target values; they keep the same row index as X.
y.head()

0    15
1     7
2     9
3    10
4     7
Name: rings, dtype: int64

In [10]:
# Hold out a test set before any preprocessing is fitted. No test_size is passed, so the
# library default applies (the training frame ends up with 3132 of the 4177 rows).
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [11]:
# Preview the training features; the rows keep their original (now shuffled) index labels.
X_train.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight
3823,F,0.615,0.455,0.135,1.059,0.4735,0.263,0.274
3956,F,0.515,0.395,0.14,0.686,0.281,0.1255,0.22
3623,M,0.66,0.53,0.175,1.583,0.7395,0.3505,0.405
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
2183,M,0.495,0.4,0.155,0.8085,0.2345,0.1155,0.35


In [12]:
# Preview the training targets; the index labels match those of X_train.
y_train.head()

3823     9
3956    12
3623    10
0       15
2183     6
Name: rings, dtype: int64

In [13]:
# Selectors that pick columns by dtype when the transformer is fitted:
# object-typed columns are treated as categorical, numeric columns as continuous.
# NOTE(review): this relies on string columns being stored with object dtype (as
# df.info() shows for `sex`) -- confirm against the pandas version in use.
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

In [14]:
# Standardises the numeric features (the transformed training columns come out with
# mean ~0 and std ~1).
scaler = StandardScaler()

In [15]:
# One-hot encodes the categorical features. handle_unknown='ignore' asks the encoder
# not to raise on categories that were not seen during fit.
ohe = OneHotEncoder(handle_unknown='ignore')

In [16]:
# (transformer, column-selector) pairs in the form make_column_transformer expects.
num_tuple = (scaler, num_selector)
cat_tuple = (ohe, cat_selector)


In [17]:
# Combine both pipelines: numeric columns are scaled first, then categorical columns are
# one-hot encoded. remainder='passthrough' keeps any column matched by neither selector
# instead of dropping it.
col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder = 'passthrough')


In [18]:
# Fit on the training split only, so scaling statistics and category lists are not
# learned from the held-out test rows.
col_transformer.fit(X_train)


In [19]:
# Apply the transformer (fitted on X_train) to both splits; the test set is transformed
# with the training-set parameters, not refitted.
X_train_processed = col_transformer.transform(X_train)
X_test_processed = col_transformer.transform(X_test)


In [20]:
# Wrap the processed training matrix in a DataFrame for inspection.
# No column names or index are passed, so columns are labelled 0..9 and the rows get a
# fresh 0..n-1 index that no longer matches y_train's original index labels.
# NOTE(review): judging by the describe() output, columns 0-6 look like the scaled
# numeric features and 7-9 the one-hot `sex` indicators -- confirm before relying on it.
X_train_df = pd.DataFrame(X_train_processed)
X_train_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.749291,0.464226,-0.118869,0.457447,0.499098,0.743973,0.241135,1.0,0.0,0.0
1,-0.090254,-0.144654,-0.001647,-0.301655,-0.364269,-0.51404,-0.145838,1.0,0.0,0.0
2,1.127086,1.225326,0.81891,1.523852,1.692114,1.544526,1.179902,0.0,0.0,1.0
3,-0.59398,-0.449095,-1.056649,-0.651696,-0.617673,-0.738195,-0.647469,0.0,0.0,1.0
4,-0.258163,-0.093914,0.35002,-0.052352,-0.572823,-0.605532,0.785763,0.0,0.0,1.0


In [21]:
# Check the processed frame: the output shows 10 float64 columns with no missing values.
X_train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3132 entries, 0 to 3131
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       3132 non-null   float64
 1   1       3132 non-null   float64
 2   2       3132 non-null   float64
 3   3       3132 non-null   float64
 4   4       3132 non-null   float64
 5   5       3132 non-null   float64
 6   6       3132 non-null   float64
 7   7       3132 non-null   float64
 8   8       3132 non-null   float64
 9   9       3132 non-null   float64
dtypes: float64(10)
memory usage: 244.8 KB


In [22]:
# Sanity-check the preprocessing: scaled columns should show mean ~0 and std ~1, and the
# one-hot columns should only take the values 0 and 1.
# NOTE(review): column 2 has a max of ~23 standard deviations -- presumably the `height`
# outlier seen in the raw data; verify.
X_train_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0
mean,-4.072236e-16,2.291342e-16,1.882984e-16,-3.176117e-17,1.542685e-16,-1.724178e-16,2.268655e-17,0.316411,0.316731,0.366858
std,1.00016,1.00016,1.00016,1.00016,1.00016,1.00016,1.00016,0.46515,0.465276,0.482024
min,-3.784249,-3.594976,-3.283875,-1.69368,-1.620076,-1.657688,-1.711644,0.0,0.0,0.0
25%,-0.6359576,-0.6013146,-0.587759,-0.7941551,-0.7858616,-0.8068136,-0.7907922,0.0,0.0,0.0
50%,0.1616096,0.1597857,0.1155756,-0.06049265,-0.1018954,-0.09089006,-0.03834537,0.0,0.0,0.0
75%,0.7492908,0.7179259,0.5844653,0.65638,0.6471037,0.6707794,0.6424399,1.0,1.0,1.0
max,2.42838,2.443087,23.20839,4.052494,5.049155,5.291117,5.479598,1.0,1.0,1.0


In [23]:
# Raw (unscaled) training statistics, for comparison with the processed frame.
X_train.describe()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight
count,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0
mean,0.52575,0.409254,0.14007,0.834224,0.362219,0.181684,0.240351
std,0.119131,0.098557,0.042661,0.491449,0.223,0.109317,0.139567
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015
25%,0.45,0.35,0.115,0.444,0.187,0.0935,0.13
50%,0.545,0.425,0.145,0.8045,0.3395,0.17175,0.235
75%,0.615,0.48,0.165,1.15675,0.5065,0.255,0.33
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005
