In [24]:
import pandas as pd 
from sklearn.datasets import load_iris 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector
from numpy import dtype
from sklearn.compose import make_column_transformer

In [25]:
filename = '/content/abalone.data'
header=None
df = pd.read_csv(filename, header = None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [26]:
df = df.rename(columns={0: 'Sex',
                        1: 'Length', 
                        2: 'Diameter',
                        3: 'Height',
                        4: 'Whole Weight',
                        5: 'Shgucked Weight', 
                        6: 'Viscera Weight',
                        7: 'Shell Weight', 
                        8: 'Rings'})

In [27]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole Weight,Shgucked Weight,Viscera Weight,Shell Weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [28]:
#08/02 checking for missing values/duplicates
df.isna().sum()

Sex                0
Length             0
Diameter           0
Height             0
Whole Weight       0
Shgucked Weight    0
Viscera Weight     0
Shell Weight       0
Rings              0
dtype: int64

In [29]:
df.duplicated().sum()

0

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Sex              4177 non-null   object 
 1   Length           4177 non-null   float64
 2   Diameter         4177 non-null   float64
 3   Height           4177 non-null   float64
 4   Whole Weight     4177 non-null   float64
 5   Shgucked Weight  4177 non-null   float64
 6   Viscera Weight   4177 non-null   float64
 7   Shell Weight     4177 non-null   float64
 8   Rings            4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [31]:
#separating into X & y 
target = ['Rings']
X = df.drop(columns = target)
y = df[target]


In [32]:
# Train test split
X_train, X_test, Y_train, y_test = train_test_split(X,y,random_state=42)
X_train.shape

(3132, 8)

In [33]:
display(X_train.info())
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3132 entries, 3823 to 860
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Sex              3132 non-null   object 
 1   Length           3132 non-null   float64
 2   Diameter         3132 non-null   float64
 3   Height           3132 non-null   float64
 4   Whole Weight     3132 non-null   float64
 5   Shgucked Weight  3132 non-null   float64
 6   Viscera Weight   3132 non-null   float64
 7   Shell Weight     3132 non-null   float64
dtypes: float64(7), object(1)
memory usage: 220.2+ KB


None

Unnamed: 0,Sex,Length,Diameter,Height,Whole Weight,Shgucked Weight,Viscera Weight,Shell Weight
3823,F,0.615,0.455,0.135,1.059,0.4735,0.263,0.274
3956,F,0.515,0.395,0.14,0.686,0.281,0.1255,0.22
3623,M,0.66,0.53,0.175,1.583,0.7395,0.3505,0.405
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
2183,M,0.495,0.4,0.155,0.8085,0.2345,0.1155,0.35


In [34]:

cat_selector = make_column_selector(dtype_include = 'object')
num_selector = make_column_selector(dtype_include= 'number')
cat_selector(X_train)

['Sex']

In [35]:
cat_selector(X_train)
train_cat_data = X_train[cat_selector(X_train)]

In [36]:
scaler = StandardScaler()
# scaler means scaling the data

In [37]:
ohe = OneHotEncoder(sparse = False, handle_unknown='ignore')

In [38]:
num_tuple = (scaler, num_selector)
cat_tuple = (ohe, cat_selector)
# column transformers work based on tuples

In [39]:
col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder = 'passthrough')
# remainder : 

In [40]:
col_transformer.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('standardscaler', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7ffa318ca8d0>),
                                ('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7ffa318ca6d0>)])

col_transformer.fit(X_train)

In [41]:
X_train_processed = col_transformer.transform(X_train)
X_test_processed = col_transformer.transform(X_test)

In [42]:
X_train_df = pd.DataFrame(X_train_processed)
X_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.749291,0.464226,-0.118869,0.457447,0.499098,0.743973,0.241135,1.0,0.0,0.0
1,-0.090254,-0.144654,-0.001647,-0.301655,-0.364269,-0.51404,-0.145838,1.0,0.0,0.0
2,1.127086,1.225326,0.81891,1.523852,1.692114,1.544526,1.179902,0.0,0.0,1.0
3,-0.59398,-0.449095,-1.056649,-0.651696,-0.617673,-0.738195,-0.647469,0.0,0.0,1.0
4,-0.258163,-0.093914,0.35002,-0.052352,-0.572823,-0.605532,0.785763,0.0,0.0,1.0


In [43]:
final_features = num_selector(X_train)
final_features

['Length',
 'Diameter',
 'Height',
 'Whole Weight',
 'Shgucked Weight',
 'Viscera Weight',
 'Shell Weight']

In [44]:
#get features name from onehotencoder using name_transformers/ get_features_names

cat_features = list(col_transformer.named_transformers_['onehotencoder'].get_feature_names(cat_selector(X_train)))
cat_features



['Sex_F', 'Sex_I', 'Sex_M']

In [45]:
final_features.extend(cat_features)
final_features

['Length',
 'Diameter',
 'Height',
 'Whole Weight',
 'Shgucked Weight',
 'Viscera Weight',
 'Shell Weight',
 'Sex_F',
 'Sex_I',
 'Sex_M']

In [46]:
X_train_df = pd.DataFrame(X_train_processed, columns=final_features)
X_train_df.head()

Unnamed: 0,Length,Diameter,Height,Whole Weight,Shgucked Weight,Viscera Weight,Shell Weight,Sex_F,Sex_I,Sex_M
0,0.749291,0.464226,-0.118869,0.457447,0.499098,0.743973,0.241135,1.0,0.0,0.0
1,-0.090254,-0.144654,-0.001647,-0.301655,-0.364269,-0.51404,-0.145838,1.0,0.0,0.0
2,1.127086,1.225326,0.81891,1.523852,1.692114,1.544526,1.179902,0.0,0.0,1.0
3,-0.59398,-0.449095,-1.056649,-0.651696,-0.617673,-0.738195,-0.647469,0.0,0.0,1.0
4,-0.258163,-0.093914,0.35002,-0.052352,-0.572823,-0.605532,0.785763,0.0,0.0,1.0
