<a href="https://colab.research.google.com/github/Mix1996/Prediction-of-Product-Sales/blob/main/Abalone_Pre_Preprocessing_(Core).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from sklearn.pipeline import make_pipeline

from sklearn import set_config
set_config(display='diagram')

In [7]:
# Set the default transformation output to pandas
set_config(display='pandas')

# Load data
df = pd.read_csv('https://docs.google.com/spreadsheets/d/1jfU2oFSfhX1ywUbqETExDJuztO95r3h6pbWAm7xpwNY/gviz/tq?tqx=out:csv&sheet=users')

# Display summary info
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [8]:
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [9]:
# Make a copy of original df to avoid any manipulations
abalone = df.copy()

Check for Duplicated, Missing, or Erroneous Data

In [10]:
# Check to see if there are any duplicate rows
abalone.duplicated().sum()

0

In [11]:
# Display the sum of missing values
abalone.isna().sum().sum()

0

In [12]:
# Display desriptive statitistics for all collumns
abalone.describe(include='number')

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [13]:
# Display desriptive statitistics for all collumns
abalone.describe(include='object')

Unnamed: 0,sex
count,4177
unique,3
top,M
freq,1528


In [16]:
# Inspect unique values in 'sex' column
print("Unique values in 'sex' column:", abalone['sex'].unique())

Unique values in 'sex' column: ['M' 'F' 'I']


In [17]:
# Column for duplicate rows and address them, if needed
duplicates = abalone[abalone.duplicated()]
print("Duplicate Rows:")
print(duplicates)

Duplicate Rows:
Empty DataFrame
Columns: [sex, length, diameter, height, whole_weight, shucked_weight, viscera_weight, shell_weight, rings]
Index: []


In [19]:
# Check for impossible numeric values and fix them if needed
# Suppose 'height' column should not have negative values
impossible_height = abalone[abalone['height'] < 0]
print("Impossible height values:")
print(impossible_height)

Impossible height values:
Empty DataFrame
Columns: [sex, length, diameter, height, whole_weight, shucked_weight, viscera_weight, shell_weight, rings]
Index: []


In [20]:
# Separate data into X (input features) and y (target variable)
X = abalone.drop(columns=['rings'])  # Exclude 'rings' column
y = abalone['rings']


In [21]:
# Train/test split with a fixed random seed (42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Lists of column names for numeric and categorical columns
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()


In [26]:
# Create transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')


In [27]:
# Create tuples for each transformer
numeric_tuple = ('numeric', numeric_transformer, numeric_cols)
categorical_tuple = ('categorical', categorical_transformer, categorical_cols)


In [28]:
# Create ColumnTransformer
preprocessor = ColumnTransformer(transformers=[numeric_tuple, categorical_tuple], verbose_feature_names_out=False)

# Fit and transform on training data
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Transform the test data
X_test_preprocessed = preprocessor.transform(X_test)

# Display basic statistics of the preprocessed DataFrames
print("X_train_preprocessed shape:", X_train_preprocessed.shape)
print("X_test_preprocessed shape:", X_test_preprocessed.shape)

X_train_preprocessed shape: (3341, 10)
X_test_preprocessed shape: (836, 10)


In [29]:
# Fit the ColumnTransformer on training data
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                 ['length', 'diameter', 'height',
                                  'whole_weight', 'shucked_weight',
                                  'viscera_weight', 'shell_weight']),
                                ('categorical',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse_output=False),
                                 ['sex'])],
                  verbose_feature_names_out=False)

In [30]:
# Transform the training and test data
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [36]:
# Check transformed training data
numeric_transformed_cols = preprocessor.transformers_[0][2]
categorical_transformed_cols = preprocessor.transformers_[1][1].get_feature_names_out(categorical_cols)
transformed_cols = numeric_transformed_cols + categorical_transformed_cols.tolist()

X_train_processed_df = pd.DataFrame(X_train_processed, columns=transformed_cols)



In [38]:
# Display data types and head of transformed training data
print("Data Types of Transformed Training Data:")
print(X_train_processed_df.dtypes)



Data Types of Transformed Training Data:
length            float64
diameter          float64
height            float64
whole_weight      float64
shucked_weight    float64
viscera_weight    float64
shell_weight      float64
sex_F             float64
sex_I             float64
sex_M             float64
dtype: object


In [39]:
# Display scaled numeric features of the transformed training data
print("Scaled Numeric Features of Transformed Training Data:")
print(X_train_processed_df[numeric_cols].describe())

Scaled Numeric Features of Transformed Training Data:
             length      diameter        height  whole_weight  shucked_weight  \
count  3.341000e+03  3.341000e+03  3.341000e+03  3.341000e+03    3.341000e+03   
mean   5.635852e-16  2.041667e-16 -3.105036e-16 -1.063368e-17    2.530817e-16   
std    1.000150e+00  1.000150e+00  1.000150e+00  1.000150e+00    1.000150e+00   
min   -3.777412e+00 -3.582758e+00 -3.288610e+00 -1.686311e+00   -1.612494e+00   
25%   -6.293175e-01 -5.930591e-01 -5.832029e-01 -7.890755e-01   -7.805961e-01   
50%    1.681999e-01  1.670339e-01  4.928971e-03 -5.867303e-02   -1.056604e-01   
75%    7.558443e-01  7.244354e-01  5.930609e-01  6.513840e-01    6.410292e-01   
max    2.434828e+00  2.447313e+00  2.329495e+01  4.058233e+00    5.056140e+00   

       viscera_weight  shell_weight  
count    3.341000e+03  3.341000e+03  
mean     1.680122e-16  1.828994e-16  
std      1.000150e+00  1.000150e+00  
min     -1.647965e+00 -1.702271e+00  
25%     -8.026546e-01 -7.8