# Exploratory Data Analysis of Abalone

### EPIC 1 STORY 1.1

In [None]:
from pathlib import Path

import pandas as pd

In [None]:
# Task 1.1.1: Load the raw abalone CSV into a DataFrame.
abalone_df = pd.read_csv('../data/raw/unclean_abalone.csv')

# Task 1.2.1: Check that all expected columns exist in the CSV.
# Only the last expression of a cell is rendered automatically, so mid-cell
# previews go through display() (injected by the Jupyter/IPython kernel).
display(abalone_df.head())

# Task 1.2.2: Verify data types for each column - numeric, categorical.
abalone_df.info()

# Task 1.2.3: Check for missing or null values.
display(abalone_df.isnull().sum())

# Follow-up to Task 1.2.2: convert the object-typed columns to pandas' string dtype.
# NOTE(review): the info() output lists several column names with a leading
# space (e.g. ' Length'); consider stripping them before the data is saved.
abalone_df[["Class","Sex"]] = abalone_df[["Class","Sex"]].astype("string")
abalone_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4174 entries, 0 to 4173
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Sex              4174 non-null   object 
 1    Length          4174 non-null   float64
 2    Diameter        4174 non-null   float64
 3    Height          4174 non-null   float64
 4    Whole_weight    4174 non-null   float64
 5    Shucked_weight  4174 non-null   float64
 6    Viscera_weight  4174 non-null   float64
 7    Shell_weight    4174 non-null   float64
 8   Class            4174 non-null   object 
dtypes: float64(7), object(2)
memory usage: 293.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4174 entries, 0 to 4173
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Sex              4174 non-null   string 
 1    Length          4174 non-null   float64
 2    Diameter        4174 non-null   float6

In [6]:
# A bare expression that is not the cell's last line is silently discarded,
# so print the overall flag explicitly before showing the per-column counts.
print(f"Any missing values: {abalone_df.isnull().values.any()}")
abalone_df.isnull().sum()



Sex                0
 Length            0
 Diameter          0
 Height            0
 Whole_weight      0
 Shucked_weight    0
 Viscera_weight    0
 Shell_weight      0
Class              0
dtype: int64

#### `Class` Column

Let's take a look at the `Class` column's values. We'll get the first 20 unique values to see if there are any inconsistencies or unexpected values.

In [None]:
# Preview up to 20 distinct labels found in the Class column
class_values = abalone_df["Class"].unique()
class_values[:20]

<StringArray>
['negative', 'positive']
Length: 2, dtype: string

In [10]:
# Print each distinct Class value (first 20) with its Python type to confirm
# that every label is a plain string rather than NaN

distinct_classes = abalone_df["Class"].unique()
for value in distinct_classes[:20]:
    print(f"Value: {repr(value)}, Type: {type(value)}")

Value: 'negative', Type: <class 'str'>
Value: 'positive', Type: <class 'str'>


#### Duplicates

How many duplicates are there in the data?  Let's get a ball-park figure.

In [11]:
# Count rows that exactly repeat an earlier row in the current data set
duplicate_mask = abalone_df.duplicated()
duplicate_mask.sum()

np.int64(0)

In [None]:
# Check the number of rows in the dataset
print(len(abalone_df))

9


### Encode the Categorical Variables

In [24]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Split the abalone frame into the target label (y) and the predictors (X)
y = abalone_df["Class"]
X = abalone_df.drop(columns=["Class"])

# One-hot encode the 'Sex' column only, keeping every category as a dense array
encoder = OneHotEncoder(sparse_output=False, drop=None)
sex_encoded = encoder.fit_transform(X[["Sex"]])

# Shorten the target labels: negative -> N, positive -> P
class_codes = {'negative': 'N', 'positive': 'P'}
y_encoded = y.map(class_codes)

# Wrap the encoded array in a DataFrame named after the encoder's output features
sex_feature_names = encoder.get_feature_names_out(["Sex"])
sex_encoded_df = pd.DataFrame(sex_encoded, columns=sex_feature_names)

# Replace the original 'Sex' column with its encoded columns
predictors_without_sex = X.drop(columns=["Sex"]).reset_index(drop=True)
X_encoded = pd.concat(
    [predictors_without_sex, sex_encoded_df.reset_index(drop=True)],
    axis=1,
)

# Re-attach the encoded target as the final column
abalone_df_encoded = pd.concat(
    [X_encoded, y_encoded.reset_index(drop=True)],
    axis=1,
)



In [26]:
# Preview the first ten rows of the encoded frame
abalone_df_encoded.iloc[:10]

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex_F,Sex_I,Sex_M,Class
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0.0,0.0,1.0,N
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0.0,0.0,1.0,N
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,1.0,0.0,0.0,N
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0.0,0.0,1.0,N
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0.0,1.0,0.0,N
5,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,0.0,1.0,0.0,N
6,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,1.0,0.0,0.0,N
7,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,1.0,0.0,0.0,N
8,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,0.0,0.0,1.0,N
9,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,1.0,0.0,0.0,P


In [36]:
from sklearn.preprocessing import StandardScaler

# Standardise only the continuous measurements. Select them by name rather
# than by position ([0:7]) so a change in column order can never cause the
# one-hot Sex_* indicators or the Class target to be scaled by mistake.
numeric_cols = [
    col for col in abalone_df_encoded.columns
    if col != "Class" and not col.startswith("Sex_")
]
scaler = StandardScaler()

# Fit and transform the numeric columns
# NOTE(review): the scaler is fitted on the full data set; if a train/test
# split follows, confirm that fitting before the split is intended.
abalone_df_encoded[numeric_cols] = scaler.fit_transform(abalone_df_encoded[numeric_cols])

# Check result (bare last expression gives the rich table rendering)
abalone_df_encoded.head()


     Length   Diameter    Height   Whole_weight   Shucked_weight  \
0 -0.575179  -0.432763 -1.064838      -0.642383        -0.608183   
1 -1.450045  -1.441081 -1.184418      -1.231151        -1.171896   
2  0.049725   0.121812 -0.108206      -0.309734        -0.463873   
3 -0.700160  -0.432763 -0.347364      -0.638301        -0.648770   
4 -1.616686  -1.541913 -1.423576      -1.272987        -1.216993   

    Viscera_weight   Shell_weight  Sex_F  Sex_I  Sex_M Class  
0        -0.726599      -0.638650    0.0    0.0    1.0     N  
1        -1.205783      -1.213668    0.0    0.0    1.0     N  
2        -0.356942      -0.207386    1.0    0.0    0.0     N  
3        -0.607943      -0.602711    0.0    0.0    1.0     N  
4        -1.287929      -1.321484    0.0    1.0    0.0     N  


In [38]:
# Persist the encoded, standardised frame for later steps.
# Create the target directory first so a fresh checkout, where
# ../data/processed may not exist yet, does not fail with an OSError.
cleaned_csv_path = Path('../data/processed/cleaned_abalone.csv')
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
abalone_df_encoded.to_csv(cleaned_csv_path, index=False)

In [39]:
# Read the saved file back in to verify that it round-trips
clean_abalone_df = pd.read_csv(
    filepath_or_buffer='../data/processed/cleaned_abalone.csv',
)

In [40]:
# Preview the first five rows of the reloaded frame
clean_abalone_df.iloc[:5]

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex_F,Sex_I,Sex_M,Class
0,-0.575179,-0.432763,-1.064838,-0.642383,-0.608183,-0.726599,-0.63865,0.0,0.0,1.0,N
1,-1.450045,-1.441081,-1.184418,-1.231151,-1.171896,-1.205783,-1.213668,0.0,0.0,1.0,N
2,0.049725,0.121812,-0.108206,-0.309734,-0.463873,-0.356942,-0.207386,1.0,0.0,0.0,N
3,-0.70016,-0.432763,-0.347364,-0.638301,-0.64877,-0.607943,-0.602711,0.0,0.0,1.0,N
4,-1.616686,-1.541913,-1.423576,-1.272987,-1.216993,-1.287929,-1.321484,0.0,1.0,0.0,N


In [41]:
# Print the column names, non-null counts and dtypes of the reloaded frame
clean_abalone_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4174 entries, 0 to 4173
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0    Length          4174 non-null   float64
 1    Diameter        4174 non-null   float64
 2    Height          4174 non-null   float64
 3    Whole_weight    4174 non-null   float64
 4    Shucked_weight  4174 non-null   float64
 5    Viscera_weight  4174 non-null   float64
 6    Shell_weight    4174 non-null   float64
 7   Sex_F            4174 non-null   float64
 8   Sex_I            4174 non-null   float64
 9   Sex_M            4174 non-null   float64
 10  Class            4174 non-null   object 
dtypes: float64(10), object(1)
memory usage: 358.8+ KB


### Reset the indexes

> This was added after the previous task as COMPONENT tests failed due to index conflicts - the same will be added for the transaction data set.

We can see that the indexes are now out of order, so we will reset them to be sequential again.

In [None]:
# Make the row index sequential again on the cleaned abalone frame.
# The original cell referenced `customers`, which is not defined anywhere in
# the visible notebook and would raise NameError on a fresh Restart & Run All.
abalone_df_encoded = abalone_df_encoded.reset_index(drop=True)

---
---

### Epic 2 - Story 4 - Task 6 - Save the Cleaned Data

For testing purposes in the pipeline, it makes sense for us to export the cleaned DataFrame to a CSV file.  This will allow us to use the cleaned data in the pipeline without having to run the cleaning steps again.

In [None]:
# Export the cleaned abalone frame as the expected result for pipeline tests.
# NOTE(review): the original cell wrote an undefined `customers` frame to
# expected_customers_clean_results.csv, which would also clobber the customers
# fixture; the abalone file name below is an assumption - confirm it against
# the test suite.
abalone_df_encoded.to_csv('../tests/test_data/expected_abalone_clean_results.csv', index=False)

---

### Epic 2 - Story 4 - Task 7 - Transfer the code from the Jupyter Notebook to a Python script, creating separate functions for each cleaning step

### Epic 2 - Story 4 - Task 8 - Write tests for each cleaning function to ensure they work correctly

### Epic 2 - Story 4 - Task 9 - Create a script to run the cleaning functions in sequence and log the process

### Epic 2 - Story 4 - Task 10 - Add the customer cleaning script to scripts/run and update any tests accordingly

Jupyter Notebooks do not play nicely with CI/CD pipelines, so we will need to transfer the code from the Jupyter Notebook to a Python script.  We will create separate functions for each cleaning step and then write tests for each function to ensure they work correctly.