# Generate Dataset Versions with Different Imbalance Ratios (IR)

In [None]:
import pandas as pd

A file has been added to `data/raw` with the name `unclean_transactions.csv`.  This file contains an export of the transactions table from the database.  We'll use this for our exploratory data analysis and transformation steps.

In [None]:
def extract_cleaned_imb_dataset(path) -> pd.DataFrame:
    """Load a cleaned, imbalanced dataset from CSV.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or buffer).

    Returns:
        The CSV contents with the ``Class`` column cast to pandas'
        ``string`` dtype; every other column keeps the dtype inferred by
        ``read_csv``.
    """
    frame = pd.read_csv(path)
    # assign() replaces the existing column in place, so column order is kept
    return frame.assign(Class=frame["Class"].astype("string"))



# Load the cleaned abalone dataset and inspect its schema and first rows.
clean_data_path = '../data/processed/cleaned_abalone.csv'
cleaned_abalone_df = extract_cleaned_imb_dataset(clean_data_path)
cleaned_abalone_df.info()  # column dtypes and non-null counts
cleaned_abalone_df.head(10)  # last expression in the cell, so the notebook renders it


In [None]:

# Measure the current imbalance ratio: rows labelled 'P' (counted as the minority
# class) per row labelled 'N' (counted as the majority class).
count_minority = (cleaned_abalone_df['Class'] == 'P').sum()
count_majority = (cleaned_abalone_df['Class'] == 'N').sum()
ratio = count_minority / count_majority
print("Ratio of minority/majority:", ratio)


In [None]:
from imblearn.over_sampling import BorderlineSMOTE
#import pandas as pd

def resample_with_borderline_smote(X, y, target_minority_ratio, kind='borderline-1', random_state=42):
    """Oversample with BorderlineSMOTE and return features and target as one frame.

    Args:
        X: Feature matrix. When it is a DataFrame its column names are
            reused for the result; otherwise pandas' default column labels
            are used.
        y: Target labels. A named Series keeps its name as the target
            column; unnamed or non-Series input (list, ndarray) is stored
            under the column name ``"target"``.
        target_minority_ratio: Forwarded to BorderlineSMOTE as
            ``sampling_strategy``.
        kind: BorderlineSMOTE variant, forwarded unchanged.
        random_state: Seed forwarded to BorderlineSMOTE.

    Returns:
        DataFrame with the resampled features plus the resampled target as
        an additional column.
    """
    # Look the name up defensively: a list or ndarray has no ``name``
    # attribute, and reading ``y.name`` directly would raise before the
    # conversion to a Series could happen.
    target_name = getattr(y, "name", None) or "target"
    y = pd.Series(y, name=target_name)

    # Apply BorderlineSMOTE
    smote = BorderlineSMOTE(kind=kind, sampling_strategy=target_minority_ratio, random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Merge into a single DataFrame (columns=None falls back to default labels)
    df_resampled = pd.DataFrame(X_resampled, columns=getattr(X, "columns", None))
    df_resampled[target_name] = y_resampled
    return df_resampled



# generate imbalance dataset versions

def generate_imb_data_version(features, target, ratio_values: list[float], *, resampler=None) -> dict[str, pd.DataFrame]:
    """Build one resampled dataset per requested ratio.

    Args:
        features: Predictor columns passed to the resampler.
        target: Class labels passed to the resampler.
        ratio_values: Ratios to generate, e.g. ``[0.05, 0.10, ...]``.
        resampler: Optional callable ``(features, target, ratio) -> DataFrame``.
            Defaults to ``resample_with_borderline_smote``.

    Returns:
        Dict keyed ``"abalone_df_<percent>"`` (ratio expressed as a whole
        percentage) mapping to the resampled DataFrame. Two ratios that
        round to the same percentage share a key; the later one wins.
    """
    if resampler is None:
        resampler = resample_with_borderline_smote

    imb_datasets_dic = {}
    for imb_ratio in ratio_values:
        # round(), not int(): 0.29 * 100 == 28.999999999999996, which int()
        # would truncate to 28 and store under the wrong key.
        percent = round(imb_ratio * 100)
        dataset_version = resampler(features, target, imb_ratio)
        imb_datasets_dic[f"abalone_df_{percent}"] = dataset_version.copy()
    return imb_datasets_dic




In [None]:
def write_data_versions_to_csv(data_dic: dict[str, pd.DataFrame], output_dir="../data/data_versions"):
    """Write each DataFrame to ``<output_dir>/<key>.csv`` without the index.

    Args:
        data_dic: Mapping of file stem to the DataFrame to export.
        output_dir: Destination directory. Created (with parents) when it
            does not exist yet, so a fresh checkout does not fail on the
            first write.
    """
    from pathlib import Path

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    for name, frame in data_dic.items():
        frame.to_csv(target_dir / f"{name}.csv", index=False)
        


In [None]:
# Ratios to generate: 0.05 up to 1.0 in steps of 0.05 (20 dataset versions).
target_min_ratio = [0.05,0.10,0.15,0.20,0.25,0.30,0.35,0.40,0.45,
                    0.50,0.55,0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95,1.0]
# Reload the cleaned data so this cell can run without the inspection cell above.
clean_data_path = '../data/processed/cleaned_abalone.csv'
cleaned_abalone_df  = extract_cleaned_imb_dataset(clean_data_path)
X = cleaned_abalone_df.drop(columns=["Class"])  # predictors
y = cleaned_abalone_df["Class"]  # target labels

# One resampled DataFrame per ratio, keyed "abalone_df_<percent>".
abalone_versions_dic = generate_imb_data_version(X, y, target_min_ratio)

# Spot-check one generated version.
abalone_versions_dic["abalone_df_10"].info()


In [None]:
# Persist every generated version as its own CSV file.
write_data_versions_to_csv(abalone_versions_dic)

In [120]:
def read_abalone_csv(file_path, *, percents=range(5, 101, 5)) -> dict[str, pd.DataFrame]:
    """Load the saved dataset versions from a directory.

    Args:
        file_path: Directory holding the ``abalone_df_<percent>.csv`` files,
            with or without a trailing path separator.
        percents: Percent suffixes to load. Defaults to 5, 10, ..., 100.

    Returns:
        Dict keyed ``"abalone_df_<percent>"`` mapping to the loaded
        DataFrame, with its ``Class`` column cast to pandas' ``string`` dtype.
    """
    import os

    abalone_df_dic = {}
    for percent in percents:
        name = f"abalone_df_{percent}"
        # os.path.join adds the separator only when it is missing; plain
        # concatenation produced ".../data_versionsabalone_df_5.csv" for a
        # directory given without a trailing slash.
        full_path = os.path.join(file_path, f"{name}.csv")
        print(full_path)
        abalone_df = pd.read_csv(full_path)
        abalone_df["Class"] = abalone_df["Class"].astype("string")
        abalone_df_dic[name] = abalone_df

    return abalone_df_dic
        
        
        
        

In [121]:
# Read the saved versions back from disk; each file path is printed as it is loaded.
my_path = "../data/data_versions/"
my_abalone_dic = read_abalone_csv(my_path)

../data/data_versions/abalone_df_5.csv
../data/data_versions/abalone_df_10.csv
../data/data_versions/abalone_df_15.csv
../data/data_versions/abalone_df_20.csv
../data/data_versions/abalone_df_25.csv
../data/data_versions/abalone_df_30.csv
../data/data_versions/abalone_df_35.csv
../data/data_versions/abalone_df_40.csv
../data/data_versions/abalone_df_45.csv
../data/data_versions/abalone_df_50.csv
../data/data_versions/abalone_df_55.csv
../data/data_versions/abalone_df_60.csv
../data/data_versions/abalone_df_65.csv
../data/data_versions/abalone_df_70.csv
../data/data_versions/abalone_df_75.csv
../data/data_versions/abalone_df_80.csv
../data/data_versions/abalone_df_85.csv
../data/data_versions/abalone_df_90.csv
../data/data_versions/abalone_df_95.csv
../data/data_versions/abalone_df_100.csv


In [122]:
# Inspect one reloaded version to confirm it survived the CSV round trip.
myab = my_abalone_dic["abalone_df_10"]
myab.info()
myab.head(10)  # last expression in the cell, so the notebook renders it



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4556 entries, 0 to 4555
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0    Length          4556 non-null   float64
 1    Diameter        4556 non-null   float64
 2    Height          4556 non-null   float64
 3    Whole_weight    4556 non-null   float64
 4    Shucked_weight  4556 non-null   float64
 5    Viscera_weight  4556 non-null   float64
 6    Shell_weight    4556 non-null   float64
 7   Sex_F            4556 non-null   float64
 8   Sex_I            4556 non-null   float64
 9   Sex_M            4556 non-null   float64
 10  Class            4556 non-null   string 
dtypes: float64(10), string(1)
memory usage: 391.7 KB


Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex_F,Sex_I,Sex_M,Class
0,-0.575179,-0.432763,-1.064838,-0.642383,-0.608183,-0.726599,-0.63865,0.0,0.0,1.0,N
1,-1.450045,-1.441081,-1.184418,-1.231151,-1.171896,-1.205783,-1.213668,0.0,0.0,1.0,N
2,0.049725,0.121812,-0.108206,-0.309734,-0.463873,-0.356942,-0.207386,1.0,0.0,0.0,N
3,-0.70016,-0.432763,-0.347364,-0.638301,-0.64877,-0.607943,-0.602711,0.0,0.0,1.0,N
4,-1.616686,-1.541913,-1.423576,-1.272987,-1.216993,-1.287929,-1.321484,0.0,1.0,0.0,N
5,-0.825141,-1.08817,-1.064838,-0.974011,-0.984743,-0.941091,-0.854282,0.0,1.0,0.0,N
6,0.049725,0.071396,0.250532,-0.104634,-0.551812,-0.356942,0.655142,1.0,0.0,0.0,N
7,0.174706,0.172228,-0.347364,-0.124022,-0.294759,-0.283923,0.152001,1.0,0.0,0.0,N
8,-0.408538,-0.382347,-0.347364,-0.651566,-0.644261,-0.621634,-0.530834,0.0,0.0,1.0,N
9,0.216366,0.323476,0.250532,0.134139,-0.20231,-0.270232,0.583264,1.0,0.0,0.0,P


In [None]:
# Stack every dataset version into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating each version's own row labels.
# NOTE(review): no column records which version a row came from — confirm that is intended.
joined_df = pd.concat(abalone_versions_dic.values(), axis=0, ignore_index=True)



In [None]:
# Summarise the stacked frame, then report its row count.
joined_df.info()
rows = len(joined_df)
print(f"Total Rows: {rows}")

In [None]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from dotenv import load_dotenv
import os

# Load environment variables from .env
load_dotenv(dotenv_path="../.env.dev")
# Read DB credentials from .env
username = os.getenv("SOURCE_DB_USER")
password = os.getenv("SOURCE_DB_PASSWORD")
host = os.getenv("SOURCE_DB_HOST")
port = os.getenv("SOURCE_DB_PORT")
database = os.getenv("SOURCE_DB_NAME")

# Fail early and name the missing settings; an unset variable would otherwise
# be formatted into the connection URL as the literal text "None".
missing_settings = [
    env_name
    for env_name, value in (
        ("SOURCE_DB_USER", username),
        ("SOURCE_DB_HOST", host),
        ("SOURCE_DB_PORT", port),
        ("SOURCE_DB_NAME", database),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        f"Missing database settings in ../.env.dev: {', '.join(missing_settings)}"
    )

print(f"My DB Port:{port} DB Name: {database}")

# Create SQLAlchemy engine. URL.create escapes each component, so a password
# containing characters such as '@', ':' or '/' no longer corrupts the URL the
# way plain string interpolation does.
db_url = URL.create(
    "postgresql+psycopg2",
    username=username,
    password=password,
    host=host,
    port=int(port),
    database=database,
)
db_engine = create_engine(db_url)

# Write to PostgreSQL. if_exists="replace" drops and recreates the table.
joined_df.to_sql("abaloneTable", db_engine, if_exists="replace", index=False)

print("✅ Data written to PostgreSQL successfully!")


In [None]:
# NOTE(review): `abalone_df_10` is not assigned in any visible cell; presumably it is
# one of the generated versions (e.g. abalone_versions_dic["abalone_df_10"]) — confirm.
abalone_df_10.info()
abalone_df_10.head()

In [None]:
# NOTE(review): `all_df` is not assigned in any visible cell; presumably it is the
# dict of dataset versions (abalone_versions_dic or my_abalone_dic) — confirm.
name = "abalone_df_50"
all_df[name].shape[0]  # row count of the selected version

In [None]:
#import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, 
    roc_auc_score, cohen_kappa_score, f1_score, precision_score, recall_score
)
# Train SVM, random forest and KNN on one dataset version and compare their metrics.
# NOTE(review): `all_df` is not assigned in any visible cell; presumably it is the
# dict of dataset versions (abalone_versions_dic or my_abalone_dic) — confirm.
abalone_df = all_df["abalone_df_50"]
X = abalone_df.drop(columns=["Class"])
y = abalone_df["Class"]

# Stratified 80/20 split so both classes keep their proportions in each part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


def _classification_scores(y_true, y_pred, y_prob):
    """Return (accuracy, weighted F1, weighted precision, weighted recall, Cohen's kappa, ROC AUC)."""
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average="weighted"),
        precision_score(y_true, y_pred, average="weighted"),
        recall_score(y_true, y_pred, average="weighted"),
        cohen_kappa_score(y_true, y_pred),
        roc_auc_score(y_true, y_prob),
    )


# The AUC below needs one score per row, so every model takes column 1 of
# predict_proba. The previous `... else None` fallback on RF/KNN only deferred
# the failure to roc_auc_score for a non-binary target.
# NOTE(review): column 1 follows the model's classes_ order — presumably 'P'
# given the 'N'/'P' labels; confirm via svm_model.classes_.

# SVM model
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

svm_y_pred = svm_model.predict(X_test)
svm_y_prob = svm_model.predict_proba(X_test)[:, 1]

# Random forest
rf_model = RandomForestClassifier(
    n_estimators=200,     # number of trees
    max_depth=None,       # can tune if overfitting
    random_state=42,
    class_weight="balanced"   # helps with imbalanced data
)
rf_model.fit(X_train, y_train)

rf_y_pred = rf_model.predict(X_test)
rf_y_prob = rf_model.predict_proba(X_test)[:, 1]

# KNN
knn_model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)  # Euclidean distance
knn_model.fit(X_train, y_train)

knn_y_pred = knn_model.predict(X_test)
knn_y_prob = knn_model.predict_proba(X_test)[:, 1]

# Same six metrics for every model; the per-model names are kept so other
# cells can still read them.
(svm_accuracy, svm_f1, svm_precision,
 svm_recall, svm_kappa, svm_auc) = _classification_scores(y_test, svm_y_pred, svm_y_prob)
(rf_accuracy, rf_f1, rf_precision,
 rf_recall, rf_kappa, rf_auc) = _classification_scores(y_test, rf_y_pred, rf_y_prob)
(knn_accuracy, knn_f1, knn_precision,
 knn_recall, knn_kappa, knn_auc) = _classification_scores(y_test, knn_y_pred, knn_y_prob)


# The stray space flag in the RF/KNN accuracy format specs is removed so all
# values are formatted alike.
print(f"SVM Accuracy: {svm_accuracy:.4f} RF Accuracy: {rf_accuracy:.4f} KNN Accuracy: {knn_accuracy:.4f}")
print(f"SVM F1-score: {svm_f1:.4f} RF F1-score: {rf_f1:.4f} KNN F1-score: {knn_f1:.4f}")
print(f"SVM Precision: {svm_precision:.4f} RF Precision: {rf_precision:.4f} KNN Precision: {knn_precision:.4f}")
print(f"SVM Recall: {svm_recall:.4f} RF Recall: {rf_recall:.4f} KNN Recall: {knn_recall:.4f}")
print(f"SVM Cohen's Kappa: {svm_kappa:.4f} RF Cohen's Kappa: {rf_kappa:.4f} KNN Cohen's Kappa: {knn_kappa:.4f}")
print(f"SVM AUC: {svm_auc:.4f} RF AUC: {rf_auc:.4f} KNN AUC: {knn_auc:.4f}")


In [None]:
# Train one SVM per dataset version and collect accuracy and AUC for each.
# NOTE(review): `all_df` is not assigned in any visible cell — confirm where it comes from.
datasets = []
svm_accur = []
# NOTE(review): this rebinds `svm_auc`, which the model-comparison cell uses as a single
# float; re-running that cell's print statements after this one would fail.
svm_auc = []
for key, value in all_df.items():
    X = value.drop(columns=["Class"])
    y = value["Class"]
    # Same stratified 80/20 split and seed for every version.
    X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
    svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
    svm_model.fit(X_train, y_train)

    svm_y_pred = svm_model.predict(X_test)
    svm_y_prob = svm_model.predict_proba(X_test)[:, 1]  # probability for positive class
    datasets.append(key)
    svm_accur.append(accuracy_score(y_test, svm_y_pred))
    svm_auc.append(roc_auc_score(y_test, svm_y_prob))


# One row per dataset version; the plotting cell reads this frame.
svm_metrics_df = pd.DataFrame({"name":datasets, "Accuracy": svm_accur, "AUC":svm_auc})
print(svm_metrics_df)

In [None]:
#import pandas as pd
import matplotlib.pyplot as plt



# Draw SVM accuracy per dataset version using the object-oriented Matplotlib API.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(
    svm_metrics_df["name"],
    svm_metrics_df["Accuracy"],
    marker="o",
    linestyle="--",
    linewidth=1,
)

# Axis labels and title; tick labels are turned vertical so the dataset names fit.
ax.set(xlabel="name", ylabel="Accuracy", title="Line Chart Example")
ax.tick_params(axis="x", labelrotation=90)

# Dashed, semi-transparent grid, then render.
ax.grid(True, linestyle="--", alpha=0.6)
plt.show()


In [None]:
#import pandas as pd
#import matplotlib.pyplot as plt

# Toy data: one x-axis column followed by three series to draw.
df = pd.DataFrame({
    "Category": ["A", "B", "C", "D", "E"],
    "Line1": [2.3, 3.8, 1.5, 4.2, 3.0],
    "Line2": [1.5, 2.5, 3.2, 2.8, 4.0],
    "Line3": [3.0, 2.0, 4.5, 3.5, 2.2]
})

# One line per series on a shared set of axes.
fig, ax = plt.subplots(figsize=(8, 5))
for series_name in df.columns.drop("Category"):  # everything except the x-axis column
    ax.plot(df["Category"], df[series_name], marker="o", label=series_name)

# Labels, title, legend and grid.
ax.set(xlabel="Category", ylabel="Value", title="Multiple Line Charts on Same Plot")
ax.legend()
ax.grid(True, linestyle="--", alpha=0.6)

plt.show()


In [None]:
# Count fully duplicated rows (every column equal to an earlier row).
# NOTE(review): `transactions` is not loaded in any visible cell; presumably it is read
# from data/raw/unclean_transactions.csv — confirm.
duplicates = transactions.duplicated().sum()
print(f'There are {duplicates} duplicate rows in the transactions DataFrame.')

We now know that there are some duplicate rows in the transactions DataFrame.  We will need to remove these duplicates before we can use the data for analysis or further processing.

In [None]:
# List distinct raw values of transaction_date so the date formats in use can be read off.
unique_date_formats = transactions['transaction_date'].unique()
print(f'Different date formats in transaction_date: {unique_date_formats[:20]}')  # Show first 20 unique values

This shows that we will need to standardise the date format in the `transaction_date` column.

In [None]:
# Map every amount to its Python type and list the distinct types found.
amount_types = transactions['amount'].map(type).unique()
print(f'Different data types in amount column: {amount_types}')

Now we can see that the amount is either a string or a float.  We will need to convert the amount column to a numeric type before we can use it for analysis or further processing.

---
---

## Cleaning the Data

---

### Epic 2 - Story 3 - Task 2 - Handle Missing Values

We are going to remove any incomplete rows from the DataFrame.  This will remove any rows that have missing values in any of the columns.

In [None]:
# Drop every row that lacks a transaction date or an amount, in one pass over both columns.
transactions = transactions.dropna(subset=["transaction_date", "amount"])

# Summarise what is left.
transactions.info()

> We have dropped 254 rows from the DataFrame that had missing values in any of the columns.

In [None]:
# Run a check - this operation will be tested in the pipeline!
# Counts rows that still hold a null in ANY column, not only the two cleaned above.
transactions.isnull().any(axis=1).sum()

---

### Epic 2 - Story 3 - Task 3 - Standardise Date Format

We saw that the `transaction_date` column has a mix of date formats.  We will standardise this to a single format.  You need to examine the date column and then identify and list ALL of the date formats that are present in the column.  You can then use this information to standardise the date format.

In [None]:
# Parse the mixed date formats into Timestamps; the dd/mm/yyyy text is produced afterwards.
def standardise_date(date_str):
    """Parse one raw date value into a ``pandas.Timestamp``.

    Each known format is tried in order and the first that matches wins.
    ``%d/%m/%Y`` is tried before ``%m/%d/%Y``, so an ambiguous value such as
    ``"05/01/2023"`` is read day-first (5 January); month-first only applies
    when day-first is impossible (e.g. ``"12/25/2023"``).

    Args:
        date_str: Raw date value; may be missing or an empty string.

    Returns:
        The parsed ``Timestamp``, or ``pd.NaT`` when the value is missing,
        empty, or matches none of the known formats.
    """
    if pd.isna(date_str) or date_str == "":
        return pd.NaT

    # Order matters (see docstring). The list previously ended with a second
    # "%d/%m/%Y" entry that could never match and has been removed.
    formats = (
        "%Y/%m/%d",
        "%Y-%m-%d",
        "%d %b %Y",
        "%b %d, %Y",
        "%d %B %Y",
        "%d-%m-%Y",
        "%d/%m/%Y",
        "%m/%d/%Y",
    )
    for fmt in formats:
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            continue

    return pd.NaT


# Apply standardise_date to every value of the transaction_date column
transactions["transaction_date"] = transactions["transaction_date"].apply(
    standardise_date
)
# Render the parsed dates as dd/mm/yyyy text; values that failed to parse (NaT) stay missing
transactions["transaction_date"] = transactions[
    "transaction_date"
].dt.strftime("%d/%m/%Y")

# Remove the rows whose date could not be parsed
transactions = transactions.dropna(subset=["transaction_date"])

# Display the DataFrame info
transactions.info()

All dates in the `transaction_date` column are now in the standardised format of `%d/%m/%Y`.

---

### Epic 2 - Story 3 - Task 4 - Convert Amount to Numeric

Find out how many rows can be converted to numeric values in the `amount` column.  This will help us understand how many rows we can use for analysis or further processing.  We specifically want to convert the string to a float and understand how many rows will be converted and how many `NaN` values will be created as a result of this conversion.

In [None]:
# Coerce amounts to numbers (failures become NaN) and count how many survive.
numeric_amounts = pd.to_numeric(transactions["amount"], errors="coerce")
convertible = numeric_amounts.notna().sum()
print(f"Convertible: {convertible}, NaNs: {len(transactions) - convertible}")

The values that generate the `NaN` values are those that cannot be converted to a float.  This includes any non-numeric characters or strings that do not represent a valid number.  We can check what these are.

In [None]:
# Flag the rows whose amount fails numeric conversion, then list their distinct raw values.
not_numeric = pd.to_numeric(transactions["amount"], errors="coerce").isna()
unconvertible = transactions.loc[not_numeric, "amount"].unique()
print(unconvertible)

This confirms to us that a number of rows in the `amount` column cannot be converted to a float as they contain the string `INVALID`.  We can safely remove these rows from the DataFrame as they will not be useful for analysis or further processing.

In [None]:
# Convert amount to numeric (unparseable values become NaN), then drop those rows
transactions["amount"] = pd.to_numeric(transactions["amount"], errors="coerce")
transactions.dropna(subset=["amount"], inplace=True)

# Confirm the new dtype and the remaining row count
transactions.info()

We have cleaned the `amount` column by converting it to a numeric type and dropping any rows that contain `NaN` values in this column.  This will allow us to use the `amount` column for analysis or further processing.

This concludes the column-level cleaning of the `transactions` DataFrame.  We have removed any rows with missing values, standardised the date format, and converted the `amount` column to a numeric type; duplicate rows are removed in the next task, after which the DataFrame is ready for analysis or further processing.

---

### Epic 2 - Story 3 - Task 5 - Remove Duplicates

Check to see how many duplicates there are now once the data has been cleaned a little.

In [None]:
# Re-count fully duplicated rows now that dates and amounts are standardised
duplicates = transactions.duplicated().sum()
print(f"There are {duplicates} duplicate rows in the transactions DataFrame.")

In [None]:
# Drop the duplicates, keeping the first occurrence of each repeated row

transactions.drop_duplicates(inplace=True)

# Confirm the remaining row count
transactions.info()

This should have removed the 503 duplicated rows from the DataFrame.

### Reset the indexes

> This was added after the COMPONENT tests for customers data failed due to index conflicts - as we modified the DataFrame, the indexes were no longer sequential.

We can see that the indexes are now out of order, so we will reset them to be sequential again.

The COMPONENT tests for the transactions data set will also need to be updated/added.

In [None]:
# Renumber rows 0..n-1 and discard the old index, which has gaps after the drops above
transactions.reset_index(drop=True, inplace=True)

---
---

### Epic 2 - Story 3 - Task 6 - Save the Cleaned Data

For testing purposes in the pipeline, it makes sense for us to export the cleaned DataFrame to a CSV file.  This will allow us to use the cleaned data in the pipeline without having to run the cleaning steps again.

In [None]:
# Export the cleaned frame (without the index) as the expected result for the pipeline tests
transactions.to_csv(
    "../tests/test_data/expected_transactions_clean_results.csv", index=False
)

---

### Epic 2 - Story 3 - Task 7 - Transfer the code from the Jupyter Notebook to a Python script, creating separate functions for each cleaning step

### Epic 2 - Story 3 - Task 8 - Write tests for each cleaning function to ensure they work correctly

### Epic 2 - Story 3 - Task 9 - Create a script to run the cleaning functions in sequence and log the process

### Epic 2 - Story 3 - Task 10 - Add the transaction cleaning script to scripts/run and update any tests accordingly

Jupyter Notebooks do not play nicely with CI/CD pipelines, so we will need to transfer the code from the Jupyter Notebook to a Python script.  We will create separate functions for each cleaning step and then write tests for each function to ensure they work correctly.