# Setup Environment

In [8]:
%reload_ext autoreload
%autoreload 2

from src.helper_visualization import *

import sys
sys.path.append('../src/')

from config import *

In [3]:
import pandas as pd
import numpy as np

import optuna

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder

# Load Data

In [9]:
# Display the processed-data folder used by the loading cell below.
# NOTE(review): presumably exported by the `from config import *` star import - confirm.
DATA_FOLDER_PATH_PROCESSED

'../data/processed'

In [4]:
# Read the cleaned dataset from the processed-data folder and plot the
# 'Length' column with the project's histogram helper.
# NOTE(review): the recorded NameError output comes from running this cell
# before the setup cell that defines DATA_FOLDER_PATH_PROCESSED (execution
# counts are out of order); on a top-to-bottom run the name is already defined.
excel_file = f'{DATA_FOLDER_PATH_PROCESSED}/data_cleaned.xlsx'
df_cleaned = pd.read_excel(excel_file)
hist_by_labels(df_cleaned, 'Length', log=False)

NameError: name 'DATA_FOLDER_PATH_PROCESSED' is not defined

In [None]:
# Keep only the rows whose 'Length' lies strictly between 3 and 25.
mask = (df_cleaned['Length'] > 3) & (df_cleaned['Length'] < 25)
# .copy() detaches the filtered rows from the original frame: a later cell
# relabels 'Product Name' in place via .loc, which on a bare boolean-indexed
# slice triggers pandas' SettingWithCopyWarning.
df_cleaned = df_cleaned[mask].copy()
hist_by_labels(df_cleaned, 'Length')

In [None]:
# Plot the 'Product Name' distribution via the project helper
# (called with top=25 and log=True).
hist_by_labels(df_cleaned, 'Product Name', top=25, log=True)

In [None]:
# Collapse infrequent products into a single 'Others' label.
product_counts = df_cleaned['Product Name'].value_counts()
cutoff = 450  # products with fewer rows than this are merged into 'Others'
product_top = product_counts.index[product_counts >= cutoff]
product_others = product_counts.index[product_counts < cutoff]

mask_others = df_cleaned['Product Name'].isin(product_others)
# Work on an explicit copy: df_cleaned was produced by boolean indexing in an
# earlier cell, and writing into such a slice with .loc can raise
# SettingWithCopyWarning (and is not guaranteed to write through).
df_cleaned = df_cleaned.copy()
df_cleaned.loc[mask_others, 'Product Name'] = 'Others'
hist_by_labels(df_cleaned, 'Product Name', log=True)

In [None]:
# Split the data into train and test BEFORE fitting the vectorizer so that
# the vocabulary is learned from training titles only (fitting on the full
# frame would leak test-set vocabulary into the features).
percent_to_select = 0.15
num_rows_to_select = int(len(df_cleaned) * percent_to_select)

# Randomly hold out 15% of the records as the test set
df_test = df_cleaned.sample(n=num_rows_to_select, random_state=42)  # fixed seed for reproducibility

# The remaining records form the training set
df_train = df_cleaned.drop(df_test.index)

# Vectorize the titles: bag of word uni-/bi-gram counts, capped at 20000 features
vectorizer = CountVectorizer(max_features=20000, analyzer='word', ngram_range=(1, 2))
vectorizer.fit(df_train['Title_Cleaned'])

# Train Petrel_or_Not model

In [None]:
# Build the first-level training frame: every product with fewer than
# 10000 training rows is relabelled 'Others'; the rest keep their name.
df_petrel_or_not = df_train.copy()

product_counts = df_petrel_or_not['Product Name'].value_counts()
product_others = product_counts[product_counts < 10000].index
mask_others = df_petrel_or_not['Product Name'].isin(product_others)

df_petrel_or_not.loc[mask_others, 'Product Name'] = 'Others'

hist_by_labels(df_petrel_or_not, 'Product Name', log=False, horizontal=False)

In [None]:
# Balance the dataset by dropping a random subset of the 'Others' records
import random

# Candidate rows: everything currently labelled 'Others'
others_records = df_petrel_or_not[df_petrel_or_not['Product Name'] == 'Others']
records_to_delete = 30000

# Never ask for more rows than exist (random.sample raises ValueError then)
records_to_delete = min(records_to_delete, len(others_records))

# Randomly select the indices of 'Others' records to delete; a dedicated
# seeded generator keeps the selection identical across kernel restarts.
indices_to_delete = random.Random(42).sample(others_records.index.tolist(), records_to_delete)

# Delete the selected rows from the DataFrame
df_petrel_or_not = df_petrel_or_not.drop(indices_to_delete)
hist_by_labels(df_petrel_or_not, 'Product Name', log=False, horizontal=False)

In [None]:
# Step 1: Data Preparation
# Split the data into features (X) and target labels (y)
X = df_petrel_or_not['Title_Cleaned']
y = df_petrel_or_not['Product Name']

# Step 2: Bag-of-n-grams features from the already fitted CountVectorizer
X_encoded = vectorizer.transform(X)

# Step 3: Model Training (linear model fitted with SGD; default loss)
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.1, random_state=42)

model_petrel_or_not = SGDClassifier(
    max_iter=5000,
    random_state=42,  # SGD shuffles the data each epoch; seed it so results are repeatable
)
model_petrel_or_not.fit(X_train, y_train)

# Step 4: Model Evaluation on the 10% validation split
y_pred_petrel = model_petrel_or_not.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_petrel)
print(f"Accuracy: {accuracy:.3f}")

# Per-class precision / recall / F1 for more detailed metrics
print(classification_report(y_val, y_pred_petrel, digits=3))

# Train the_Rest Model

In [None]:
df_rests = df_train.copy()

# Optionally balance the dataset by dropping a share of the 'Petrel' records
import random

# Fraction of 'Petrel' rows to drop. 0.0 keeps every row, which is what the
# previous expression `len(petrel_records)//10*8*0` always evaluated to;
# raise it (the disabled formula was roughly 0.8) to actually down-sample.
petrel_drop_fraction = 0.0

petrel_records = df_rests[df_rests['Product Name'] == 'Petrel']
records_to_delete = int(len(petrel_records) * petrel_drop_fraction)
# Seeded generator so the dropped rows are reproducible once the fraction is non-zero
indices_to_delete = random.Random(42).sample(petrel_records.index.tolist(), records_to_delete)
df_rests = df_rests.drop(indices_to_delete)

hist_by_labels(df_rests, 'Product Name', log=False)

In [None]:
# Step 1: Data Preparation
# Split the data into features (X) and target labels (y)
X = df_rests['Title_Cleaned']
y = df_rests['Product Name']

# Step 2: Bag-of-n-grams features from the already fitted CountVectorizer
X_encoded = vectorizer.transform(X)

# Step 3: Model Training (linear model fitted with SGD; default loss)
# NOTE: X_train / y_train defined here are also read by the Optuna objective below.
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.1, random_state=42)

# weights = df_rests['ProductName'].value_counts().apply(np.sqrt).to_dict()
model_rests_baseline = SGDClassifier(
    max_iter=5000,
    random_state=42,  # SGD shuffles the data each epoch; seed it so results are repeatable
)
model_rests_baseline.fit(X_train, y_train)

# Step 4: Model Evaluation on the 10% validation split
y_pred_others = model_rests_baseline.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_others)
print(f"Accuracy: {accuracy:.3f}")

# Per-class precision / recall / F1 for more detailed metrics
print(classification_report(y_val, y_pred_others, digits=3))

In [None]:
# Optuna objective: cross-validated accuracy of one SGDClassifier configuration
def objective(trial):
    """Score one hyperparameter suggestion for the SGD classifier.

    Samples `loss`, `alpha` (log-uniform in [1e-6, 1e+2]) and `penalty` from
    `trial`, builds an SGDClassifier with max_iter=5000 and random_state=42,
    and returns its mean 5-fold cross-validation accuracy.

    The training data is read from the notebook globals `X_train` / `y_train`,
    i.e. whatever split was assigned last before the study runs.
    """
    loss = trial.suggest_categorical('loss', ['hinge', 'log_loss', 'modified_huber'])
    alpha = trial.suggest_float('alpha', 1e-6, 1e+2, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])

    candidate = SGDClassifier(
        loss=loss,
        alpha=alpha,
        penalty=penalty,
        max_iter=5000,
        random_state=42,
    )

    # 5-fold cross-validation on the training split (the validation split is not touched)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='accuracy')
    return np.mean(fold_scores)

# Create an Optuna study and maximise the cross-validated accuracy
study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    n_trials=50,  # number of hyperparameter configurations to try
    n_jobs=-1,
    show_progress_bar=True
    )

# Best searched hyperparameters and the corresponding CV accuracy
best_params = study.best_params
best_accuracy = study.best_value

# study.best_params only contains the values suggested through `trial`
# (loss, alpha, penalty). The fixed settings used inside `objective`
# (max_iter=5000, random_state=42) must be passed again, otherwise the final
# model is trained with sklearn defaults and differs from the tuned one.
model_rests_best = SGDClassifier(**best_params, max_iter=5000, random_state=42)
model_rests_best.fit(X_train, y_train)
print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

In [None]:
# Score the tuned model on the validation split (X_val / y_val), not the test set
y_pred = model_rests_best.predict(X_val)
print(
    classification_report(y_true=y_val, y_pred=y_pred, digits=3)
)

# Assemble Model

In [None]:
# Quick manual check of the two-level merge on hand-written labels.
# NOTE(review): `predictions_2L` is defined in a LATER cell of this notebook,
# so this cell raises NameError on Restart & Run All; run it after that cell
# (or move it below the definition).
predictions_2L(['Others', 'Others'], ['Petrotechnical Suite - Domain Profiles', 'Techlog'])

In [None]:
# Spot-check one sample: first-level, second-level and merged prediction.
# NOTE(review): `y_pred_2L` is only assigned in a later cell, so this fails on
# a fresh top-to-bottom run. Also, `y_pred_others` holds predictions on the
# validation split while `y_pred_2L` is computed on the test set, so index `i`
# likely refers to different records - `y_pred_rests` may be what was meant; confirm.
i  = 10
y_pred_petrel[i], y_pred_others[i], y_pred_2L[i]

In [None]:
def predictions_2L(predictions_petrel, predictions_others):
    """Merge the predictions of the two-level classifier.

    For every position, the first-level prediction wins when it is exactly
    'Petrel'; otherwise the second-level prediction is kept.

    Parameters
    ----------
    predictions_petrel : sequence of str
        First-level ("Petrel or not") labels.
    predictions_others : sequence of str
        Second-level labels, same length as `predictions_petrel`.

    Returns
    -------
    list or numpy.ndarray
        A new list when `predictions_others` is a list, otherwise a new
        NumPy array. The inputs are never modified.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    if len(predictions_petrel) != len(predictions_others):
        raise ValueError(
            "predictions_petrel and predictions_others must have the same length "
            f"({len(predictions_petrel)} != {len(predictions_others)})"
        )
    if len(predictions_others) == 0:
        return predictions_others.copy()

    # Object dtype forces a plain element-wise string comparison
    is_petrel = np.asarray(predictions_petrel, dtype=object) == 'Petrel'
    # np.where allocates a result wide enough for 'Petrel'; assigning into a
    # fixed-width string array (e.g. dtype '<U1') would silently truncate it.
    merged = np.where(is_petrel, 'Petrel', np.asarray(predictions_others))

    if isinstance(predictions_others, list):
        return merged.tolist()
    return merged

# Evaluate the assembled two-level model on the held-out test set
X = df_test['Title_Cleaned']
y_test = df_test['Product Name']

X_encoded = vectorizer.transform(X)

# Level 1: Petrel vs. everything else (overwrites the earlier validation predictions)
y_pred_petrel = model_petrel_or_not.predict(X_encoded)

# Level 2: the tuned model; assign model_rests_baseline here instead to compare.
# (The previous baseline assignment was immediately overwritten and had no effect.)
model_rests = model_rests_best
y_pred_rests = model_rests.predict(X_encoded)

# Level-1 'Petrel' overrides the level-2 label
y_pred_2L = predictions_2L(y_pred_petrel, y_pred_rests)
accuracy = accuracy_score(y_test, y_pred_2L)
print(f"Accuracy: {accuracy:.3f}")

print(classification_report(y_test, y_pred_2L, digits=3))