In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [6]:
def preprocess_data_others(df):
    df = df[['SEAMAN NAME', 'SEAMAN CODE', 'SEAFARER CODE', 'RANK', 'VESSEL', 'UMUR', 'CERTIFICATE']]
    df['UMUR'] = df['UMUR'].apply(lambda x: int(str(x).split()[0]))
    
    le_rank = LabelEncoder()
    le_vessel = LabelEncoder()
    le_certificate = LabelEncoder()
    
    df['RANK_ENCODED'] = le_rank.fit_transform(df['RANK'])
    df['VESSEL_ENCODED'] = le_vessel.fit_transform(df['VESSEL'])
    df['CERTIFICATE_ENCODED'] = le_certificate.fit_transform(df['CERTIFICATE'])
    
    df_encoded = df[['RANK_ENCODED', 'VESSEL_ENCODED', 'UMUR', 'CERTIFICATE_ENCODED']]
    
    return df, df_encoded, le_rank, le_vessel, le_certificate

def search_candidates_others(rank, vessel, certificate, age_range, df_original, df_encoded, le_rank, le_vessel, le_certificate):
    df_filtered_indices = df_encoded[(df_encoded['UMUR'] >= age_range[0]) & (df_encoded['UMUR'] <= age_range[1])].index
    
    if df_filtered_indices.empty:
        return "No candidates found within the specified UMUR range."
    
    input_encoded = np.array([
        le_rank.transform([rank])[0],
        le_vessel.transform([vessel])[0],
        np.mean(age_range), 
        le_certificate.transform([certificate])[0]
    ]).reshape(1, -1)
    
    similarity_scores = cosine_similarity(input_encoded, df_encoded.loc[df_filtered_indices])

    df_filtered = df_original.loc[df_filtered_indices].copy()
    df_filtered['Similarity'] = similarity_scores[0]
    df_filtered['Rank_Priority'] = df_filtered['RANK'] == rank
    df_filtered['Certificate_Priority'] = df_filtered['CERTIFICATE'] == certificate
    df_filtered['Vessel_Priority'] = df_filtered['VESSEL'] == vessel
    
    df_filtered = df_filtered.sort_values(
        by=['Rank_Priority', 'Certificate_Priority', 'Vessel_Priority', 'Similarity'],
        ascending=[False, False, False, False]
    )
    
    result = df_filtered[['SEAMAN NAME', 'SEAMAN CODE', 'SEAFARER CODE', 'RANK', 'VESSEL', 'UMUR', 'CERTIFICATE', 'Similarity']].head(5)
    
    return result


file_path = './data/Seamen Report.xlsx'
xls = pd.ExcelFile(file_path)

df_others = pd.read_excel(xls, sheet_name='others')
df_others_original, df_others_encoded, le_rank_others, le_vessel_others, le_certificate_others = preprocess_data_others(df_others)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['UMUR'] = df['UMUR'].apply(lambda x: int(str(x).split()[0]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RANK_ENCODED'] = le_rank.fit_transform(df['RANK'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['VESSEL_ENCODED'] = le_vessel.fit_transform(df['VESSEL'])
A value is trying to be s

In [7]:
result_others = search_candidates_others(
    rank="ELECTRICIAN",
    vessel="DARAT",
    certificate="BASIC SAFETY TRAINING",
    age_range=(35, 50),
    df_original=df_others_original,
    df_encoded=df_others_encoded,
    le_rank=le_rank_others,
    le_vessel=le_vessel_others,
    le_certificate=le_certificate_others
)

# Print the result
print(result_others)

          SEAMAN NAME  SEAMAN CODE  SEAFARER CODE          RANK  \
167           LASIAJI     20120121     6201643820   ELECTRICIAN   
166    ARIS MARDIYONO     20180375     6200565338   ELECTRICIAN   
297  SUNU TRI NUGROHO     20140151     6201017885   ELECTRICIAN   
95        ANDY WIJAYA     20230238     6200409892        FITTER   
193            SARANA     20170038     6201398737  JURU MASAK I   

                                VESSEL  UMUR            CERTIFICATE  \
167        PENDING GAJI (Pending Cuti)    44  BASIC SAFETY TRAINING   
166        PENDING GAJI (Pending Cuti)    50                    ETO   
297  PENDING GAJI (Pending Gaji Biasa)    43                    ETO   
95                 DARAT (Darat Biasa)    46  BASIC SAFETY TRAINING   
193        PENDING GAJI (Pending Cuti)    44  BASIC SAFETY TRAINING   

     Similarity  
167    0.997804  
166    0.998102  
297    0.995890  
95     0.999358  
193    0.996850  
