In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import joblib
import warnings

In [19]:
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
warnings.filterwarnings('ignore', category=FutureWarning, module='sklearn')

In [20]:
# Preprocessing: load both CSV files and build the cleaned AC catalogue.
print("--- Loading and Cleaning Data ---")
try:
    ac_df_original = pd.read_csv('datasetAc(datasetAc).csv')
    env_samples_df = pd.read_csv('OFFC_EMY.csv')
except FileNotFoundError as missing_file_error:
    # Raise instead of calling exit(): exit() ends the interpreter session
    # (in Jupyter it requests a kernel shutdown), which hides the cause and
    # throws away all notebook state. A raised error stops the cell with a
    # traceback and keeps the kernel alive.
    raise FileNotFoundError(
        "Error: Make sure 'datasetAc(datasetAc).csv' and 'OFFC_EMY.csv' are in the same directory."
    ) from missing_file_error

# ac_df_original is left untouched; every step below returns a new frame, so
# re-running this cell always starts from the raw data.
# Selecting the three needed columns by name also discards any trailing
# "Unnamed" column, so no separate positional drop is required.
ac_df = (
    ac_df_original[['AC name', 'cooling capacity BTU/h', 'power input kW']]
    .rename(columns={
        'AC name': 'AC_Name',
        'cooling capacity BTU/h': 'BTU_Capacity',
        'power input kW': 'Power_Input_kW',
    })
    .dropna()
    # Non-numeric entries become NaN here and are removed by the next dropna.
    .assign(
        BTU_Capacity=lambda df: pd.to_numeric(df['BTU_Capacity'], errors='coerce'),
        Power_Input_kW=lambda df: pd.to_numeric(df['Power_Input_kW'], errors='coerce'),
    )
    .dropna(subset=['BTU_Capacity', 'Power_Input_kW'])
    # Keep only rows with strictly positive capacity and power input.
    .loc[lambda df: (df['BTU_Capacity'] > 0) & (df['Power_Input_kW'] > 0)]
    .reset_index(drop=True)
)
print(f"Cleaned AC data: {ac_df.shape[0]} records")

print("\n--- Engineering Features from AC Name (Physical Type) ---")
def get_ac_prefix(ac_name):
    """Return the upper-cased model-name prefix (the text before the first '-').

    Parameters
    ----------
    ac_name : object
        The AC model name, e.g. ``'PLFY-P100VEM-PA'``.

    Returns
    -------
    str
        The prefix with surrounding whitespace removed and upper-cased
        (``'PLFY'`` for the example above). ``"UNKNOWN_PREFIX"`` is returned
        when ``ac_name`` is not a string or when the prefix is blank.
    """
    if not isinstance(ac_name, str):
        return "UNKNOWN_PREFIX"
    # Strip whitespace so names such as ' PLFY-P100' or 'PLFY -P100' still
    # match the keys of the prefix-to-physical-type lookup.
    prefix = ac_name.split('-')[0].strip().upper()
    return prefix or "UNKNOWN_PREFIX"
# Lookup from model-name prefix to the physical installation type.
prefix_to_physical_type_map = {
    'PLFY': 'Ceiling_Cassette',
    'PKFY': 'Wall_Mounted',
    'PEFY': 'Ceiling_Concealed_Ducted',
    'PCFY': 'Ceiling_Suspended',
    'PFFY': 'Floor_Standing',
}

# Derive the prefix of every model name, then translate it to a physical type;
# prefixes missing from the lookup are labelled 'Other_Unknown'.
ac_df['AC_Prefix'] = ac_df['AC_Name'].map(get_ac_prefix)
mapped_physical_types = ac_df['AC_Prefix'].map(prefix_to_physical_type_map)
ac_df['Physical_Type'] = mapped_physical_types.where(mapped_physical_types.notna(), 'Other_Unknown')
print("\nAC Physical Types Assigned:", ac_df['Physical_Type'].value_counts(), sep="\n")

# Environmental samples: keep complete, unique Temp/Humid pairs with
# 10 <= Temp <= 40 and 0 <= Humid <= 100 (both bounds inclusive).
env_df_cleaned = (
    env_samples_df[['Temp', 'Humid']]
    .dropna()
    .loc[lambda env: env['Temp'].between(10, 40) & env['Humid'].between(0, 100)]
    .drop_duplicates()
    .reset_index(drop=True)
)

--- Loading and Cleaning Data ---
Cleaned AC data: 116 records

--- Engineering Features from AC Name (Physical Type) ---

AC Physical Types Assigned:
Physical_Type
Ceiling_Concealed_Ducted    50
Floor_Standing              24
Ceiling_Cassette            23
Wall_Mounted                 8
Other_Unknown                7
Ceiling_Suspended            4
Name: count, dtype: int64


In [21]:
#BTU calculation
# Model constants for the required-BTU formula.
B_BASE = 500               # base load multiplied by the room volume
T0_BASELINE_TEMP = 25      # temperature at which the temperature factor equals 1
H0_BASELINE_HUMID = 50     # humidity at which the humidity factor equals 1
ALPHA_TEMP_FACTOR = 0.02   # relative change per unit of temperature above/below baseline
BETA_HUMID_FACTOR = 0.01   # relative change per unit of humidity above/below baseline


def calculate_required_btu(temp, humid, volume):
    """Estimate the required cooling capacity for a room.

    The base load ``B_BASE * volume`` is scaled by a temperature factor and a
    humidity factor, each linear in the deviation from its baseline. The
    result is floored at 0, so it is never negative.
    """
    temp_factor = 1 + ALPHA_TEMP_FACTOR * (temp - T0_BASELINE_TEMP)
    humid_factor = 1 + BETA_HUMID_FACTOR * (humid - H0_BASELINE_HUMID)
    scaled_load = B_BASE * volume * temp_factor * humid_factor
    if scaled_load > 0:
        return scaled_load
    return 0

In [22]:
#define AC type categories (granular)

def get_capacity_label_for_btu_value(btu_capacity_value):
    """Map a BTU value to its capacity tier.

    Tiers use exclusive upper bounds: below 9000 is 'Small', below 18000 is
    'Medium', below 36000 is 'Large', and everything else is 'Very_Large'.
    """
    tier_upper_bounds = (
        (9000, 'Small'),
        (18000, 'Medium'),
        (36000, 'Large'),
    )
    for upper_bound, tier_label in tier_upper_bounds:
        if btu_capacity_value < upper_bound:
            return tier_label
    return 'Very_Large'

def categorize_ac_granular(row):
    """Build the granular category '<capacity tier>_<physical type>' for one AC row.

    ``row`` must provide the 'BTU_Capacity' and 'Physical_Type' entries.
    """
    return "{}_{}".format(
        get_capacity_label_for_btu_value(row['BTU_Capacity']),
        row['Physical_Type'],
    )
# Label every AC row with its granular category ('<capacity tier>_<physical type>').
# This adds the 'AC_Type_Category' column to ac_df in place.
ac_df['AC_Type_Category'] = ac_df.apply(categorize_ac_granular, axis=1)
# Optional debug output: distribution of the generated categories.
#print("New Granular AC Categories defined:\n", ac_df['AC_Type_Category'].value_counts())


In [23]:
# Generate training data for the ML model.
#
# Every cleaned Temp/Humid sample is combined with a grid of simulated room
# areas and heights. For each combination the required BTU is computed and
# labelled with the category of the best-fitting AC unit.

simulated_areas = np.arange(10, 51, 5)
simulated_heights = np.arange(2.5, 4.6, 0.5)

# Loop-invariant lookups, computed once instead of re-filtering and re-sorting
# the catalogue for every simulated room.
# The best fit is the smallest-capacity unit (ties broken by lowest power
# input) whose capacity is >= the requirement. Restricting the search to a
# "within 25% over" window first does not change which unit is picked, because
# the smallest adequate unit is the same with or without the window.
acs_by_capacity = ac_df.sort_values(by=['BTU_Capacity', 'Power_Input_kW'])
sorted_capacities = acs_by_capacity['BTU_Capacity'].to_numpy()
sorted_categories = acs_by_capacity['AC_Type_Category'].to_numpy()

# Used when no unit is large enough: the category of the largest-capacity
# unit, or a placeholder label if the catalogue is empty.
if ac_df.empty:
    fallback_category = "No_AC_Available"
else:
    fallback_category = ac_df.sort_values(by='BTU_Capacity', ascending=False).iloc[0]['AC_Type_Category']

training_data_list = []
for _, env_row in env_df_cleaned.iterrows():
    temp = env_row['Temp']
    humid = env_row['Humid']
    for area in simulated_areas:
        for height in simulated_heights:
            volume = area * height
            btu_required = calculate_required_btu(temp, humid, volume)

            # Position of the first catalogue entry with capacity >= btu_required.
            first_fit = np.searchsorted(sorted_capacities, btu_required, side='left')
            if first_fit < len(sorted_capacities):
                optimal_category = sorted_categories[first_fit]
            else:
                optimal_category = fallback_category

            training_data_list.append({'Temp': temp, 'Humid': humid, 'Area': area, 'Height': height,
                                       'Volume': volume, 'BTU_Required': btu_required,
                                       'Optimal_AC_Type_Category': optimal_category})

training_df = pd.DataFrame(training_data_list).dropna()

if training_df.empty or training_df['Optimal_AC_Type_Category'].nunique() < 2:
    # Raise instead of exit(): exit() ends the interpreter session, whereas a
    # raised error stops the cell with a traceback and keeps the state alive.
    raise ValueError("Error: Not enough training data or distinct categories generated. Review Steps 3 & 4.")

In [24]:
# Train a classification model that predicts the optimal AC category.

X = training_df[['Temp', 'Humid', 'Area', 'Height', 'Volume', 'BTU_Required']]
y_categorical = training_df['Optimal_AC_Type_Category']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_categorical)

# Split BEFORE scaling: fitting the scaler on the full dataset would leak the
# test rows' mean/variance into preprocessing. The scaler is therefore fitted
# on the training portion only and merely applied to the test portion.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that any later cell referring to X_scaled still finds it; it is the
# full feature matrix transformed with the training-fitted scaler.
X_scaled = scaler.transform(X)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_classifier.fit(X_train, y_train)

# Accuracy and classification report on the held-out split.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")
# Passing labels explicitly keeps target_names aligned with the encoder even
# if a rare class is missing from both y_test and y_pred.
all_class_ids = np.arange(len(label_encoder.classes_))
print("Classification Report:\n", classification_report(y_test, y_pred, labels=all_class_ids,
                                                         target_names=label_encoder.classes_, zero_division=0))


Model Accuracy: 0.9999
Classification Report:
                                      precision    recall  f1-score   support

             Large_Ceiling_Cassette       1.00      1.00      1.00      6364
     Large_Ceiling_Concealed_Ducted       1.00      1.00      1.00      1550
            Medium_Ceiling_Cassette       1.00      1.00      1.00       871
              Medium_Floor_Standing       1.00      1.00      1.00      1277
             Small_Ceiling_Cassette       1.00      1.00      1.00        23
        Very_Large_Ceiling_Cassette       1.00      1.00      1.00     11818
Very_Large_Ceiling_Concealed_Ducted       1.00      1.00      1.00     10622
          Very_Large_Floor_Standing       1.00      1.00      1.00      1248

                           accuracy                           1.00     33773
                          macro avg       1.00      1.00      1.00     33773
                       weighted avg       1.00      1.00      1.00     33773



In [None]:
# Prediction / recommendation function (hybrid logic: ML suggestion + user preference).

# Upper bounds of the accepted capacity window, as a multiple of the required
# BTU. Each fallback stage uses a wider window than the previous one.
_CATEGORY_BTU_MARGIN = 1.30   # exact category match (preferred or ML-suggested)
_TYPE_BTU_MARGIN = 1.35       # any unit of the preferred physical type
_GENERAL_BTU_MARGIN = 1.40    # any unit at all (widest margin; adjust if needed)


def _filter_acs_by_btu_window(ac_data, btu_required, max_ratio, extra_mask=None):
    """Return rows with btu_required <= BTU_Capacity <= btu_required * max_ratio,
    optionally restricted further by the boolean ``extra_mask``."""
    capacity = ac_data['BTU_Capacity']
    window_mask = (capacity >= btu_required) & (capacity <= btu_required * max_ratio)
    if extra_mask is not None:
        window_mask = window_mask & extra_mask
    return ac_data[window_mask]


def _resolve_preferred_type(preferred_physical_type, available_types):
    """Map a user-supplied physical type onto one of ``available_types``.

    An exact match wins. Otherwise the input is normalised (trimmed, spaces and
    hyphens turned into underscores) and compared case-insensitively. Returns
    None when nothing matches.
    """
    if preferred_physical_type in available_types:
        return preferred_physical_type
    normalized = str(preferred_physical_type).strip().replace(' ', '_').replace('-', '_').lower()
    for known_type in available_types:
        if str(known_type).lower() == normalized:
            return known_type
    return None


def get_ac_recommendation_hybrid(temp, humid, area, height, # user inputs
                                 trained_model, feature_scaler, category_encoder, ac_data_full,
                                 preferred_physical_type=None): # user preference
    """Recommend AC models for a room by combining the ML prediction with an
    optional user preference for the physical unit type.

    Parameters
    ----------
    temp, humid, area, height : numbers
        Room conditions and dimensions; volume is ``area * height``.
    trained_model : object with ``predict``
        Classifier returning an encoded AC category.
    feature_scaler : object with ``transform``
        Scaler applied to the feature row
        ``[temp, humid, area, height, volume, btu_required]``.
    category_encoder : object with ``inverse_transform``
        Decodes the model output into a category name.
    ac_data_full : pandas.DataFrame
        AC catalogue; the columns 'AC_Name', 'BTU_Capacity', 'Power_Input_kW',
        'AC_Type_Category' and 'Physical_Type' are read.
    preferred_physical_type : str, optional
        Desired physical type (e.g. ``'Wall_Mounted'``). Matching tolerates
        surrounding whitespace, spaces/hyphens instead of underscores, and
        letter case. An unrecognised value is reported and ignored.

    Returns
    -------
    tuple
        ``(final_search_category, recommended_acs)``: the category targeted by
        the first search, and the matching catalogue rows sorted by power
        input then capacity (an empty frame when nothing was found). When a
        fallback stage supplied the rows, they may belong to categories other
        than ``final_search_category``.

    Progress and the top five results are printed as a side effect.
    """
    volume = area * height
    btu_required = calculate_required_btu(temp, humid, volume)
    print(f"\nInput: Temp={temp}°C, Humid={humid}%, Area={area}m², Height={height}m, Volume={volume:.2f}m³")
    print(f"Calculated Required BTU: {btu_required:.2f}")

    # Feature order must match the order of the columns used for training.
    input_features_ml = np.array([[temp, humid, area, height, volume, btu_required]])
    input_features_ml_scaled = feature_scaler.transform(input_features_ml)

    ml_predicted_category_encoded = trained_model.predict(input_features_ml_scaled)
    ml_predicted_category = category_encoder.inverse_transform(ml_predicted_category_encoded)[0]
    print(f"ML's Initial Suggested AC Type Category: {ml_predicted_category}")

    final_search_category = ml_predicted_category  # default to the ML suggestion
    resolved_type = None  # canonical preferred type; stays None if absent or unrecognised

    if preferred_physical_type:
        available_types = list(ac_data_full['Physical_Type'].unique())
        resolved_type = _resolve_preferred_type(preferred_physical_type, available_types)
        if resolved_type is None:
            print(f"Warning: Preferred physical type '{preferred_physical_type}' is not recognized or available. Using ML suggestion.")
        else:
            # Combine the capacity tier implied by the required BTU with the
            # user's preferred physical type, e.g. "Large_Wall_Mounted".
            capacity_label_for_btu = get_capacity_label_for_btu_value(btu_required)
            user_defined_category = f"{capacity_label_for_btu}_{resolved_type}"
            print(f"User preferred Physical Type: '{resolved_type}'. Target category adjusted to: '{user_defined_category}'")
            final_search_category = user_defined_category
    else:
        print(f"No user preference for physical type. Using ML's suggestion: '{ml_predicted_category}'")

    # Stage 1: exact category match within the category margin.
    recommended_acs = _filter_acs_by_btu_window(
        ac_data_full, btu_required, _CATEGORY_BTU_MARGIN,
        ac_data_full['AC_Type_Category'] == final_search_category,
    )

    # Stage 2: nothing found -> any unit of the preferred physical type with a
    # slightly wider margin. Skipped when the preference was not recognised,
    # since that search could never match.
    if recommended_acs.empty and resolved_type is not None:
        print(f"No ideal ACs found for '{final_search_category}' matching BTU. Searching for any {resolved_type} that meet BTU.")
        recommended_acs = _filter_acs_by_btu_window(
            ac_data_full, btu_required, _TYPE_BTU_MARGIN,
            ac_data_full['Physical_Type'] == resolved_type,
        )

    # Stage 3: still nothing -> the ML-suggested category (if it differs from
    # what was already searched), then any unit within the widest margin.
    if recommended_acs.empty:
        print("Still no suitable ACs. Broadening search based on ML's original suggestion or general BTU match...")
        if final_search_category != ml_predicted_category:
            recommended_acs = _filter_acs_by_btu_window(
                ac_data_full, btu_required, _CATEGORY_BTU_MARGIN,
                ac_data_full['AC_Type_Category'] == ml_predicted_category,
            )
        if recommended_acs.empty:
            recommended_acs = _filter_acs_by_btu_window(ac_data_full, btu_required, _GENERAL_BTU_MARGIN)

    if not recommended_acs.empty:
        # Lowest power input first; ties broken by the smaller capacity.
        recommended_acs = recommended_acs.sort_values(by=['Power_Input_kW', 'BTU_Capacity'])
        print("\nRecommended AC Models (Top 5):")
        print(recommended_acs[['AC_Name', 'BTU_Capacity', 'Power_Input_kW', 'AC_Type_Category', 'Physical_Type']].head())
    else:
        print("No suitable AC models found for the given requirements, even after broadening search.")
    return final_search_category, recommended_acs




--- Defining Recommendation Function (Hybrid) ---


In [26]:
# Persist the trained artifacts, reload them, and run the example scenarios.

MODEL_ARTIFACT_PATH = 'ac_type_classifier_model_v4.pkl'
ENCODER_ARTIFACT_PATH = 'ac_type_label_encoder_v4.pkl'
SCALER_ARTIFACT_PATH = 'feature_scaler_v4.pkl'

print("\n--- Saving Model and Encoders ---")
joblib.dump(rf_classifier, MODEL_ARTIFACT_PATH)
joblib.dump(label_encoder, ENCODER_ARTIFACT_PATH)
joblib.dump(scaler, SCALER_ARTIFACT_PATH)
print("Model, Label Encoder, and Scaler saved (v4 - Hybrid).")

# Hybrid approach: reload the artifacts from disk so the examples below
# exercise exactly what was saved.
print("\n--- Example Recommendation (Hybrid) ---")
loaded_model = joblib.load(MODEL_ARTIFACT_PATH)
loaded_label_encoder = joblib.load(ENCODER_ARTIFACT_PATH)
loaded_scaler = joblib.load(SCALER_ARTIFACT_PATH)

# Test scenario 1: no preferred physical type.
print("\n--- Scenario 1: No User Preference for Physical Type ---")
user_temp, user_humid, user_area, user_height = 28, 70, 20, 2.5
final_cat, recs = get_ac_recommendation_hybrid(
    user_temp, user_humid, user_area, user_height,
    loaded_model, loaded_scaler, loaded_label_encoder, ac_df,
)

# Test scenario 2: same room, user prefers a "Wall_Mounted" unit.
# The preferred type must match the values stored in ac_df['Physical_Type'].
print("\n--- Scenario 2: User Prefers 'Wall_Mounted' ---")
user_temp, user_humid, user_area, user_height = 28, 70, 20, 2.5
user_preferred_type = 'Wall_Mounted'
final_cat_pref, recs_pref = get_ac_recommendation_hybrid(
    user_temp, user_humid, user_area, user_height,
    loaded_model, loaded_scaler, loaded_label_encoder, ac_df,
    preferred_physical_type=user_preferred_type,
)

# Test scenario 3: user prefers "Ceiling_Cassette".
print("\n--- Scenario 3: User Prefers 'Ceiling_Cassette' (Small Room, High Ceiling) ---")
user_temp_2, user_humid_2, user_area_2, user_height_2 = 26, 85, 10, 3.5
user_preferred_type_2 = 'Ceiling_Cassette'
final_cat_pref_2, recs_pref_2 = get_ac_recommendation_hybrid(
    user_temp_2, user_humid_2, user_area_2, user_height_2,
    loaded_model, loaded_scaler, loaded_label_encoder, ac_df,
    preferred_physical_type=user_preferred_type_2,
)

# Test scenario 4: user prefers "Floor_Standing" (meant to exercise the
# no-good-match path).
print("\n--- Scenario 4: User Prefers 'Floor_Standing' (Large Area) ---")
user_temp_3, user_humid_3, user_area_3, user_height_3 = 25, 60, 40, 2.8
user_preferred_type_3 = 'Floor_Standing'
final_cat_pref_3, recs_pref_3 = get_ac_recommendation_hybrid(
    user_temp_3, user_humid_3, user_area_3, user_height_3,
    loaded_model, loaded_scaler, loaded_label_encoder, ac_df,
    preferred_physical_type=user_preferred_type_3,
)

# Test scenario 5: very large space, no preference.
print("\n--- Scenario 5: ML Suggestion for Very Large Space (no user preference) ---")
user_temp_4, user_humid_4, user_area_4, user_height_4 = 35, 75, 60, 3.0
final_cat_4, recs_4 = get_ac_recommendation_hybrid(
    user_temp_4, user_humid_4, user_area_4, user_height_4,
    loaded_model, loaded_scaler, loaded_label_encoder, ac_df,
)




--- Saving Model and Encoders ---
Model, Label Encoder, and Scaler saved (v4 - Hybrid).

--- Example Recommendation (Hybrid) ---

--- Scenario 1: No User Preference for Physical Type ---

Input: Temp=28°C, Humid=70%, Area=20m², Height=2.5m, Volume=50.00m³
Calculated Required BTU: 31800.00
ML's Initial Suggested AC Type Category: Very_Large_Ceiling_Cassette
No user preference for physical type. Using ML's suggestion: 'Very_Large_Ceiling_Cassette'

Recommended AC Models (Top 5):
            AC_Name  BTU_Capacity  Power_Input_kW  \
5   PLFY-P100VEM-PA       38200.0           0.070   
21  PLFY-P100VLMD-E       38200.0           0.157   

               AC_Type_Category     Physical_Type  
5   Very_Large_Ceiling_Cassette  Ceiling_Cassette  
21  Very_Large_Ceiling_Cassette  Ceiling_Cassette  

--- Scenario 2: User Prefers 'Wall_Mounted' ---

Input: Temp=28°C, Humid=70%, Area=20m², Height=2.5m, Volume=50.00m³
Calculated Required BTU: 31800.00
ML's Initial Suggested AC Type Category: Very_Lar