In [1]:
from sdv.datasets.demo import download_demo
real_data, metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests')
    

In [2]:
from sdv.lite import SingleTablePreset

synthesizer = SingleTablePreset(metadata, name='FAST_ML')
synthesizer.fit(data=real_data)

In [176]:
from typing import List, Tuple
from sdv.single_table import CTGANSynthesizer
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sdv.metadata import SingleTableMetadata
from sklearn.neighbors import KernelDensity
from sdv.lite import SingleTablePreset

# push + prd

# imbalanced에 data level로 해결하는 모델
class FiGen:
    def __init__(self, ratio: float, index: List[str]):
        """
        고정적으로 사용하는 값을 저장
        
        Args:
            ratio (float): small class+생성된 데이터와 large class의 비율 
            index (List[int]): 범주형, 연속형 구분하기 위한 연속형 변수의 컬럼명 인덱스       
        """
        self.result = 0
        self.ratio = ratio
        self.index = index
    

    def extract_middle_percent(self, data: pd.DataFrame, start: float, last:float):
        """
        데이터의 분포 중 중간 부분을 추출 
        
        Args:
            data : 입력 데이터
            start : 추출 시작 percentile 
            last : 추출 끝 percentile
        Returns:    
            데이터의 분포 중 중간 부분을 추출하여 리턴
        """
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(data.values)
        kde = KernelDensity(kernel="gaussian", bandwidth=0.5).fit(
            data_scaled
        )  ##TODO: 계산이 안터지도록 하기, gmm으로 변경
        log_prob = kde.score_samples(data_scaled)
        prob = np.exp(log_prob)
        threshold_low, threshold_high = np.percentile(prob, [start, last])
        mask = np.logical_and(prob >= threshold_low, prob <= threshold_high)
        data_middle = data[mask]

        if len(data_middle) > 0:
            return data_middle
        else:
            print("No middle 50% found, returning original data")
            return []
        
    def find_categorical(
        self, suitable_generated_small_X: pd.DataFrame, categorical_small_X: pd.DataFrame, small_X: pd.DataFrame
    ):  
        """
        생성된 연속형변수와 기존 연속형 변수의 cosine simmilarity를 기준으로 가장 가까운 기존 변수를 찾은 후 해당 변수의 범주형 값을 가져옴
        
        Args:
            suitable_generated_small_X : 생성된 적합한 small class의 연속형 변수만 있는 x 
            small_X : small class의 연속형, 범주형 변수가 모두 있는 orgin x
        Returns:
            생성된 연속 변수를 범주형 변수값이 결합된 형태로 리턴 
        """

        # Min-Max 스케일링을 위한 객체 생성
        scaler = MinMaxScaler()


        # 열별 Min-Max 스케일링 수행
        suitable_generated_small_scaled_X = pd.DataFrame(
            scaler.fit_transform(suitable_generated_small_X),
            columns=suitable_generated_small_X.columns,
        )
 
        orgin_small_non_cat_scaled_X = pd.DataFrame(
            scaler.fit_transform(small_X[self.index]),
            columns=self.index
        )

        # 데이터프레임을 numpy 배열로 변환
        array_mxn = suitable_generated_small_scaled_X.values
        array_kxn = orgin_small_non_cat_scaled_X.values
    
        # 코사인 유사도 계산
        cosine_similarities = cosine_similarity(array_mxn, array_kxn)
      
        # 각 행에서 최대값을 가지는 열의 인덱스를 가져와서 리스트로 만들기
        max_indices = np.argmax(cosine_similarities, axis=1).tolist()
        

        # 중복된 행 인덱스에 해당하는 데이터 선택하여 배열에 저장
        combined_rows = np.concatenate([
        suitable_generated_small_X,
        categorical_small_X.values[max_indices]
        ], axis=1)

        # 모아진 행들을 데이터프레임으로 변환하여 synthetic_small_X 생성
        column_names = (
            suitable_generated_small_scaled_X.columns.tolist() +
            categorical_small_X.columns.tolist()
            )
        synthetic_small_X = pd.DataFrame(combined_rows, columns=column_names)

        return synthetic_small_X

    def suitable_judge(self, midlle_small_X:pd.DataFrame, small_X: pd.DataFrame, large_X: pd.DataFrame):
        """
           generated_x : 생성된 small class x 데이터
           small_X : 원본 small class x 데이터 
           large_X : 원본 large class x 데이터
        """
        # 연속형small x로 뽑아야함
        center_small_X = np.mean(
            small_X[self.index].values, axis=0, dtype=np.float64, out=None 
        )
        radius_small_X = np.max(
            np.linalg.norm(small_X[self.index].values - center_small_X, axis=1)
        )

        center_large_X = np.mean(
            large_X[self.index].values, axis=0, dtype=np.float64, out=None 
        )

        radius_large_X = np.max(
            np.linalg.norm(large_X[self.index].values - center_large_X, axis=1)
        )

        synthetic_sample = pd.DataFrame()  # 최종 합치기
       

        # ctgan으로 연속형 생성 부분
        metadata = SingleTableMetadata()
        metadata.detect_from_dataframe(data=midlle_small_X)
        
        synthesizer = SingleTablePreset(metadata, name='FAST_ML')
        synthesizer.fit(data=midlle_small_X)
        
        
        # 합성된 개수 / 원래 large 클래스 개수 <= ratio 만족시 그만 생성    
        
        while len(synthetic_sample) / len(large_X) < self.ratio:

            # large class의 데이터 사이즈 10배 만큼 데이터 생성
            synthetic_data = synthesizer.sample(num_rows=len(large_X))  

            synthetic_samples_to_generate = int((self.ratio - len(synthetic_sample) / len(large_X)) * len(large_X))
            if synthetic_samples_to_generate == 0:
                break  # 더 이상 생성이 필요하지 않을 경우 루프를 빠져나감
            z = synthetic_data.iloc[:synthetic_samples_to_generate]  # 벡터화된 방식으로 일괄 처리
        
            distances_small = np.linalg.norm(z.values[:, np.newaxis, :] - center_small_X, axis=2)
            distances_large = np.linalg.norm(z.values[:, np.newaxis, :] - center_large_X, axis=2)
        
            small_condition = distances_small < radius_small_X
            large_condition = distances_large < radius_large_X

            # 생성된 small class 데이터가 small, large class 중 small에 가까운지, small class의 지름을 넘지는 않는지
            condition = np.logical_and(small_condition, distances_small < distances_large)
        
            synthetic_sample = pd.concat([synthetic_sample, z[condition]])
            
        return synthetic_sample.reset_index(drop=True)
    
    
    def generate_synthetic(
        self, small_X: pd.DataFrame, large_X: pd.DataFrame, small_Y: pd.DataFrame, large_Y: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """
        생성된 데이터셋 + 기존 데이터셋을 합쳐 통합 데이터셋을 생성
        
        Args:
            small_X (pd.DataFrame): small class의 x
            large_X (pd.DataFrame): large class의 x
        Returns:
            생성된 데이터셋 + 기존 데이터셋을 합쳐 통합 데이터셋을 리턴
        """

        # Nan 값 제거 요청 
        assert not large_X.isnull().values.any(), "large_X 입력 데이터에 NaN 값이 포함되어 있습니다." 
        assert not small_X.isnull().values.any(), "small_X 입력 데이터에 NaN 값이 포함되어 있습니다."    
 
        # 연속형 변수만 가져오는 부분
        continue_small_X = small_X[self.index]
        continue_large_X = large_X[self.index]

        # 범주형 변수만 가져오는 부분
        categorical_small_X = small_X[list(set(small_X.columns) - set(self.index))]
        categorical_large_X = large_X[list(set(small_X.columns) - set(self.index))]

        # 상위 n% 필터링 부분
        midlle_small_X = self.extract_middle_percent(
            continue_small_X, 25, 75
        )  ##TODO: 추후에 하이퍼 파라미터로 뺄 수 있음
        midlle_large_X = self.extract_middle_percent(
            continue_large_X, 15, 85
        )  ##TODO: 추후에 하이퍼 파라미터로 뺄 수 있음
        
        # 연속형 데이터 생성 및 데이터 적합 판단
        suitable_generated_small_X = self.suitable_judge(midlle_small_X, small_X, large_X)
     
        # 코사인 유사도 기반으로 가장 가까운 기존 변수의 범주형 변수 값 가져오기
        synthetic_small_X = self.find_categorical(
            suitable_generated_small_X, categorical_small_X, small_X 
        )

        # small class와 large class 합치기
        origin_small_x = pd.concat(
            [midlle_small_X, categorical_small_X.loc[midlle_small_X.index]], axis=1
        )
        
        small_total_x = pd.concat([synthetic_small_X, origin_small_x], axis=0)
        
        small_total_x["target"] = small_Y.iloc[:1].values[0][0]

        origin_large_x = pd.concat(
            [midlle_large_X, categorical_large_X.loc[midlle_large_X.index]], axis=1
        )
    
        origin_large_x["target"] = large_Y.iloc[:1].values[0][0]
  
        total = pd.concat([small_total_x, origin_large_x], axis=0)

        return total.drop(columns=["target"]), total["target"]
    
    
    def fit(
        self,
        small_X: pd.DataFrame,
        small_Y: pd.DataFrame,
        large_X: pd.DataFrame,
        large_Y: pd.DataFrame        
    ):
        """
        데이터를 학습 시키는 함수
        Args:
            small_X (pd.DataFrame): small class의 x
            small_Y (pd.DataFrame): small class의 y
            large_X (pd.DataFrame): large class의 x
            large_Y (pd.DataFrame): large class의 y
        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: synthetic X, y
        
        """
        # 합성+ 기존 data set 생성
        synthetic_X, synthetic_Y = self.generate_synthetic(
            small_X, large_X, small_Y, large_Y
        )
        return synthetic_X, synthetic_Y


In [172]:
GEN = FiGen(0.3,['amenities_fee','room_rate'])

In [173]:
# y = has_rewards
real_data = real_data.dropna(axis=0)
small_X = real_data[real_data['has_rewards'] == True]
small_Y = real_data[real_data['has_rewards'] == True].iloc[:, [1]]
large_X = real_data[real_data['has_rewards'] == False]
large_Y = real_data[real_data['has_rewards'] == False].iloc[:, [1]]

# 생성된 것 중에 적합헌 것이 없어 넘어가는 과장이 원활히 되지 않음

In [174]:
synthetic_X, synthetic_Y = GEN.fit(small_X, small_Y, large_X, large_Y)

In [175]:
synthetic_X

Unnamed: 0,amenities_fee,room_rate,billing_address,credit_card_number,has_rewards,room_type,checkin_date,guest_email,checkout_date
0,0.0,177.047091,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380,True,DELUXE,17 Sep 2020,webermelissa@neal.com,18 Sep 2020
1,0.0,203.548072,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380,True,DELUXE,17 Sep 2020,webermelissa@neal.com,18 Sep 2020
2,0.0,163.811411,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380,True,DELUXE,17 Sep 2020,webermelissa@neal.com,18 Sep 2020
3,0.0,202.061313,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380,True,DELUXE,17 Sep 2020,webermelissa@neal.com,18 Sep 2020
4,0.0,161.480938,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380,True,DELUXE,17 Sep 2020,webermelissa@neal.com,18 Sep 2020
...,...,...,...,...,...,...,...,...,...
494,7.48,169.72,"7602 John Mill Apt. 258\nDustinbury, DE 85141",4711730811777195574,False,BASIC,11 May 2020,xingram@moses.com,13 May 2020
495,8.71,103.25,"5678 Office Road\nSan Francisco, CA 94103",3505516387300030,False,BASIC,04 Jan 2021,laurabennett@jones-duncan.net,06 Jan 2021
497,30.59,141.61,"5678 Office Road\nSan Francisco, CA 94103",180096250673548,False,BASIC,11 Nov 2020,ygarcia@ballard-lopez.net,13 Nov 2020
498,1.93,136.92,"5678 Office Road\nSan Francisco, CA 94103",4488223821722,False,BASIC,16 Jul 2020,thomasdale@hall.com,18 Jul 2020


In [139]:
small_X

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
2,webermelissa@neal.com,True,DELUXE,0.0,17 Sep 2020,18 Sep 2020,368.33,"0323 Lisa Station Apt. 208\nPort Thomas, LA 82585",38983476971380
5,garciacarol@reid-crawford.biz,True,BASIC,0.0,18 Oct 2020,20 Oct 2020,177.76,"653 Brianna Mill\nBurtonview, TX 66595",4701079720447404938
14,michael21@miller.com,True,BASIC,0.0,01 Oct 2020,05 Oct 2020,83.8,"PSC 3906, Box 8892\nAPO AE 45779",4253047975942
26,hubbardryan@fowler.com,True,BASIC,0.0,26 Aug 2020,27 Aug 2020,101.31,"1234 Corporate Drive\nBoston, MA 02116",4551878586989
27,glen69@thompson.com,True,BASIC,0.0,05 Jul 2020,07 Jul 2020,93.97,"77376 Brett Station Apt. 553\nPort Sandyton, O...",343863920293179
30,kelly94@smith.com,True,DELUXE,0.0,03 Dec 2020,06 Dec 2020,184.39,"776 Parker Locks Suite 350\nPamelamouth, WY 15909",4104817616762
38,toddkaitlin@leon-collier.com,True,BASIC,0.0,20 Feb 2020,23 Feb 2020,98.41,"5678 Office Road\nSan Francisco, CA 94103",4942094262703149
50,hodgeskari@duffy.biz,True,DELUXE,0.0,08 Apr 2020,11 Apr 2020,222.55,"103 Marie Street\nEast Jonathan, OH 98726",2349865661751281
55,sjones@cole.com,True,BASIC,0.0,09 May 2020,10 May 2020,143.57,"9324 Hernandez Rapid Apt. 929\nEast Erin, ID 7...",4847818679193
64,nicole40@cherry.net,True,BASIC,0.0,16 Jan 2020,17 Jan 2020,115.45,"23146 Ramirez Cliffs Apt. 766\nWest Bobbyview,...",180095407605742


In [6]:
con_index = ['amenities_fee','room_rate']

In [181]:
data = large_X[con_index]

In [184]:
import time
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data.values)

In [183]:
def extract_kernel(data):
    # 추출할 분위수 범위 설정
    a_percentile = 25
    b_percentile = 75

    # 각 열의 분위수 값 계산
    percentiles = np.percentile(data, [a_percentile, b_percentile], axis=0)

    # 각 열별로 a < x < b 범위에 해당하는 데이터 추출
    condition = np.all((data > percentiles[0, :]) & (data < percentiles[1, :]), axis=1)
    
    return data[condition]

In [191]:
start_time = time.time()
kde = KernelDensity(kernel="gaussian", bandwidth=0.5).fit(
            data_scaled
        ) 
log_prob = kde.score_samples(data_scaled)
prob = np.exp(log_prob)
threshold_low, threshold_high = np.percentile(prob, [25, 75])
mask = np.logical_and(prob >= threshold_low, prob <= threshold_high)
data_middle = data[mask]
print(time.time() - start_time)

0.12257099151611328


In [192]:
start_time = time.time()
self_kde = extract_kernel(data_scaled)
print(time.time() - start_time)

0.010828733444213867


In [193]:
len(self_kde)

97

In [194]:
len(data_middle)

191