# 회귀

In [1]:
import pandas as pd
from scipy import stats
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pickle

def load_dataset_regression():
    """
    회귀 모델을 위한 데이터셋 전처리
    """
    # 데이터셋 불러오기
    data = pd.read_csv(r'./data/Regression_data.csv')

    # 'Sex' 열을 원-핫 인코딩으로 변환
    #data = pd.get_dummies(data, columns=['Sex'], drop_first=True)
    data['Sex'] = data['Sex'].replace({'M':0, 'F':1, 'I':2})

    # 이상치 제거를 위해 확인할 열 선택
    columns_to_check = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']

    # z-점수를 이용하여 이상치 제거
    z_scores = stats.zscore(data[columns_to_check])
    abs_z_scores = np.abs(z_scores)
    filtered_entries = (abs_z_scores < 3).all(axis=1)
    data = data[filtered_entries]
    df =data

    # "전체 무게 >= 조개껍질 벗긴 무게 + 내장 무게 + 껍질 무게"를 만족하지 않는 행들 제거
    df = df[df['Whole weight'] >= df['Shucked weight'] + df['Viscera weight'] + df['Shell weight']]
    df = df.reset_index(drop=True)
    print(df)

    # 전처리된 데이터를 반환
    return df


def xgboost(df):
    y_target = df['Rings']
    X_features = df.drop('Rings',axis=1, inplace=False)

    X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=156)

    xgb_reg = XGBRegressor(n_estimators=400,
                        learning_rate=0.01,
                        colsample_bytree=0.8,
                        subsample=0.2,
                        gamma=0.2,
                        max_depth=5,
                        min_child_weight=5)

    xgb_reg.fit(X_train, y_train)

    y_pred = xgb_reg.predict(X_test)

    # 정확도 계산
    acc = np.mean(1 - np.abs((y_pred - y_test) / y_test))
    print('XGBOOST')
    print(f"Mean Accuracy: {acc:.4f}")
    mse = mean_squared_error(y_test, y_pred)
    print(f"MSE: {mse:.4f}")
    r2 = r2_score(y_test, y_pred)
    print(f"R2 Score: {r2:.4f}")
    return xgb_reg

def main():
    reg_dataset = load_dataset_regression()

    a = xgboost(reg_dataset)
    with open('regression.pkl', 'wb') as f:
        pickle.dump(a, f)

if __name__ == "__main__":
    main()

      Sex  Length  Diameter  Height  Whole weight  Shucked weight  \
0       0   0.455     0.365   0.095        0.5140          0.2245   
1       0   0.350     0.265   0.090        0.2255          0.0995   
2       1   0.530     0.420   0.135        0.6770          0.2565   
3       0   0.440     0.365   0.125        0.5160          0.2155   
4       2   0.330     0.255   0.080        0.2050          0.0895   
...   ...     ...       ...     ...           ...             ...   
3871    1   0.565     0.450   0.165        0.8870          0.3700   
3872    0   0.590     0.440   0.135        0.9660          0.4390   
3873    0   0.600     0.475   0.205        1.1760          0.5255   
3874    1   0.625     0.485   0.150        1.0945          0.5310   
3875    0   0.710     0.555   0.195        1.9485          0.9455   

      Viscera weight  Shell weight  Rings  
0             0.1010        0.1500     15  
1             0.0485        0.0700      7  
2             0.1415        0.2100     

# 이진분류

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle


def load_dataset_binary_classification():
    """
    이진 분류: 데이터 불균형 문제 (1 희소)
    """
    global df
    df = pd.read_csv(r'./data/binary_classification_data.csv')

    df_1 = df.iloc[:, :-1]
    standard_scaler = StandardScaler()
    np_scaled = standard_scaler.fit_transform(df_1)
    df_norm = pd.DataFrame(np_scaled, columns=list(df_1.columns))

    # 이상치 제거
    low, high = .05, .95
    quantiles = df_norm.quantile([low, high])
    quantile_norm = df_norm.apply(lambda col: col[(col >= quantiles.loc[low, col.name]) &
                                                 (col <= quantiles.loc[high, col.name])], axis=0)
    X = df_norm
    targets = df['target_class']
    le = LabelEncoder()
    Y = le.fit_transform(targets)
    Y = pd.Series(Y, name='target_class')

    ros = RandomUnderSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_resample(X, Y)
    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled['targets'] = y_resampled
    df = df_resampled

    return df

def binary_model(df):
    # X, y 분리
    X = df.drop(columns='targets')
    y = df['targets']

    # Train, Test Dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

    # Modeling - XGBoost Classifier
    model = XGBClassifier()
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Test
    y_pred = model.predict(X_test)
    y_pred = np.round(y_pred).astype(int)

    # Classification Report
    model_pred = model.predict(X_test)
    report = classification_report(y_test, model_pred)
    accuracy = round(model.score(X_test, y_test) * 100, 1)

    print("binary Report:")
    print(report)
    print(f'BinaryClassifier: class 조절 정확도 (accuracy) {accuracy}%')

    return binary_model




def predict_new_data(df, new_data, scaler):
    # X, y 분리
    X = df.drop(columns='targets')
    y = df['targets']

    # Modeling - XGBoost Classifier
    model = XGBClassifier()
    model.fit(X, y)

    # Scaling the new_data using the provided scaler
    new_data_scaled = scaler.transform(new_data)

    # Probability estimates for the new data points
    prediction_proba = model.predict_proba(new_data_scaled)

    for idx, (pred, proba) in enumerate(zip(model.predict(new_data_scaled), prediction_proba)):
        if pred == 1:
            print(f"Data point {idx+1}: Predicted Class: 1 (Pulsar), Probability: {proba[1]*100:.2f}%")
            a = round(proba[1]*100, 2)
        else:
            print(f"Data point {idx+1}: Predicted Class: 0 (NOt Pulsar), Probability: {proba[0]*100:.2f}%")
            b = round(proba[0]*100, 2)
    return a, b

# NO Pulsar 표현 말고 다른 표현 좋은거 있으면 추천해주세요
def main():
    # 데이터셋 로드 및 전처리
    df = load_dataset_binary_classification()

    # 모델 학습 및 평가
    a = binary_model(df)
    with open('model_binary_class', 'wb') as f:
        pickle.dump(a, f)

if __name__ == "__main__":
    # 새로운 데이터로 예측
    main()
    new_data = pd.DataFrame({
        ' Mean of the integrated profile': [120, 60],
        ' Standard deviation of the integrated profile': [50, 40],
        ' Excess kurtosis of the integrated profile': [0, 3.0],
        ' Skewness of the integrated profile': [0, 10],
        ' Mean of the DM-SNR curve': [0, 10],
        ' Standard deviation of the DM-SNR curve': [20, 70],
        ' Excess kurtosis of the DM-SNR curve': [2, 10],
        ' Skewness of the DM-SNR curve': [10, 110]
    })

    # Use the same StandardScaler used for training data
    standard_scaler = StandardScaler()
    X_scaled = standard_scaler.fit_transform(df.drop(columns='targets'))
    a, b = predict_new_data(df, new_data, standard_scaler)
    print(a, b)


ImportError: cannot import name 'RandomUnderSampler' from 'imblearn.over_sampling' (c:\Users\khan\anaconda3\envs\sec6_pj1\lib\site-packages\imblearn\over_sampling\__init__.py)

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle


def load_dataset_binary_classification():
    """
    이진 분류: 데이터 불균형 문제 (1 희소)
    """
    global df
    df = pd.read_csv(r'./data/binary_classification_data.csv')

    df_1 = df.iloc[:, :-1]
    standard_scaler = StandardScaler()
    np_scaled = standard_scaler.fit_transform(df_1)
    df_norm = pd.DataFrame(np_scaled, columns=list(df_1.columns))

    # 이상치 제거
    low, high = .05, .95
    quantiles = df_norm.quantile([low, high])
    quantile_norm = df_norm.apply(lambda col: col[(col >= quantiles.loc[low, col.name]) &
                                                 (col <= quantiles.loc[high, col.name])], axis=0)
    X = df_norm
    targets = df['target_class']
    le = LabelEncoder()
    Y = le.fit_transform(targets)
    Y = pd.Series(Y, name='target_class')

    df = pd.DataFrame(X, columns=X.columns)
    df['targets'] = Y
    return df

def binary_model(df):
    df_origin = df.copy()

    X = df.drop(columns='targets')
    Y = df['targets']

    # RandomUnderSampler
    ros = RandomUnderSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_resample(X, Y)
    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled['targets'] = y_resampled
    df = df_resampled

    # X, y 분리
    X_origin = df_origin.drop(columns='targets')
    y_origin = df_origin['targets']

    # Train, Test Dataset
    X_origin_train, X_origin_test, y_origin_train, y_origin_test = train_test_split(X_origin, y_origin, test_size=0.2, random_state=2)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

    # Modeling - XGBoost Classifier
    model = XGBClassifier()
    model.fit(X_train, y_train)

    # Classification Report
    model_pred = model.predict(X_origin_test)
    report = classification_report(y_origin_test, model_pred)
    accuracy = round(model.score(X_origin_test, y_origin_test) * 100, 1)

    print("binary Report:")
    print(report)
    print(f'BinaryClassifier: class 조절 정확도 (accuracy) {accuracy}%')

    return model




def predict_new_data(df, new_data, scaler):
    # X, y 분리
    X = df.drop(columns='targets')
    y = df['targets']

    # Modeling - XGBoost Classifier
    model = XGBClassifier()
    model.fit(X, y)

    # Scaling the new_data using the provided scaler
    new_data_scaled = scaler.transform(new_data)

    # Probability estimates for the new data points
    prediction_proba = model.predict_proba(new_data_scaled)

    for idx, (pred, proba) in enumerate(zip(model.predict(new_data_scaled), prediction_proba)):
        if pred == 1:
            print(f"Data point {idx+1}: Predicted Class: 1 (Pulsar), Probability: {proba[1]*100:.2f}%")
        else:
            print(f"Data point {idx+1}: Predicted Class: 0 (NOt Pulsar), Probability: {proba[0]*100:.2f}%")

# NO Pulsar 표현 말고 다른 표현 좋은거 있으면 추천해주세요

def main():
    # 데이터셋 로드 및 전처리
    df = load_dataset_binary_classification()
    
    #모델 학습 및 평가
    a = binary_model(df)
    with open('binary.pkl', 'wb') as f:
        pickle.dump(a, f)


if __name__ == "__main__":
    # 새로운 데이터로 예측
    main()
    new_data = pd.DataFrame({
        ' Mean of the integrated profile': [120, 60],
        ' Standard deviation of the integrated profile': [50, 40],
        ' Excess kurtosis of the integrated profile': [0, 3.0],
        ' Skewness of the integrated profile': [0, 10],
        ' Mean of the DM-SNR curve': [0, 10],
        ' Standard deviation of the DM-SNR curve': [20, 70],
        ' Excess kurtosis of the DM-SNR curve': [2, 10],
        ' Skewness of the DM-SNR curve': [10, 110]
    })
    # Use the same StandardScaler used for training data
    standard_scaler = StandardScaler()
    X_scaled = standard_scaler.fit_transform(df.drop(columns='targets'))
    
    # 스케일러 객체 저장
    with open('binary_class_scaler.pkl', 'wb') as f:
        pickle.dump(standard_scaler, f)
    
    predict_new_data(df, new_data, standard_scaler)



binary Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3256
           1       0.91      0.82      0.86       324

    accuracy                           0.98      3580
   macro avg       0.95      0.91      0.93      3580
weighted avg       0.98      0.98      0.98      3580

BinaryClassifier: class 조절 정확도 (accuracy) 97.7%
Data point 1: Predicted Class: 1 (Pulsar), Probability: 66.03%
Data point 2: Predicted Class: 1 (Pulsar), Probability: 98.28%


# 다중 분류

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from imblearn.over_sampling import SMOTE
import pickle


def load_dataset_multi_classification(): 
    df = pd.read_csv(r'./data/mulit_classification_data.csv')

    # 컬럼 전처리
    df['type'] = 'TypeOfSteel_A300'
    df.loc[df['TypeOfSteel_A400'] == 1, 'type'] = 'TypeOfSteel_A400'
    df.drop(['TypeOfSteel_A300', 'TypeOfSteel_A400'], axis=1, inplace=True)
    df['type'].replace({"TypeOfSteel_A300":0,"TypeOfSteel_A400":1},inplace=True)

    # X_Perimeter + Y_Perimeter = Total_Perimeter 
    df['Total_Perimeter'] = df['X_Perimeter'] + df['Y_Perimeter']
    df.drop(['X_Perimeter', 'Y_Perimeter'], axis=1, inplace=True)

    # Mean_of_Luminosity 컬럼으로 합치기
    df['Mean_of_Luminosity'] = (df['Minimum_of_Luminosity'] + df['Maximum_of_Luminosity']) / 2
    df.drop(['Minimum_of_Luminosity', 'Maximum_of_Luminosity'], axis=1, inplace=True)

    # target 데이터 -> int bool 타입으로 변경 
    target_df = [
        df['Pastry'],
        df['Z_Scratch'],
        df['K_Scatch'],
        df['Stains'],
        df['Dirtiness'],
        df['Bumps'],
        df['Other_Faults'] 
    ]
    targets = list(map(lambda i: i.astype(bool), target_df))
    choices = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
    df.drop(df[choices].columns, axis=1, inplace=True)
    df['class'] = np.select(targets, choices)

    # class외의 독립변수 데이터 표준화
    df_1 = df.iloc[:, :-1]
    # StandardScaler 객체 생성
    standard_scaler = StandardScaler()
    np_scaled = standard_scaler.fit_transform(df_1)
    df_norm = pd.DataFrame(np_scaled, columns=list(df_1.columns))

    # 이상치 제거 
    low, high = .05, .95
    quantiles = df_norm.quantile([low, high])
    quantile_norm = df_norm.apply(lambda col: col[(col >= quantiles.loc[low, col.name]) & 
                                        (col <= quantiles.loc[high, col.name])], axis=0)

    # 상관계수 행렬 생성
    corr_matrix = df_norm.corr().abs()
    # 상삼각 행렬 부분(대각선 기준으로 위쪽)만 남기기 위해 적용
    under = corr_matrix * (np.triu(np.ones(corr_matrix.shape), k=1))
    # 상관계수가 0.95보다 큰 변수들 찾아서 제거
    to_drop = [column for column in under.columns if any(under[column] > 0.95)]
    df_norm = df_norm.drop(df_norm[to_drop], axis=1)

    # target 데이터 LabelEncoder
    X = df_norm
    le = LabelEncoder()

    # df_norm DataFrame에서 'class' 컬럼을 범주형 타겟 데이터로 사용
    targets = df['class']
    Y = le.fit_transform(targets)

    # X와 Y를 하나의 데이터프레임으로 합치기 위해 Y를 Series로 변환하고, 열 이름을 'target'으로 지정
    Y = pd.Series(Y, name='targets')

    # 클래스 비중 조절을 위한 RandomOverSampler 객체 생성
    smote = SMOTE(random_state=0)

    # 클래스 비중 조절을 위해 fit_resample() 메서드를 사용하여 X_train, y_train을 샘플링
    X_resampled, y_resampled = smote.fit_resample(X, Y)

    # X_resampled와 y_resampled를 DataFrame으로 변환
    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)

    # 'targets' 컬럼 추가
    df_resampled['targets'] = y_resampled
    df = df_resampled
    return df



def multi_classification(df):  
    # target 데이터 LabelEncoder
    X = df.drop("targets", axis=1)
    y = df['targets']

    # 데이터를 훈련용(train)과 테스트용(test)으로 분리
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # RandomForestClassifier 모델 생성
    model = RandomForestClassifier(n_estimators=300, 
                                   max_depth=30,
                                   max_features='sqrt',
                                   min_samples_leaf=1,
                                   min_samples_split=5,
                                   bootstrap=False,
                                   class_weight='balanced',
                                   random_state=42)

    # 모델 학습
    model.fit(X_train, y_train)

    # Feature Importance 확인
    feature_importance = model.feature_importances_
    feature_names = X_train.columns

    # 중요도가 낮은 피처 제거
    threshold = 0.05  # 임계값 (임의로 설정, 조정 가능)
    selected_features = feature_names[feature_importance <= threshold]
    
    # 임계값보다 작은 Feature Importance를 가진 피처들만 출력
    #print("Features with Importance <= Threshold:")
    #for feature in selected_features:
    #print(feature)

    X_train_selected = X_train.drop(selected_features, axis=1)
    X_test_selected = X_test.drop(selected_features, axis=1)

    # 모델 다시 학습
    model.fit(X_train_selected, y_train)

    # 모델 평가 및 결과 출력
    model_pred = model.predict(X_test_selected)
    report = classification_report(y_test, model_pred)
    accuracy = round(model.score(X_test_selected, y_test) * 100, 2)

    print("Selected Features:", X_train_selected.columns)
    print("Classification Report:")
    print(report)
    print(f'RandomForestClassifier: class 조절 정확도 (accuracy) {accuracy}%')
    return model



def main():
    df = load_dataset_multi_classification()
    a = multi_classification(df)
    with open('model_classification', 'wb') as f:
        pickle.dump(a, f)

if __name__ == "__main__":
    main()


Selected Features: Index(['X_Minimum', 'Pixels_Areas', 'Length_of_Conveyer',
       'Steel_Plate_Thickness', 'Outside_X_Index', 'LogOfAreas', 'Log_X_Index',
       'Orientation_Index'],
      dtype='object')
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.81      0.80       124
           1       0.99      0.97      0.98       144
           2       0.98      0.95      0.97       126
           3       0.82      0.78      0.80       134
           4       0.88      0.91      0.89       139
           5       0.99      1.00      1.00       134
           6       0.97      0.99      0.98       142

    accuracy                           0.92       943
   macro avg       0.92      0.92      0.92       943
weighted avg       0.92      0.92      0.92       943

RandomForestClassifier: class 조절 정확도 (accuracy) 91.73%
