<a href="https://colab.research.google.com/github/Lee-Minsoo-97/Machine-Learning/blob/main/5_Different_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split



pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns


In [None]:
path_train = "/content/drive/MyDrive/Colab Notebooks/CIS508_Machine_Learning/Individual_Assignment/train.csv"
path_test = "/content/drive/MyDrive/Colab Notebooks/CIS508_Machine_Learning/Individual_Assignment/test.csv"

train_df = pd.read_csv(path_train)
test_df = pd.read_csv(path_test)

In [None]:
train_df.columns

Index(['QuoteNumber', 'Original_Quote_Date', 'QuoteConversion_Flag', 'Field6',
       'Field7', 'Field8', 'Field9', 'Field10', 'Field11', 'Field12',
       ...
       'GeographicField59A', 'GeographicField59B', 'GeographicField60A',
       'GeographicField60B', 'GeographicField61A', 'GeographicField61B',
       'GeographicField62A', 'GeographicField62B', 'GeographicField63',
       'GeographicField64'],
      dtype='object', length=299)

In [None]:
test_df.columns

Index(['QuoteNumber', 'Original_Quote_Date', 'Field6', 'Field7', 'Field8',
       'Field9', 'Field10', 'Field11', 'Field12', 'CoverageField1A',
       ...
       'GeographicField59A', 'GeographicField59B', 'GeographicField60A',
       'GeographicField60B', 'GeographicField61A', 'GeographicField61B',
       'GeographicField62A', 'GeographicField62B', 'GeographicField63',
       'GeographicField64'],
      dtype='object', length=298)

In [None]:

# Drop unnecessary columns
train_df = train_df.drop(columns=["QuoteNumber", "Original_Quote_Date"])
test_df = test_df.drop(columns=["QuoteNumber", "Original_Quote_Date"])

# Align train and test columns
common_columns = list(set(train_df.columns) & set(test_df.columns))
train_df = train_df[common_columns + ["QuoteConversion_Flag"]]
test_df = test_df[common_columns]

# Separate features and target
X = train_df.drop(columns=["QuoteConversion_Flag"])
y = train_df["QuoteConversion_Flag"]

# Handle missing values
numerical_imputer = SimpleImputer(strategy="mean")
categorical_imputer = SimpleImputer(strategy="constant", fill_value="missing")

categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(exclude=["object"]).columns

X[categorical_columns] = X[categorical_columns].astype(str)
test_df[categorical_columns] = test_df[categorical_columns].astype(str)

X[numerical_columns] = numerical_imputer.fit_transform(X[numerical_columns])
test_df[numerical_columns] = numerical_imputer.transform(test_df[numerical_columns])

X[categorical_columns] = categorical_imputer.fit_transform(X[categorical_columns])
test_df[categorical_columns] = categorical_imputer.transform(test_df[categorical_columns])

# Encode categorical and scale numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ]
)

In [None]:
X_processed = preprocessor.fit_transform(X)
test_processed = preprocessor.transform(test_df)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_processed, y)

print("SMOTE applied successfully.")

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

# Dictionary to store results
model_results = {}

SMOTE applied successfully.


In [None]:
# Function to train a model and calculate AUC
def train_model(model, model_name):
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    train_pred = model.predict_proba(X_train)[:, 1]
    val_pred = model.predict_proba(X_val)[:, 1]
    test_pred = model.predict_proba(test_processed)[:, 1]

    # Store results
    model_results[model_name] = {
        "Train AUC": roc_auc_score(y_train, train_pred),
        "Validation AUC": roc_auc_score(y_val, val_pred),
        "Test Predictions": test_pred
    }
    print(f"{model_name} - Train AUC: {model_results[model_name]['Train AUC']:.4f}, "
          f"Validation AUC: {model_results[model_name]['Validation AUC']:.4f}")
    return train_pred, val_pred, test_pred

In [None]:
# Train models
dt_train, dt_val, dt_test = train_model(
    DecisionTreeClassifier(random_state=42, max_depth=10), "Decision Tree"
)

Training Decision Tree...
Decision Tree - Train AUC: 0.9839, Validation AUC: 0.9833


In [None]:
rf_train, rf_val, rf_test = train_model(
    RandomForestClassifier(random_state=42, n_estimators=50, n_jobs=-1), "Random Forest"
)


Training Random Forest...
Random Forest - Train AUC: 1.0000, Validation AUC: 0.9910


In [None]:
from sklearn.decomposition import PCA

# 데이터 샘플링
sample_size = 10000
X_train_sample = X_train[:sample_size]
y_train_sample = y_train[:sample_size]

# PCA 차원 축소
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train_sample)
X_val_pca = pca.transform(X_val)
test_processed_pca = pca.transform(test_processed)

# SVM 실행
print("Training Optimized Support Vector Machines...")
optimized_svm = SVC(probability=True, random_state=42, kernel='linear', max_iter=200)
optimized_svm.fit(X_train_pca, y_train_sample)

# 예측 생성
svm_train_pred = optimized_svm.predict_proba(X_train_pca)[:, 1]
svm_val_pred = optimized_svm.predict_proba(X_val_pca)[:, 1]
svm_test_pred = optimized_svm.predict_proba(test_processed_pca)[:, 1]

# AUC 계산
train_auc = roc_auc_score(y_train_sample, svm_train_pred)
val_auc = roc_auc_score(y_val, svm_val_pred)
print(f"Optimized SVM - Train AUC: {train_auc:.4f}, Validation AUC: {val_auc:.4f}")

Training Optimized Support Vector Machines...




Optimized SVM - Train AUC: 0.6013, Validation AUC: 0.6069


In [None]:
from sklearn.decomposition import PCA

# PCA 차원 축소 (100개 주요 성분만 유지)
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)
test_processed_pca = pca.transform(test_processed)

# 최적화된 MLP
print("Training Optimized Multilayer Perceptron...")
optimized_mlp = MLPClassifier(
    random_state=42,
    max_iter=100,              # 반복 횟수 제한
    hidden_layer_sizes=(20, 10),  # 작은 히든 레이어
    learning_rate_init=0.01,   # 초기 학습률 증가
    solver='adam',             # 기본 solver 유지
)

# 모델 학습
optimized_mlp.fit(X_train_pca, y_train)

# 예측 생성
mlp_train_pred = optimized_mlp.predict_proba(X_train_pca)[:, 1]
mlp_val_pred = optimized_mlp.predict_proba(X_val_pca)[:, 1]
mlp_test_pred = optimized_mlp.predict_proba(test_processed_pca)[:, 1]

# AUC 계산
train_auc = roc_auc_score(y_train, mlp_train_pred)
val_auc = roc_auc_score(y_val, mlp_val_pred)
print(f"Optimized MLP - Train AUC: {train_auc:.4f}, Validation AUC: {val_auc:.4f}")


Training Optimized Multilayer Perceptron...




Optimized MLP - Train AUC: 0.9770, Validation AUC: 0.9760


In [None]:
from sklearn.decomposition import PCA

# 데이터 크기 줄이기 (10,000개 샘플만 사용)
sample_size = 10000
X_train_sample = X_train[:sample_size]
y_train_sample = y_train[:sample_size]

# PCA 차원 축소 (50개 주요 성분만 유지)
pca = PCA(n_components=50, random_state=42)
X_train_pca = pca.fit_transform(X_train_sample)
X_val_pca = pca.transform(X_val)
test_processed_pca = pca.transform(test_processed)

# 최적화된 KNN
print("Training Optimized K-Nearest Neighbors...")
optimized_knn = KNeighborsClassifier(
    n_neighbors=3,   # 이웃 수 줄이기
    metric='manhattan'  # 계산이 간단한 거리 측정 방식
)

# KNN 모델 학습
optimized_knn.fit(X_train_pca, y_train_sample)

# 예측 생성
knn_train_pred = optimized_knn.predict_proba(X_train_pca)[:, 1]
knn_val_pred = optimized_knn.predict_proba(X_val_pca)[:, 1]
knn_test_pred = optimized_knn.predict_proba(test_processed_pca)[:, 1]

# AUC 계산
train_auc = roc_auc_score(y_train_sample, knn_train_pred)
val_auc = roc_auc_score(y_val, knn_val_pred)
print(f"Optimized KNN - Train AUC: {train_auc:.4f}, Validation AUC: {val_auc:.4f}")


Training Optimized K-Nearest Neighbors...
Optimized KNN - Train AUC: 0.9434, Validation AUC: 0.7774


In [None]:
# 길이 확인
print(f"Decision Tree Train Length: {len(dt_train)}")
print(f"Random Forest Train Length: {len(rf_train)}")
print(f"SVM Train Length: {len(svm_train_pred)}")
print(f"MLP Train Length: {len(mlp_train_pred)}")
print(f"KNN Train Length: {len(knn_train_pred)}")

# 배열 길이 강제 정렬
min_length = min(len(dt_train), len(rf_train), len(svm_train_pred), len(mlp_train_pred), len(knn_train_pred))
dt_train = dt_train[:min_length]
rf_train = rf_train[:min_length]
svm_train_pred = svm_train_pred[:min_length]
mlp_train_pred = mlp_train_pred[:min_length]
knn_train_pred = knn_train_pred[:min_length]




Decision Tree Train Length: 338974
Random Forest Train Length: 338974
SVM Train Length: 10000
MLP Train Length: 338974
KNN Train Length: 10000


In [None]:
from sklearn.linear_model import LogisticRegression
# 스태킹 피처 생성
stacking_features_train = pd.DataFrame({
    "Decision Tree": dt_train,
    "Random Forest": rf_train,
    "Support Vector Machines": svm_train_pred,
    "Multilayer Perceptron": mlp_train_pred,
    "K-Nearest Neighbors": knn_train_pred
})

stacking_features_val = pd.DataFrame({
    "Decision Tree": dt_val,
    "Random Forest": rf_val,
    "Support Vector Machines": svm_val_pred,
    "Multilayer Perceptron": mlp_val_pred,
    "K-Nearest Neighbors": knn_val_pred
})

stacking_features_test = pd.DataFrame({
    "Decision Tree": dt_test,
    "Random Forest": rf_test,
    "Support Vector Machines": svm_test_pred,
    "Multilayer Perceptron": mlp_test_pred,
    "K-Nearest Neighbors": knn_test_pred
})



In [None]:
# 원본 테스트 데이터 불러오기
original_test_df = pd.read_csv(path_test)

# 복구된 QuoteNumber를 test_df에 추가
test_df["QuoteNumber"] = original_test_df["QuoteNumber"]


  test_df["QuoteNumber"] = original_test_df["QuoteNumber"]


In [None]:
# Train Meta-Model with sampled data
print("Training Meta-Model (Logistic Regression)...")
meta_model = LogisticRegression(random_state=42)
meta_model.fit(stacking_features_train, y_train_sample)  # 샘플링된 y_train 사용

# Evaluate Meta-Model
stacked_val_predictions = meta_model.predict_proba(stacking_features_val)[:, 1]
stacked_test_predictions = meta_model.predict_proba(stacking_features_test)[:, 1]

# Calculate Validation AUC for Meta-Model
stacked_val_auc = roc_auc_score(y_val, stacked_val_predictions)
print(f"Stacked Model - Validation AUC: {stacked_val_auc:.4f}")

# Save Submission File
submission = pd.DataFrame({
    "QuoteNumber": test_df["QuoteNumber"],  # 복구된 QuoteNumber 사용
    "QuoteConversion_Flag": stacked_test_predictions
})
submission.to_csv("Stacked_Model_submission.csv", index=False)
print("Stacked model submission file created.")


Training Meta-Model (Logistic Regression)...
Stacked Model - Validation AUC: 0.9906
Stacked model submission file created.
