In [2]:
#일반 라이브러리
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Plot configuration: use the Malgun Gothic font family and turn off the
# Unicode minus glyph for axis labels (presumably so Korean text and negative
# tick labels both render with this font -- confirm on the target machine).
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})


#API를 사용하기 위한 import
# from geoband.API import *
# import folium
# import json
# import geopandas as gpd

# 전처리 및 operator
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split


# model
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

#xgboost
#from xgboost import XGBRegressor

# cross
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict

# PCA 데이터 불러오기

In [None]:
# Read both CSV files and discard the first column of each
# (presumably an index column written when the file was saved -- TODO confirm).
PCA_data = pd.read_csv("./PCA.csv").iloc[:, 1:]
total_data = pd.read_csv("total.csv").iloc[:, 1:]

# Columns of total_data that are used as prediction targets further down.
# The names are kept exactly as written (including the leading space in the
# first one) -- verify them against the header of total.csv.
targets = [
    ' rob_satety_q1',
    'mur_safety_q2',
    'ta_safety_q3',
    'raw_odder_q4',
    'overall_q5',
]


In [None]:
# Features: the seven principal-component columns.
# .copy() makes this an independent frame, so adding the target column below
# does not raise pandas' SettingWithCopyWarning.
dataset = PCA_data[["PC1","PC2","PC3","PC4","PC5","PC6","PC7"]].copy()

# Target: attach one target column from total_data (pandas aligns on the row index).
question = targets[2]
dataset[question] = total_data[question]

# Rows with a known target form the training set; rows where the target is
# missing are the ones to be predicted. `test` still contains the (all-NaN)
# target column.
has_target = dataset[question].notnull()
train, test = dataset[has_target], dataset[~has_target]
print(train.shape)
print(test.shape)


In [None]:
# Separate the training rows into the target series (Y) and the remaining
# predictor columns (X), then show both shapes.
Y = train[question]
X = train.drop(columns=question)
print(X.shape)
print(Y.shape)

In [None]:
#함수화
def make_PCA(question, pca_data=None, target_data=None):
    """Build the training data and the rows to predict for one target column.

    Parameters
    ----------
    question : str
        Name of the target column to take from ``target_data``.
    pca_data : pandas.DataFrame, optional
        Frame containing the columns ``PC1`` .. ``PC7``. Defaults to the
        notebook-level ``PCA_data``.
    target_data : pandas.DataFrame, optional
        Frame containing the ``question`` column. Defaults to the
        notebook-level ``total_data``.

    Returns
    -------
    X : pandas.DataFrame
        ``PC1`` .. ``PC7`` for the rows whose target value is present.
    Y : pandas.Series
        The target values for those same rows.
    test : pandas.DataFrame
        The rows whose target value is missing. This frame still includes the
        (all-NaN) ``question`` column, so callers must drop it before passing
        the frame to ``predict``.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if pca_data is None:
        pca_data = PCA_data
    if target_data is None:
        target_data = total_data

    # .copy() gives an independent frame: assigning a new column to a slice of
    # the source frame would otherwise raise SettingWithCopyWarning.
    dataset = pca_data[["PC1","PC2","PC3","PC4","PC5","PC6","PC7"]].copy()
    dataset[question] = target_data[question]  # aligned on the row index

    known = dataset[question].notnull()
    train, test = dataset[known], dataset[~known]
    X, Y = train.drop(columns=question), train[question]
    return X, Y, test

# Target column names, one per line; strings are kept exactly as written
# (note the leading space in the first entry).
targets = [
    ' rob_satety_q1',
    'mur_safety_q2',
    'ta_safety_q3',
    'raw_odder_q4',
    'overall_q5',
]

# 통합모델

In [None]:
def Three_model(X, Y, random_state=42):
    """Tune several tree-ensemble regressors with a randomized search and compare them.

    For each estimator (XGBRegressor when xgboost is importable,
    RandomForestRegressor, GradientBoostingRegressor) a ``RandomizedSearchCV``
    with 10 sampled candidates and 5-fold cross-validation is fitted on
    ``(X, Y)``. The best estimator of each search is then scored again with
    ``cross_val_predict`` (5 folds) and ``r2_score``.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Training features.
    Y : array-like or pandas.Series
        Training target.
    random_state : int, default 42
        Seed used for the estimators, for drawing the candidate
        hyper-parameter values and for the randomized search, so repeated
        runs give the same candidates.

    Returns
    -------
    pandas.DataFrame
        One row per estimator with the columns ``model`` (class name),
        ``params`` (best parameters), ``estimator`` (refitted best estimator),
        ``r2_score`` (``best_score_`` of the search), ``cv_result``
        (``cv_results_`` of the search) and ``Test값`` (R^2 of the
        out-of-fold predictions of the best estimator on ``X``/``Y``).
    """
    # The notebook-level xgboost import is commented out, so resolve the class
    # here and simply skip XGBoost when the package is not installed instead
    # of failing with a NameError.
    try:
        from xgboost import XGBRegressor
    except ImportError:
        XGBRegressor = None

    rng = np.random.RandomState(random_state)

    estimators = []
    if XGBRegressor is not None:
        estimators.append(XGBRegressor(random_state=random_state))
    estimators.append(RandomForestRegressor(random_state=random_state))
    estimators.append(GradientBoostingRegressor(random_state=random_state))

    # Candidate values shared by every estimator.
    shared_param = {"max_depth": rng.randint(10, 200, 20),
                    "n_estimators": rng.randint(10, 200, 20)}

    results = []
    for estimator in estimators:
        name = estimator.__class__.__name__

        # Work on a per-estimator copy so that estimator-specific keys never
        # leak into the search space of another estimator.
        param = dict(shared_param)
        if name != "XGBRegressor":
            param["max_features"] = rng.uniform(0.3, 1.0, 100)

        regressor = RandomizedSearchCV(
            estimator,
            param,
            n_iter=10,
            n_jobs=-1,
            cv=5,
            verbose=0,
            random_state=random_state,
        )
        regressor.fit(X, Y)

        # Validation: out-of-fold predictions of the tuned estimator.
        pred = cross_val_predict(regressor.best_estimator_, X, Y, cv=5)

        results.append([
            name,
            regressor.best_params_,
            regressor.best_estimator_,
            regressor.best_score_,
            regressor.cv_results_,
            r2_score(Y, pred),
        ])

    return pd.DataFrame(
        results,
        columns=["model", "params", "estimator", "r2_score", "cv_result", "Test값"],
    )

def find_best(df):
    """Return the estimator from the row with the highest "Test값" score.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``estimator`` and ``Test값``.

    Returns
    -------
    object
        The ``estimator`` entry of the best-scoring row. On ties the first
        such row wins; NaN scores are ignored.

    Raises
    ------
    ValueError
        If ``df`` has no rows or every score is NaN.
    """
    scores = df["Test값"].to_numpy(dtype=float)
    # argmax yields a *position*, so select with .iloc; using it as a label in
    # .loc is only correct when the index happens to be 0..n-1.
    best_pos = int(np.nanargmax(scores))
    return df["estimator"].iloc[best_pos]

In [None]:
# Target columns to model, one model-selection run per column.
targets = [' rob_satety_q1', 'mur_safety_q2', 'ta_safety_q3',
           'raw_odder_q4', 'overall_q5']

result_df = []    # per target: comparison table returned by Three_model
best_models = []  # per target: estimator chosen by find_best
prediction = []   # per target: predictions for the rows with a missing target

for question in targets:
    print("{} Training".format(question))
    X,Y,test = make_PCA(question) # build the data for this target
    result_df.append(Three_model(X,Y))
    best_models.append(find_best(result_df[-1])) # best model for this target

    # Prediction
    model = best_models[-1]
    # The model was fitted on the feature columns only (X), but `test` from
    # make_PCA still carries the all-NaN target column; drop it so that
    # predict() receives the same columns as fit().
    test_features = test.drop(columns=[question], errors="ignore")
    if len(test_features):
        pred = model.predict(test_features)
    else:
        # Nothing to predict for this target (no missing values).
        pred = np.empty(0)
    prediction.append(pred)


In [None]:
# Each entry of `prediction` becomes one row of the frame; transposing turns
# that into one column per entry (in loop order).
answers = pd.DataFrame(prediction).T
answers.head()

In [None]:
# Write the prediction table (index included) to a CSV file in the working directory.
answers.to_csv(path_or_buf="correct_answer.csv")