In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras import layers, models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
train_data.drop(['ID'],axis =1, inplace = True)
test_data.drop(['ID'],axis =1, inplace = True)
out_data = train_data[train_data['class']== -999].index
X_train = train_data.drop(out_data)

In [3]:
import numpy as np

def outliers_iqr(data):
    q1, q3 = np.percentile(data,[25,75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr*1.5)
    upper_bound = q3 + (iqr*1.5)
    
    return np.where((data> upper_bound)|(data < lower_bound))



In [4]:
ug_lead_outlier_index = outliers_iqr(X_train['u_g'])[0]
gr_lead_outlier_index = outliers_iqr(X_train['g_r'])[0]
iz_lead_outlier_index = outliers_iqr(X_train['i_z'])[0]
ri_lead_outlier_index = outliers_iqr(X_train['r_i'])[0]



In [5]:
lead_outlier_index = np.concatenate((ug_lead_outlier_index,gr_lead_outlier_index,iz_lead_outlier_index,ri_lead_outlier_index), axis = None)

print(len(lead_outlier_index))
lead_outlier_index

1373


array([  50,  164,  168, ..., 4766, 4779, 4786], dtype=int64)

In [6]:
lead_not_outlier_index = []

for i in X_train.index:
    if i not in lead_outlier_index:
        lead_not_outlier_index.append(i)

In [7]:
X_train_clean = X_train.loc[lead_not_outlier_index]

In [8]:
X_train_clean =X_train_clean.reset_index(drop = True)

In [9]:
X_train_clean

Unnamed: 0,O3_index,O2_index,sigma_star,sigma_o3,u_g,g_r,r_i,i_z,class
0,0.451072,0.937382,2.268922,2.418982,2.170366,1.061678,0.319187,0.442557,4
1,0.510605,0.151374,1.612544,1.865838,0.508636,0.128126,-0.085479,0.186809,1
2,0.599686,0.548090,2.929419,1.919131,2.159247,0.386518,0.000468,0.200131,1
3,0.604691,0.280999,2.186746,1.878498,2.077146,0.342641,0.058967,0.113909,1
4,0.179108,0.407228,2.929419,1.907366,0.531721,0.927510,0.926920,0.473041,1
...,...,...,...,...,...,...,...,...,...
3842,0.126068,0.470348,2.105984,1.954821,3.457254,0.437733,0.315954,-0.038082,1
3843,0.286460,0.595089,2.268873,2.189953,0.979456,1.156014,0.436563,0.236892,2
3844,-0.485462,0.127759,1.976665,1.881013,1.405806,0.409949,-0.717391,0.843585,1
3845,-0.383982,0.273225,2.074568,2.072062,1.793438,0.943150,0.408677,0.368323,1


In [10]:
X_train_new = X_train_clean.drop(['class'], axis=1)
y_train = X_train_clean['class']

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train_new)
scaler.transform(X_train_new)
scaler.fit(test_data)
scaler.transform(test_data)

array([[ 1.52747072e-01,  1.60454967e+00, -6.39206256e-01, ...,
        -1.26987621e+00, -2.65687391e-01, -1.68594206e-01],
       [ 1.96820817e+00,  3.89088651e-01,  2.03708330e-01, ...,
         2.73327118e-01,  1.89399286e-02,  8.33234947e-02],
       [-1.44959748e+00, -1.63592840e+00,  3.14011542e-01, ...,
         7.47206376e-01,  3.37180099e-02,  1.74161616e-01],
       ...,
       [ 1.26463217e+00, -2.59773812e-01,  6.35339861e-01, ...,
         1.25411023e+00,  2.27671536e-01,  1.36708441e-01],
       [-1.18675465e+00, -1.17255651e+00, -8.23475390e-01, ...,
        -1.04407660e+00, -2.23102021e-01, -1.89598514e-01],
       [ 1.00179475e+00,  9.86949330e-01,  1.81359903e+00, ...,
        -1.22618051e+00, -1.80839144e-01, -1.96431244e-03]])

In [12]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
LE.fit(y_train)
y_train = LE.transform(y_train)

from keras.utils import np_utils
y_train = np_utils.to_categorical(y_train)

In [15]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, # 12(=3×4)개의 하이퍼파라미터 조합을 시도합니다.
    {'n_estimators': [3, 5], 'max_features': [2, 3, 4]},     # 6(=2×3)개의 조합을 시도합니다.
  ]

model = RandomForestRegressor(random_state=42)

# 다섯 개의 폴드로 훈련하면 총 (12+6)*5=90번의 훈련이 일어납니다.
grid_search = GridSearchCV(model, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(X_train_new, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'max_features': [2, 3, 4], 'n_estimators': [3, 5]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [16]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [17]:
grid_search.best_estimator_

RandomForestRegressor(max_features=6, n_estimators=30, random_state=42)

In [20]:
final_model = grid_search.best_estimator_

final_predictions = final_model.predict(test_data)
print(final_predictions)

[[1.         0.         0.         0.        ]
 [0.         0.         1.         0.        ]
 [0.43333333 0.56666667 0.         0.        ]
 ...
 [0.         0.         0.83333333 0.16666667]
 [1.         0.         0.         0.        ]
 [0.9        0.1        0.         0.        ]]


In [19]:
y_pred = model.predict(test_data)
print(y_pred)

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [21]:
predict_list =final_predictions.argmax(axis=1)
for i in range(len(predict_list)):
    predict_list[i] += 1
print(predict_list)

[1 3 2 ... 3 1 1]


In [22]:
ID_list = pd.read_csv("test.csv")['ID']
x = [predict_list]
def submission_csv(predict_list):
    for i in range(len(x)):
        submission = pd.DataFrame({
            "ID":ID_list,
            "class":x[i]
        })   
        submission.to_csv('submission_'+'grcvoutiler'+'.csv', index= False)
        
submission_csv(x)