In [15]:
#importing libaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [16]:
#importing dataset
train = pd.read_csv('astro_train.csv')
test = pd.read_csv('astro_test.csv')

In [17]:
#data Preprocseeing

#seprating output class
y_train= train['class']
train = train.drop('class', axis= 1)

dataset = pd.concat([train, test])
dataset = dataset.drop(['id','skyVersion', 'run', 'rerun', 'camCol'], axis= 1)

#transforming err functions to log
dataset["err_g_log"] = np.log(dataset["err_g"])
dataset["err_i_log"] = np.log(dataset["err_i"])
dataset["err_r_cbrt"] = np.cbrt(dataset["err_r"])
dataset["err_u_log"] = np.log(dataset["err_u"])
dataset["err_z_log"] = np.log(dataset["err_z"])

#Transforming other columns by deviding its data into groups 
dataset["#ra_cat"] = pd.cut(dataset["#ra"],
                           bins=[0,180,220,240,np.inf],
                           labels=[1,2,3,4])

dataset["dec_cat"] = pd.cut(dataset["dec"],
                           bins=[-1.25,-1.0,-0.25,np.inf],
                           labels=[1,2,3])

dataset["extinction_r_cat"] = pd.cut(dataset["extinction_r"],
                           bins=[0,0.10,0.15,np.inf],
                           labels=[1,2,3])

dataset["field_cat"] = pd.cut(dataset["field"],
                           bins=[0,300,500,550,np.inf],
                           labels=[1,2,3,4])

dataset = dataset.drop(['err_g', 'err_i', 'err_r', 'err_u', 'err_z', '#ra', 'dec', 'extinction_r', 'field'], axis= 1)

cat_attr = ["#ra_cat","dec_cat","extinction_r_cat","field_cat"]
num_attr = [i for i in list(dataset.columns) if i not in cat_attr]

#Scaling data
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([('std_scaler', StandardScaler())])

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

full_pipeline = ColumnTransformer([
                ("num", num_pipeline, num_attr),
                #("cat", OneHotEncoder(), cat_attr),
                ])

transformed_dataset = full_pipeline.fit_transform(dataset)

train_prepared = transformed_dataset[:45000, :]
test_prepared = transformed_dataset[45000:, :]



In [18]:
#over sampling data using SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
x_res, y_res = smote.fit_sample(train_prepared, y_train)

In [19]:
#creating model using XGBoost
import xgboost
from xgboost import XGBClassifier
classifier= XGBClassifier(booster= 'dart', n_estimators=300, learning_rate=0.3, gamma= 0.5)
classifier.fit(x_res, y_res)

XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.5, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.3, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=300, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [8]:
#predictig results
y_pre = classifier.predict(test_prepared)

In [9]:
#spliting data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x_res, y_res, test_size=0.2)

#checking accuracies using k-fold validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, Y_train, cv= 3, n_jobs= -1)

accuracies

array([0.68734762, 0.68675008, 0.68386269])