In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Show every column when a DataFrame is displayed (no truncation with "...").
# Call set_option on the pandas namespace directly rather than through the
# incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [3]:
# Read the training and test CSV files from the working directory.
train, test = (
    pd.read_csv(csv_name) for csv_name in ("astro_train.csv", "astro_test.csv")
)

In [4]:
# (rows, columns) of the training frame as loaded; the "class" label column
# has not been split off yet at this point.
train.shape

(45000, 22)

In [5]:
# (rows, columns) of the test frame, for comparison with the training frame.
test.shape

(15000, 21)

In [6]:
# Keep the test-set "id" column aside; "id" is dropped from the feature frame later.
result_ids = test.loc[:, "id"]

In [7]:
# Display the saved test-set ids.
result_ids

0        45000
1        45001
2        45002
3        45003
4        45004
         ...  
14995    59995
14996    59996
14997    59997
14998    59998
14999    59999
Name: id, Length: 15000, dtype: int64

In [8]:
# Separate the target ("class") from the training features.
labels = train.loc[:, "class"]
train = train.drop(columns=["class"])

In [9]:
# Stack train and test so the feature engineering below is applied to both at
# once. Training rows come first, test rows second.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the test rows reuse index labels already used by the training rows, and any
# label-based alignment on `dataset` would run into duplicate labels.
dataset = pd.concat([train, test], ignore_index=True)

In [10]:
# Add a transformed copy of each photometric error column: natural log for the
# g/i/u/z bands, cube root for the r band. New columns are named
# "<source>_<suffix>" and are appended in the order listed here.
err_transforms = {
    "err_g": ("log", np.log),
    "err_i": ("log", np.log),
    "err_r": ("cbrt", np.cbrt),
    "err_u": ("log", np.log),
    "err_z": ("log", np.log),
}
for source_col, (suffix, transform) in err_transforms.items():
    dataset[f"{source_col}_{suffix}"] = transform(dataset[source_col])

In [11]:
# Bucket "#ra" into four ordered bins: (0, 180], (180, 220], (220, 240], (240, inf).
ra_edges = [0, 180, 220, 240, np.inf]
ra_labels = [1, 2, 3, 4]
dataset["#ra_cat"] = pd.cut(dataset["#ra"], bins=ra_edges, labels=ra_labels)

In [12]:
# Bucket "dec" into three ordered bins: (-1.25, -1.0], (-1.0, -0.25], (-0.25, inf).
dec_edges = [-1.25, -1.0, -0.25, np.inf]
dec_labels = [1, 2, 3]
dataset["dec_cat"] = pd.cut(dataset["dec"], bins=dec_edges, labels=dec_labels)

In [13]:
# Bucket "extinction_r" into three ordered bins: (0, 0.10], (0.10, 0.15], (0.15, inf).
extinction_edges = [0, 0.10, 0.15, np.inf]
extinction_labels = [1, 2, 3]
dataset["extinction_r_cat"] = pd.cut(
    dataset["extinction_r"], bins=extinction_edges, labels=extinction_labels
)

In [14]:
# Bucket "field" into four ordered bins: (0, 300], (300, 500], (500, 550], (550, inf).
field_edges = [0, 300, 500, 550, np.inf]
field_labels = [1, 2, 3, 4]
dataset["field_cat"] = pd.cut(dataset["field"], bins=field_edges, labels=field_labels)

In [15]:
# Remove the raw columns that now have a transformed or binned counterpart,
# together with the survey bookkeeping columns that are not used as features.
obsolete_columns = [
    "err_i", "err_z", "err_u", "err_g", "err_r",
    "skyVersion", "rerun",
    "#ra", "dec", "extinction_r", "field",
    "run", "camCol",
]
dataset = dataset.drop(columns=obsolete_columns)

In [16]:
# The row identifier is not a feature; remove it from the feature frame.
dataset = dataset.drop(columns="id")

In [17]:
# Names of the binned (categorical) feature columns created with pd.cut.
cat_attr = [f"{base}_cat" for base in ("#ra", "dec", "extinction_r", "field")]

In [18]:
# Every remaining column that is not categorical is treated as numeric.
num_attr = [column for column in dataset.columns if column not in cat_attr]

In [19]:
# Display the columns that will go through the numeric pipeline
# (everything in `dataset` that is not listed in cat_attr).
num_attr

['dered_i',
 'dered_z',
 'dered_u',
 'dered_g',
 'dered_r',
 'obj',
 'photoz',
 'err_g_log',
 'err_i_log',
 'err_r_cbrt',
 'err_u_log',
 'err_z_log']

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric features are only standardised (zero mean, unit variance).
num_pipeline = Pipeline(steps=[("std_scaler", StandardScaler())])

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Route numeric columns through the scaling pipeline and one-hot encode the
# binned categorical columns; outputs are concatenated in this order.
column_transformers = [
    ("num", num_pipeline, num_attr),
    ("cat", OneHotEncoder(), cat_attr),
]
full_pipeline = ColumnTransformer(transformers=column_transformers)

In [22]:
# Fit the preprocessing on the training rows only, then apply it to every row.
# Fitting on the combined frame would let test-set statistics (scaler mean/std,
# encoder categories) leak into the features the models are trained on.
# `train` was concatenated ahead of `test`, so the first len(train) rows of
# `dataset` are exactly the training rows; iloc keeps the selection positional.
# NOTE: OneHotEncoder() rejects categories it did not see during fit, so every
# bin must occur at least once among the training rows.
n_train_rows = len(train)
full_pipeline.fit(dataset.iloc[:n_train_rows])
transformed_dataset = full_pipeline.transform(dataset)

In [23]:
# The training rows were concatenated ahead of the test rows, so they occupy
# the first len(train) rows of the transformed matrix. Derive the boundary from
# the data instead of hard-coding the row count.
train_prepared = transformed_dataset[:len(train), :]

In [24]:
# Everything after the first len(train) rows belongs to the test set (train was
# concatenated first). Derive the boundary from the data instead of hard-coding
# the row count.
test_prepared = transformed_dataset[len(train):, :]

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation, stratified by class.
# A fixed random_state makes the split, and every score reported on it,
# reproducible across kernel restarts.
# Note: X_test / y_test are a validation split of the labelled data, not the
# unlabelled competition test set (that one is `test_prepared`).
X_train, X_test, y_train, y_test = train_test_split(
    train_prepared,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, cross_val_score

In [27]:
# Baseline random forest (200 trees). Seeded so the fitted model and the
# hold-out predictions below are reproducible.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [28]:
# Per-class precision/recall/F1 of the random forest on the hold-out split.
# classification_report returns preformatted text, so it is printed.
report_rfc = classification_report(y_true=y_test, y_pred=pred_rfc)
print(report_rfc)

              precision    recall  f1-score   support

           0       0.73      0.77      0.75      3896
           1       0.61      0.67      0.64      2691
           2       0.64      0.51      0.57      2413

    accuracy                           0.67      9000
   macro avg       0.66      0.65      0.65      9000
weighted avg       0.67      0.67      0.67      9000



In [29]:
# 3-fold cross-validated accuracy of the random forest on the training split.
cross_val_score(estimator=rfc, X=X_train, y=y_train, cv=3, scoring="accuracy")

array([0.66916667, 0.67025   , 0.67266667])

In [30]:
from sklearn.linear_model import SGDClassifier

In [31]:
from sklearn.multiclass import OneVsOneClassifier
# One-vs-one wrapper around a seeded SGD classifier.
ovo_clf = OneVsOneClassifier(estimator=SGDClassifier(random_state=42))

In [32]:
# 3-fold cross-validated accuracy of the one-vs-one SGD classifier.
cross_val_score(estimator=ovo_clf, X=X_train, y=y_train, cv=3, scoring="accuracy")

array([0.601     , 0.60541667, 0.61183333])

In [33]:
# Plain seeded SGD classifier, scored with 3-fold cross-validated accuracy
# for comparison with the one-vs-one wrapper.
sgd = SGDClassifier(random_state=42)
cross_val_score(estimator=sgd, X=X_train, y=y_train, cv=3, scoring="accuracy")

array([0.58208333, 0.58875   , 0.59291667])

In [34]:
# Support vector classifier with default settings, scored with 3-fold
# cross-validated accuracy.
svc = SVC()
cross_val_score(estimator=svc, X=X_train, y=y_train, cv=3, scoring="accuracy")

array([0.659     , 0.65783333, 0.65991667])

In [35]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier

In [36]:
from sklearn.tree import DecisionTreeClassifier
# AdaBoost over a default decision tree, seeded with random_state=7 and scored
# with 3-fold cross-validated accuracy.
DTC = DecisionTreeClassifier()
adaDTC = AdaBoostClassifier(DTC, random_state=7)
cross_val_score(estimator=adaDTC, X=X_train, y=y_train, cv=3, scoring="accuracy")

array([0.56758333, 0.57116667, 0.56541667])

In [37]:
# Extra-trees ensemble scored with 3-fold cross-validated accuracy.
# Seeded so the reported scores are reproducible across runs.
ExtC = ExtraTreesClassifier(random_state=42)
cross_val_score(ExtC, X_train, y_train, cv=3, scoring="accuracy")

array([0.65525   , 0.65741667, 0.66116667])

In [38]:
# Gradient boosting with default hyper-parameters, scored with 3-fold
# cross-validated accuracy. Seeded so the reported scores are reproducible.
GBC = GradientBoostingClassifier(random_state=42)
cross_val_score(GBC, X_train, y_train, cv=3, scoring="accuracy")

array([0.65266667, 0.65525   , 0.65683333])

In [39]:
# Soft-voting ensemble of the four strongest models.
# Soft voting averages predict_proba over the members, and SVC only provides
# predict_proba when constructed with probability=True. The plain `svc`
# instance (SVC()) would fit but make votingC.predict / predict_proba fail,
# so the ensemble gets its own probability-enabled SVC. random_state fixes the
# internal cross-validation SVC uses to calibrate those probabilities.
votingC = VotingClassifier(
    estimators=[
        ('rfc', rfc),
        ('extc', ExtC),
        ('svc', SVC(probability=True, random_state=42)),
        ('gbc', GBC),
    ],
    voting='soft',
    n_jobs=4,
)

votingC = votingC.fit(X_train, y_train)

In [28]:
from xgboost import XGBClassifier
# XGBoost with default hyper-parameters, scored with 3-fold cross-validated accuracy.
xgb = XGBClassifier()
cross_val_score(estimator=xgb, X=X_train, y=y_train, cv=3, scoring="accuracy")

array([0.66966667, 0.65933333, 0.66716667])

In [29]:
# Hand-tuned XGBoost: many shallow, heavily regularised trees with a small
# learning rate, scored with 3-fold cross-validated accuracy.
# The target has three classes (0, 1, 2), so the objective is the multiclass
# 'multi:softprob' rather than 'binary:logistic'. The binary-only
# scale_pos_weight (it was at its default of 1) and the deprecated `silent`
# flag are dropped; use `verbosity` if training logs are wanted.
model = XGBClassifier(
    objective='multi:softprob',
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=4,
    gamma=10,
    subsample=0.8,
    colsample_bytree=0.4,
    reg_alpha=0.3,
)
cross_val_score(model, X_train, y_train, cv=3, scoring="accuracy")

array([0.65575   , 0.64616667, 0.65141667])