In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pd.pandas.set_option('display.max_columns', None)

In [2]:
train = pd.read_csv('astro_train.csv')
test = pd.read_csv('astro_test.csv')

In [3]:
X = train.drop("class", axis=1)
y = train["class"]

In [4]:
dataset = pd.concat([X, test])

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X["err_g_log"] = np.log(X["err_g"])
        X["err_i_log"] = np.log(X["err_i"])
        X["err_r_cbrt"] = np.cbrt(X["err_r"])
        X["err_u_log"] = np.log(X["err_u"])
        X["err_z_log"] = np.log(X["err_z"])

        X = X.drop(['id','skyVersion', 'camCol', 'run', 'rerun', 'err_g', 'err_i', 'err_r', 'err_u','err_z','#ra', 'field', 'obj', 'dec'], axis=1)

        return X

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

full_pipeline = Pipeline([
                ("attribs_adder", CombinedAttributesAdder()),
                ("std_scaler", StandardScaler())
                ])

In [7]:
transformed_dataset = full_pipeline.fit_transform(dataset)

In [8]:
transformed_dataset.shape

(60000, 12)

In [9]:
train_prepared = transformed_dataset[:45000, :]
test_prepared = transformed_dataset[45000:, :]

In [10]:
#over-sampling using SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE()
x_res, y_res = smote.fit_sample(train_prepared, y)

Using TensorFlow backend.


In [11]:
#creating model using random forest classifier
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(bootstrap=False, n_estimators = 600,criterion = 'entropy', random_state = 0)

In [12]:
#spliting data into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_res, y_res, test_size=0.2, random_state=1234)

In [13]:
from sklearn.model_selection import cross_val_score
accurarices = cross_val_score(classifier, X_train, y_train, cv= 5)
accurarices

array([0.75823353, 0.74548177, 0.74227355, 0.74334296, 0.74762058])

In [14]:
accurarices.mean()

0.7473904773065354

In [15]:
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=600,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [16]:
y_pred = classifier.predict(X_test)

from sklearn.metrics import classification_report 
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.70      0.75      3885
           1       0.74      0.81      0.78      3916
           2       0.76      0.78      0.77      3889

    accuracy                           0.77     11690
   macro avg       0.77      0.76      0.76     11690
weighted avg       0.77      0.77      0.76     11690



In [17]:
final_pred = classifier.predict(test_prepared)

In [18]:
sub = pd.DataFrame({"id":test["id"], "class":final_pred})
sub.to_csv("submission.csv",index=False)