In [1]:
import numpy as np
import	matplotlib.pyplot as plt
import pandas as pd

# Load the training catalogue indexed by object ID. The unfiltered frame keeps
# its own name so the filtering step below never overwrites its input.
train_raw = pd.read_csv('train.csv').set_index('ID')

# Magnitudes, used to build colours
H = train_raw['Hmag']
J = train_raw['Jmag']
U = train_raw['umag']
G = train_raw['gmag']
r = train_raw['rmag']
i = train_raw['imag']
z = train_raw['zmag']
Y = train_raw['Ymag']
K = train_raw['Kmag']
W1 = train_raw['W1mag']
W2 = train_raw['W2mag']

# Half-light radii (radius enclosing half of the light)
Y_r = train_raw['Yhlr']
J_r = train_raw['Jhlr']
H_r = train_raw['Hhlr']
K_r = train_raw['Khlr']

# Magnitudes in a fixed 3'' aperture
U3 = train_raw['u3mag']
G3 = train_raw['g3mag']
r3 = train_raw['r3mag']
i3 = train_raw['i3mag']
Z3 = train_raw['Z3mag']
Y3 = train_raw['Y3mag']
J3 = train_raw['J3mag']
H3 = train_raw['H3mag']
K3 = train_raw['K3mag']

# Coordinates
ra = train_raw['RAdeg']
dec = train_raw['DEdeg']

# Training labels
clase = train_raw['Hclass']  # 0: star, 1: galaxy, 2: QSO

# Drop rows with an unusable half-light radius in any of Y, J, H, K.
# A radius is kept only if it is strictly greater than this threshold; NaN
# fails the comparison as well, so genuinely missing values are dropped too.
# NOTE(review): presumably the catalogue flags missing radii with a value
# at or below -98 (e.g. -99) -- confirm against the data description.
HLR_VALID_MIN = -98
mask_y = Y_r > HLR_VALID_MIN
mask_J = J_r > HLR_VALID_MIN
mask_H = H_r > HLR_VALID_MIN
mask_K = K_r > HLR_VALID_MIN
mask_total = mask_y & mask_J & mask_H & mask_K

# Take an explicit copy of the selected rows: `train` is then an independent
# frame, so adding columns below cannot trigger pandas' chained-assignment
# warning or touch `train_raw`.
train = train_raw.loc[mask_total].copy()

# Add the derived columns U-G, G-R, i-z, J3/J_r and Y3/Y_r.
# The right-hand sides are computed on the unfiltered series and aligned to
# the filtered index on assignment, so only the kept rows receive values.
train['U-G'] = U - G
train['G-R'] = G - r
train['i-z'] = i - z
# NOTE(review): a radius of exactly 0 would produce +/-inf in these ratios.
train['J3/J_r'] = J3 / J_r
train['Y3/Y_r'] = Y3 / Y_r

In [2]:
from sklearn.ensemble import RandomForestClassifier
from	sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report


# Model inputs, in the order the classifier sees them: three colours,
# the total magnitudes, and the fixed-aperture magnitudes.
features = [
    'U-G', 'G-R', 'i-z',
    'Hmag', 'Jmag', 'umag', 'gmag', 'rmag', 'imag', 'zmag', 'Ymag', 'Kmag',
    'W1mag', 'W2mag',
    'u3mag', 'g3mag', 'r3mag', 'i3mag', 'Z3mag', 'Y3mag', 'J3mag', 'H3mag', 'K3mag',
]

# Hold out 30% of the filtered training frame for evaluation.
df_train, df_test = train_test_split(train, random_state=42, test_size=0.3)
X_train, Y_train = df_train.loc[:, features], df_train['Hclass']
X_test, Y_test = df_test.loc[:, features], df_test['Hclass']

# The three size limits below are all there to curb overfitting:
#   min_samples_split - minimum number of samples a node needs before it may be split
#   min_samples_leaf  - minimum number of samples left in a leaf
#   max_depth         - maximum depth of each tree
clf = RandomForestClassifier(
    n_estimators=150,
    criterion='entropy',
    max_depth=15,
    min_samples_split=15,
    min_samples_leaf=3,
    random_state=42,
)

# fit() returns the estimator itself, so training and prediction can be chained.
Y_pred = clf.fit(X_train, Y_train).predict(X_test)

# Report, in order: confusion matrix, per-class report, weighted F1.
for report in (
    confusion_matrix(Y_test, Y_pred),
    classification_report(Y_test, Y_pred),
    f1_score(Y_test, Y_pred, average='weighted'),
):
    print(report)

[[1624   39    7]
 [   4 7184   36]
 [   1   69  803]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      1670
           1       0.99      0.99      0.99      7224
           2       0.95      0.92      0.93       873

    accuracy                           0.98      9767
   macro avg       0.98      0.96      0.97      9767
weighted avg       0.98      0.98      0.98      9767

0.9839400330909016


In [3]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the classifier on the training split only,
# scored with the weighted F1; one score per fold is printed.
scores = cross_val_score(
    estimator=clf,
    X=X_train,
    y=Y_train,
    scoring='f1_weighted',
    cv=5,
)
print(scores)

[0.98331368 0.98285309 0.98228984 0.98305493 0.98523249]


In [4]:
# Load the test catalogue with the object ID as the index.
test = pd.read_csv('test.csv').set_index('ID')

# Magnitudes, used to build colours
H, J, U, G, r, i, z, Y, K, W1, W2 = (
    test[band]
    for band in ('Hmag', 'Jmag', 'umag', 'gmag', 'rmag', 'imag', 'zmag',
                 'Ymag', 'Kmag', 'W1mag', 'W2mag')
)

# Half-light radii (radius enclosing half of the light)
Y_r, J_r, H_r, K_r = (
    test[radius] for radius in ('Yhlr', 'Jhlr', 'Hhlr', 'Khlr')
)

# Magnitudes in a fixed 3'' aperture
U3, G3, r3, i3, Z3, Y3, J3, H3, K3 = (
    test[aperture]
    for aperture in ('u3mag', 'g3mag', 'r3mag', 'i3mag', 'Z3mag',
                     'Y3mag', 'J3mag', 'H3mag', 'K3mag')
)

# Coordinates
ra, dec = test['RAdeg'], test['DEdeg']

# Missing-value rows are NOT dropped here, because otherwise the submission
# would not be valid on Kaggle.

# Add the derived columns U-G, G-R, J3/J_r, Y3/Y_r and i-z (in that order).
derived = {
    'U-G': U - G,
    'G-R': G - r,
    'J3/J_r': J3 / J_r,
    'Y3/Y_r': Y3 / Y_r,
    'i-z': i - z,
}
for column, values in derived.items():
    test[column] = values

# Keep only the columns the classifier was trained on.
X_test_final = test.loc[:, features]

predictions = clf.predict(X_test_final)

# Store the predicted class next to the inputs and write the submission file
# (the ID index plus the single Hclass column).
test['Hclass'] = predictions
test[['Hclass']].to_csv('submit10.csv', index=True)