In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
df=pd.read_csv('star_classification.csv')
#feature selection
df = df.drop(['obj_ID','alpha','delta','run_ID','rerun_ID','cam_col','field_ID','spec_obj_ID','plate','MJD','fiber_ID'], axis='columns')

#cleaning the data
df = df[(df['u'] >= 0)]
#df = df[(df['g'] >= 0)]
#df = df[(df['z'] >= 0)]

print(df.describe())
print(df.head(10))

                  u             g             r             i             z  \
count  99999.000000  99999.000000  99999.000000  99999.000000  99999.000000   
mean      22.080679     20.631583     19.645777     19.084865     18.768988   
std        2.251068      2.037384      1.854763      1.757900      1.765982   
min       10.996230     10.498200      9.822070      9.469903      9.612333   
25%       20.352410     18.965240     18.135795     17.732280     17.460830   
50%       22.179140     21.099930     20.125310     19.405150     19.004600   
75%       23.687480     22.123775     21.044790     20.396510     19.921120   
max       32.781390     31.602240     29.571860     32.141470     29.383740   

           redshift  
count  99999.000000  
mean       0.576667  
std        0.730709  
min       -0.009971  
25%        0.054522  
50%        0.424176  
75%        0.704172  
max        7.011245  
          u         g         r         i         z   class  redshift
0  23.87882  22.2753

In [3]:
#turning the Class values into categorical data
df=df.rename(columns = {'class':'Class'})
df.Class = df.Class.astype('category')
cat_columns = df.select_dtypes(['category']).columns
cat_columns
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)

In [4]:
# raw data QDA

## train test split
X = df.drop('Class',axis='columns')
y = np.array(df['Class'])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 42)

print(y_test[:10])

#array dimensions
y_train = y_train.transpose()
y_test = y_test.transpose()

#running QDA on raw data
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train, y_train)

y_pred = qda_model.predict_proba(X_test)
y_pred = np.argmax(y_pred,axis=1)

print(accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

[1 2 2 2 0 2 2 1 0 0]
0.9455
[[11198   343    13]
 [  614  3467    28]
 [   92     0  4245]]
              precision    recall  f1-score   support

           0       0.94      0.97      0.95     11554
           1       0.91      0.84      0.88      4109
           2       0.99      0.98      0.98      4337

    accuracy                           0.95     20000
   macro avg       0.95      0.93      0.94     20000
weighted avg       0.95      0.95      0.94     20000



In [5]:
#hyperparameter tuning
param_grid = {
    'reg_param': [0.0, 0.01, 0.1, 0.5, 1.0],  # higher value, higher regularization
    'tol': [1e-4, 1e-3, 1e-2, 1e-1]           # Tolerance for stopping criterion
}

grid_search = GridSearchCV(estimator=qda_model, 
                           param_grid=param_grid, 
                           cv=3, #cross validation
                           verbose=1,
                           n_jobs=-1) 

grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

#running QDA with hyperparameters 
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train, y_train)

y_pred = grid_search.best_estimator_.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
{'reg_param': 0.0, 'tol': 0.0001}
0.9455
[[11198   343    13]
 [  614  3467    28]
 [   92     0  4245]]
              precision    recall  f1-score   support

           0       0.94      0.97      0.95     11554
           1       0.91      0.84      0.88      4109
           2       0.99      0.98      0.98      4337

    accuracy                           0.95     20000
   macro avg       0.95      0.93      0.94     20000
weighted avg       0.95      0.95      0.94     20000



In [6]:
#normalizing

X = df.drop('Class', axis=1)
y = df['Class']

#normalize features
scaler = preprocessing.MinMaxScaler()  #default range [0, 1]
X_normalized = scaler.fit_transform(X)

#put everything together in a pandas dataframe, list
X_normalized = pd.DataFrame(X_normalized, columns=X.columns)
df_normalized = pd.concat([X_normalized, y.reset_index(drop=True)], axis=1)

print(df_normalized.head(10))
print(df_normalized.describe())

          u         g         r         i         z  redshift  Class
0  0.591347  0.558050  0.535344  0.427665  0.464377  0.091831      0
1  0.632603  0.584423  0.646203  0.515986  0.607035  0.112389      0
2  0.654888  0.576463  0.546218  0.435729  0.472194  0.093170      0
3  0.511384  0.629186  0.596946  0.486717  0.487460  0.134210      0
4  0.387463  0.335579  0.337999  0.287021  0.300043  0.017959      0
5  0.573420  0.608393  0.582279  0.475761  0.502398  0.204328      1
6  0.480763  0.505971  0.562346  0.491292  0.546921  0.084946      1
7  0.516570  0.546034  0.532623  0.441877  0.467223  0.069358      0
8  0.615402  0.561906  0.546246  0.440860  0.472712  0.095423      0
9  0.493476  0.451891  0.473598  0.412337  0.457321  0.001419      2
                  u             g             r             i             z  \
count  99999.000000  99999.000000  99999.000000  99999.000000  99999.000000   
mean       0.508807      0.480163      0.497408      0.424098      0.463126   
std 

In [8]:
# normalized data QDA

## train test split
X = df_normalized.drop('Class',axis='columns')
y = np.array(df_normalized['Class'])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 42)

#array dimensions
y_train = y_train.transpose()
y_test = y_test.transpose()

#running QDA on normalized data
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train, y_train)

y_pred = qda_model.predict_proba(X_test)
y_pred = np.argmax(y_pred,axis=1)

print(accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.9455
[[11198   343    13]
 [  614  3467    28]
 [   92     0  4245]]
              precision    recall  f1-score   support

           0       0.94      0.97      0.95     11554
           1       0.91      0.84      0.88      4109
           2       0.99      0.98      0.98      4337

    accuracy                           0.95     20000
   macro avg       0.95      0.93      0.94     20000
weighted avg       0.95      0.95      0.94     20000



In [9]:
#creating a subset of the training data

train, test = train_test_split(df_normalized,test_size=0.2, random_state = 42)


#subset the dataframe
train['Class'] = train['Class'].astype('category')

train['Class_code'] = train['Class'].cat.codes
sub1 = train[train['Class_code'] == train['Class'].cat.categories.get_loc(1)]

#creating a subset of galaxies sub0
n_sub1 = len(sub1)
sub0_all = train[train['Class_code'] == train['Class'].cat.categories.get_loc(0)]
sub0 = sub0_all.sample(n=n_sub1, random_state=42)  

#creating a subset of stars sub2
sub2_all = train[train['Class_code'] == train['Class'].cat.categories.get_loc(2)]
sub2 = sub2_all.sample(n=n_sub1, random_state=42)  

#merge the subsets
sub=pd.concat([sub0, sub1, sub2], axis=0)

sub_shuffle = sub.sample(frac=1, random_state=42).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Class'] = train['Class'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Class_code'] = train['Class'].cat.codes


In [10]:
#running QDA on normalized and subset data

#train test split
X_train = sub_shuffle.drop('Class',axis='columns')
X_train = X_train.drop('Class_code',axis='columns')
y_train = np.array(sub_shuffle['Class'])

X_test = test.drop('Class',axis='columns')
y_test = np.array(test['Class'])

#array dimensions
y_train = y_train.transpose()
y_test = y_test.transpose()

#running QDA
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train, y_train)

y_pred = qda_model.predict_proba(X_test)
y_pred = np.argmax(y_pred,axis=1)

print(accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.94185
[[11050   270    10]
 [  762  3540    29]
 [   92     0  4247]]
              precision    recall  f1-score   support

           0       0.93      0.98      0.95     11330
           1       0.93      0.82      0.87      4331
           2       0.99      0.98      0.98      4339

    accuracy                           0.94     20000
   macro avg       0.95      0.92      0.94     20000
weighted avg       0.94      0.94      0.94     20000

