In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
df=pd.read_csv('star_classification.csv')
#feature selection
df = df.drop(['obj_ID','alpha','delta','run_ID','rerun_ID','cam_col','field_ID','spec_obj_ID','plate','MJD','fiber_ID'], axis='columns')

#cleaning the data
df = df[(df['u'] >= 0)]
#df = df[(df['g'] >= 0)]
#df = df[(df['z'] >= 0)]

print(df.describe())
print(df.head(10))

                  u             g             r             i             z  \
count  99999.000000  99999.000000  99999.000000  99999.000000  99999.000000   
mean      22.080679     20.631583     19.645777     19.084865     18.768988   
std        2.251068      2.037384      1.854763      1.757900      1.765982   
min       10.996230     10.498200      9.822070      9.469903      9.612333   
25%       20.352410     18.965240     18.135795     17.732280     17.460830   
50%       22.179140     21.099930     20.125310     19.405150     19.004600   
75%       23.687480     22.123775     21.044790     20.396510     19.921120   
max       32.781390     31.602240     29.571860     32.141470     29.383740   

           redshift  
count  99999.000000  
mean       0.576667  
std        0.730709  
min       -0.009971  
25%        0.054522  
50%        0.424176  
75%        0.704172  
max        7.011245  
          u         g         r         i         z   class  redshift
0  23.87882  22.2753

In [3]:
#turning the Class values into categorical data
df=df.rename(columns = {'class':'Class'})
df.Class = df.Class.astype('category')
cat_columns = df.select_dtypes(['category']).columns
cat_columns
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)


In [34]:
# raw data Log_Regression
## train test split
X = df.drop('Class',axis='columns')
y = np.array(df['Class'])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 42)

print(y_test[:10])

#array dimensions
y_train = y_train.transpose()
y_test = y_test.transpose()

# running Log_Regression with raw data
lg_model = LogisticRegression(multi_class='multinomial', solver='saga')
lg_model = lg_model.fit(X_train, y_train)

y_pred = lg_model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

[1 2 2 2 0 2 2 1 0 0]
0.9443
[[11294   486     8]
 [  258  3319     5]
 [  352     5  4273]]
              precision    recall  f1-score   support

           0       0.95      0.96      0.95     11788
           1       0.87      0.93      0.90      3582
           2       1.00      0.92      0.96      4630

    accuracy                           0.94     20000
   macro avg       0.94      0.94      0.94     20000
weighted avg       0.95      0.94      0.94     20000





In [39]:
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.1, 1, 10, 100, 1000],
    'solver': ['saga'],  #saga solver for multiclass and elasticnet
    'l1_ratio': [0.1, 0.5, 0.7],
    'max_iter': [500, 1000, 2000]
}

grid_search = GridSearchCV(estimator=lg_model, 
                           param_grid=param_grid, 
                           cv=3, #cross validation
                           verbose=1,
                           n_jobs=-1) 

grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

lg_model = LogisticRegression()
lg_model = lg_model.fit(X_train, y_train)

y_pred = grid_search.best_estimator_.predict(X_test)
y_pred = lg_model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

Fitting 3 folds for each of 45 candidates, totalling 135 fits




{'C': 1000, 'l1_ratio': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}
0.9299
[[11264   492   256]
 [  227  3305     1]
 [  413    13  4029]]
              precision    recall  f1-score   support

           0       0.95      0.94      0.94     12012
           1       0.87      0.94      0.90      3533
           2       0.94      0.90      0.92      4455

    accuracy                           0.93     20000
   macro avg       0.92      0.93      0.92     20000
weighted avg       0.93      0.93      0.93     20000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
#normalizing

X = df.drop('Class', axis=1)
y = df['Class']

# normalize features
scaler = preprocessing.MinMaxScaler()  #default range [0, 1]
X_normalized = scaler.fit_transform(X)

#put everything together in a pandas dataframe, list
X_normalized = pd.DataFrame(X_normalized, columns=X.columns)
df_normalized = pd.concat([X_normalized, y.reset_index(drop=True)], axis=1)

print(df_normalized.head(10))
print(df_normalized.describe())

          u         g         r         i         z  redshift  Class
0  0.591347  0.558050  0.535344  0.427665  0.464377  0.091831      0
1  0.632603  0.584423  0.646203  0.515986  0.607035  0.112389      0
2  0.654888  0.576463  0.546218  0.435729  0.472194  0.093170      0
3  0.511384  0.629186  0.596946  0.486717  0.487460  0.134210      0
4  0.387463  0.335579  0.337999  0.287021  0.300043  0.017959      0
5  0.573420  0.608393  0.582279  0.475761  0.502398  0.204328      1
6  0.480763  0.505971  0.562346  0.491292  0.546921  0.084946      1
7  0.516570  0.546034  0.532623  0.441877  0.467223  0.069358      0
8  0.615402  0.561906  0.546246  0.440860  0.472712  0.095423      0
9  0.493476  0.451891  0.473598  0.412337  0.457321  0.001419      2
                  u             g             r             i             z  \
count  99999.000000  99999.000000  99999.000000  99999.000000  99999.000000   
mean       0.508807      0.480163      0.497408      0.424098      0.463126   
std 

In [30]:
# normalized data Log_Regression

## train test split
X = df_normalized.drop('Class',axis='columns')
y = np.array(df_normalized['Class'])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 42)

#array dimensions
y_train = y_train.transpose()
y_test = y_test.transpose()

# running Log_Regression with normalized data
lg_model = LogisticRegression(multi_class='multinomial', solver='saga', penalty='elasticnet', l1_ratio = 0.5, C=0.9)
lg_model = lg_model.fit(X_train, y_train)

y_pred = lg_model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.93795
[[11291   487   125]
 [  223  3311     4]
 [  390    12  4157]]
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     11903
           1       0.87      0.94      0.90      3538
           2       0.97      0.91      0.94      4559

    accuracy                           0.94     20000
   macro avg       0.93      0.93      0.93     20000
weighted avg       0.94      0.94      0.94     20000



In [8]:
#creating a subset of the training data

train, test = train_test_split(df_normalized,test_size=0.2, random_state = 42)

#subset the dataframe
train['Class'] = train['Class'].astype('category')

train['Class_code'] = train['Class'].cat.codes
sub1 = train[train['Class_code'] == train['Class'].cat.categories.get_loc(1)]

#creating a subset of galaxies sub0
n_sub1 = len(sub1)
sub0_all = train[train['Class_code'] == train['Class'].cat.categories.get_loc(0)]
sub0 = sub0_all.sample(n=n_sub1, random_state=42)  

#creating a subset of stars sub2
sub2_all = train[train['Class_code'] == train['Class'].cat.categories.get_loc(2)]
sub2 = sub2_all.sample(n=n_sub1, random_state=42)  

#merge the subsets
sub=pd.concat([sub0, sub1, sub2], axis=0)

sub_shuffle = sub.sample(frac=1, random_state=42).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Class'] = train['Class'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Class_code'] = train['Class'].cat.codes


In [9]:
# running Log_Regression with normalized and subseted data

#train test split
X_train = sub_shuffle.drop('Class',axis='columns')
X_train = X_train.drop('Class_code',axis='columns')
y_train = np.array(sub_shuffle['Class'])

X_test = test.drop('Class',axis='columns')
y_test = np.array(test['Class'])

#array dimensions
y_train = y_train.transpose()
y_test = y_test.transpose()

#running Log_Reg
lg_model = LogisticRegression(multi_class='multinomial', solver='saga', penalty='elasticnet', l1_ratio = 0.5, C=0.9)
lg_model = lg_model.fit(X_train, y_train)

y_pred = lg_model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.8816
[[9872  306   10]
 [ 634 3485    1]
 [1398   19 4275]]
              precision    recall  f1-score   support

           0       0.83      0.97      0.89     10188
           1       0.91      0.85      0.88      4120
           2       1.00      0.75      0.86      5692

    accuracy                           0.88     20000
   macro avg       0.91      0.86      0.88     20000
weighted avg       0.89      0.88      0.88     20000

