In [24]:
import pandas as pd
from pandas import DataFrame

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load the Data obtained from 01_Data_Analysis

In [25]:
# NOTE(review): absolute, machine-specific location; the notebook only runs where this file exists.
path: str = r"D:\Documents\GitHub\UNI_Stellar_Classification\Data\star_classification_only_features.csv"

# Read the CSV with its "index" column as the row index and discard the
# "Unnamed: 0" column in the same expression (no in-place mutation).
data: DataFrame = pd.read_csv(path, index_col="index").drop(columns=["Unnamed: 0"])

In [26]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0_level_0,u,g,r,i,z,redshift,plate,class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,23.87882,22.2753,20.39501,19.16573,18.79371,0.634794,5812,GALAXY
1,24.77759,22.83188,22.58444,21.16812,21.61427,0.779136,10445,GALAXY
2,25.26307,22.66389,20.60976,19.34857,18.94827,0.644195,4576,GALAXY
3,22.13682,23.77656,21.61162,20.50454,19.2501,0.932346,9149,GALAXY
4,19.43718,17.58028,16.49747,15.97711,15.54461,0.116123,6121,GALAXY


In [27]:
# Summarise column dtypes and non-null counts of the loaded table.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99999 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   u         99999 non-null  float64
 1   g         99999 non-null  float64
 2   r         99999 non-null  float64
 3   i         99999 non-null  float64
 4   z         99999 non-null  float64
 5   redshift  99999 non-null  float64
 6   plate     99999 non-null  int64  
 7   class     99999 non-null  object 
dtypes: float64(6), int64(1), object(1)
memory usage: 6.9+ MB


# Transform the target variable into numeric values

In [28]:
# Integer code assigned to each value of the string column "class".
target_encoding: dict = {
    "GALAXY": 0,
    "QSO": 1,
    "STAR": 2
}

# Validate before mapping: Series.map with a dict turns labels that are missing
# from the dict into NaN instead of raising, which would silently corrupt the
# target column. Raising KeyError keeps the failure mode of a plain dict lookup.
unknown_classes = set(data["class"].unique()) - set(target_encoding)
if unknown_classes:
    raise KeyError(
        f"no target encoding defined for class value(s): {sorted(unknown_classes, key=str)}"
    )

# Vectorised dict lookup instead of calling a Python lambda once per row.
data["target"] = data["class"].map(target_encoding)

# Display the distinct encoded values as a sanity check.
data["target"].unique()

array([0, 1, 2], dtype=int64)

# Split the data into train and test samples
Before any further preprocessing steps are applied, the data is split into a training set and a testing set.
The preprocessing is then applied to both sets, using only information derived from the training set, so that nothing about the test data leaks into the preprocessing.

# Train - Test Split
The train split will be used for hyperparameter tuning, while the test split will only be used to evaluate the model configurations that performed best on the training data.

The train-test split is performed in a stratified way, which ensures that the distribution of the target variable in both splits matches that of the original data. This avoids an "unlucky" split in which the evaluation data is not representative.

In [29]:
# Selecting a single column yields a Series, not a DataFrame.
labels: pd.Series = data["target"]

# 75 % / 25 % split of the feature columns. Both the string column "class" and
# its numeric encoding "target" are removed from the features. The split is
# shuffled, stratified on the labels, and seeded so it can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns=["class", "target"]),
    labels,
    train_size=.75,
    test_size=.25,
    shuffle=True,
    stratify=labels,
    random_state=1337
)

print(f"num training samples: {len(x_train)}, shape: {x_train.shape}")
print(f"num testing samples:  {len(x_test)}, shape: {x_test.shape}")

num training samples: 74999, shape: (74999, 7)
num testing samples:  25000, shape: (25000, 7)


# Rescale all features
For trees and tree-based ensemble methods this is not necessary; however, it can be beneficial for logistic regression models and SVM classifiers.
Thus, both a rescaled and a non-rescaled version of the dataset will be saved.

There are several methods for rescaling data, such as standardization or min-max scaling.
Since it was observed that none of the features is close to a normal distribution,
standardization does not seem to fit this data well. Thus, min-max scaling is used.

In [30]:
min_max_scaler = MinMaxScaler()

# The scaler is fitted on the training features only; the test features are
# transformed with that same fitted scaler.
train_scaled_values = min_max_scaler.fit_transform(x_train, y_train)
test_scaled_values = min_max_scaler.transform(x_test)
scaled_columns = min_max_scaler.get_feature_names_out()

# Rebuild labelled frames on the original row indices, then attach the target
# column (aligned on that index).
mm_scaled_train_data: DataFrame = DataFrame(
    train_scaled_values, index=x_train.index, columns=scaled_columns
).assign(target=y_train)
mm_scaled_test_data: DataFrame = DataFrame(
    test_scaled_values, index=x_test.index, columns=scaled_columns
).assign(target=y_test)

print(f"train data =    {len(mm_scaled_train_data)}")
print(f"test data =     {len(mm_scaled_test_data)}")

train data =    74999
test data =     25000


In [31]:
# Preview the first rows of the min-max-scaled training data.
mm_scaled_train_data.head()

Unnamed: 0_level_0,u,g,r,i,z,redshift,plate,target
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
39953,0.483393,0.397927,0.41027,0.384353,0.419801,0.029955,0.098526,0
25395,0.525559,0.507477,0.558333,0.548897,0.598689,0.17397,0.608338,1
62572,0.625665,0.552068,0.601967,0.581113,0.637224,0.001334,0.575523,2
58556,0.636801,0.588583,0.62747,0.609505,0.729539,0.039602,0.69408,0
67656,0.487465,0.47856,0.544779,0.53118,0.589709,0.29218,0.660777,1


In [32]:
# Attach the encoded target to the unscaled splits. `assign` returns a new
# frame with the column added (aligned on the row index), and the name is
# rebound to it.
x_train = x_train.assign(target=y_train)
x_test = x_test.assign(target=y_test)

x_train.head(n=5)

Unnamed: 0_level_0,u,g,r,i,z,redshift,plate,target
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
39953,20.50174,18.89606,17.92482,17.45683,17.24972,0.200347,1476,0
25395,21.3309,21.20801,20.84902,20.87608,20.50419,1.211507,7737,1
62572,23.2994,22.14906,21.7108,21.54554,21.20525,-0.000603,7334,2
58556,23.51839,22.91968,22.21448,22.13553,22.88474,0.268084,8790,0
67656,20.58182,20.59775,20.58134,20.50792,20.34083,2.04149,8381,1


# Save Data
One dataset contains the original (unscaled) values, while the other contains the min-max-scaled data.

In [33]:
# Destinations for the unscaled train/test splits.
# NOTE(review): absolute, machine-specific paths.
save_path_train_data: str = r"D:\Documents\GitHub\UNI_Stellar_Classification\Data\star_classification_preprocessed_train_data.csv"
save_path_test_data: str = r"D:\Documents\GitHub\UNI_Stellar_Classification\Data\star_classification_preprocessed_test_data.csv"

# Write each split to its CSV file (row index included, as per the to_csv default).
for frame, destination in ((x_train, save_path_train_data), (x_test, save_path_test_data)):
    frame.to_csv(destination)

In [34]:
# Destinations for the min-max-scaled train/test splits.
# NOTE(review): absolute, machine-specific paths.
save_path_train_data_mm_scale: str = r"D:\Documents\GitHub\UNI_Stellar_Classification\Data\star_classification_preprocessed_train_data_min_max_scale.csv"
save_path_test_data_mm_scale: str = r"D:\Documents\GitHub\UNI_Stellar_Classification\Data\star_classification_preprocessed_test_data_min_max_scale.csv"

# Write each scaled split to its CSV file (row index included, as per the to_csv default).
for frame, destination in (
    (mm_scaled_train_data, save_path_train_data_mm_scale),
    (mm_scaled_test_data, save_path_test_data_mm_scale),
):
    frame.to_csv(destination)