# Preliminary analysis of the data. 

This notebook is used to get a better understanding of the data and their distribution, and to get a better idea of which features to use in the model.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the raw star-classification table; the relative path is resolved
# against the current working directory.
df = pd.read_csv(filepath_or_buffer='../data/raw/star_classification.csv')

## Number of samples and their distribution

In [4]:
# Dataset dimensions, printed as (number of rows, number of columns)
print((len(df.index), len(df.columns)))

(100000, 18)


In [5]:
# Row count for each label of the target column 'class'
print(df.loc[:, 'class'].value_counts())

class
GALAXY    59445
STAR      21594
QSO       18961
Name: count, dtype: int64


## Data values

In this part we will look at the values of the data, compute summary statistics, and check for missing values (NaN) and duplicates.

In [3]:
# Preview the first rows, then the descriptive statistics of the frame,
# with a separator line between the two tables
print(
    df.head(),
    "=====================================",
    df.describe(),
    sep="\n",
)

         obj_ID       alpha      delta         u         g         r  \
0  1.237661e+18  135.689107  32.494632  23.87882  22.27530  20.39501   
1  1.237665e+18  144.826101  31.274185  24.77759  22.83188  22.58444   
2  1.237661e+18  142.188790  35.582444  25.26307  22.66389  20.60976   
3  1.237663e+18  338.741038  -0.402828  22.13682  23.77656  21.61162   
4  1.237680e+18  345.282593  21.183866  19.43718  17.58028  16.49747   

          i         z  run_ID  rerun_ID  cam_col  field_ID   spec_obj_ID  \
0  19.16573  18.79371    3606       301        2        79  6.543777e+18   
1  21.16812  21.61427    4518       301        5       119  1.176014e+19   
2  19.34857  18.94827    3606       301        2       120  5.152200e+18   
3  20.50454  19.25010    4192       301        3       214  1.030107e+19   
4  15.97711  15.54461    8102       301        3       137  6.891865e+18   

    class  redshift  plate    MJD  fiber_ID  
0  GALAXY  0.634794   5812  56354       171  
1  GALAXY  0.77913

In [13]:
# Per-column count of missing values, followed by the number of rows that
# duplicate an earlier row
print("Missing values : ")
print(df.isna().sum())
print("=====================================")
print("Duplicates : {}".format(df.duplicated().sum()))

Missing values : 
alpha          0
delta          0
u              0
g              0
r              0
i              0
z              0
run_ID         0
rerun_ID       0
cam_col        0
field_ID       0
spec_obj_ID    0
redshift       0
plate          0
MJD            0
fiber_ID       0
dtype: int64
Duplicates : 0


In [7]:
# Number of distinct non-null values in each column
print(df.nunique(axis=0, dropna=True))

obj_ID          78053
alpha           99999
delta           99999
u               93748
g               92651
r               91901
i               92019
z               92007
run_ID            430
rerun_ID            1
cam_col             6
field_ID          856
spec_obj_ID    100000
class               3
redshift        99295
plate            6284
MJD              2180
fiber_ID         1000
dtype: int64


In [8]:
# Show the dtype pandas inferred for every column when reading the CSV.
# NOTE(review): in the output below, obj_ID and spec_obj_ID are float64 with
# values around 1e18; that is beyond the range where float64 represents every
# integer exactly (2**53), so distinct identifiers may collapse to the same
# float -- confirm against the raw CSV before relying on these columns as keys.
print(df.dtypes)

obj_ID         float64
alpha          float64
delta          float64
u              float64
g              float64
r              float64
i              float64
z              float64
run_ID           int64
rerun_ID         int64
cam_col          int64
field_ID         int64
spec_obj_ID    float64
class           object
redshift       float64
plate            int64
MJD              int64
fiber_ID         int64
dtype: object


### Searching for outliers

Using domain knowledge, we know that a galaxy cannot have a redshift of exactly 0, so we can remove those rows.

In [10]:
# Split the table into one sub-frame per class, then print how many galaxies
# and how many quasars have a redshift exactly equal to 0
df_gala = df.loc[df['class'].eq("GALAXY")]
df_qso = df.loc[df['class'].eq("QSO")]
df_star = df.loc[df['class'].eq("STAR")]
# Summing the boolean mask counts the matching rows
print(int(df_gala['redshift'].eq(0).sum()))
print(int(df_qso['redshift'].eq(0).sum()))

412
0
[-7.895373e-06  7.182029e-05 -4.285760e-04 ... -1.527468e-04 -3.949004e-04
  3.665894e-04]
