In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('Stars.csv')

In [3]:
data

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.002400,0.1700,16.12,Red,M,0
1,3042,0.000500,0.1542,16.60,Red,M,0
2,2600,0.000300,0.1020,18.70,Red,M,0
3,2800,0.000200,0.1600,16.65,Red,M,0
4,1939,0.000138,0.1030,20.06,Red,M,0
...,...,...,...,...,...,...,...
235,38940,374830.000000,1356.0000,-9.93,Blue,O,5
236,30839,834042.000000,1194.0000,-10.63,Blue,O,5
237,8829,537493.000000,1423.0000,-10.73,White,A,5
238,9235,404940.000000,1112.0000,-11.23,White,A,5


In [4]:
data.isnull().sum()

Temperature       0
L                 0
R                 0
A_M               0
Color             0
Spectral_Class    0
Type              0
dtype: int64

In [5]:
data.dtypes

Temperature         int64
L                 float64
R                 float64
A_M               float64
Color              object
Spectral_Class     object
Type                int64
dtype: object

In [6]:
le = LabelEncoder()
data['Color']= le.fit_transform(data['Color'])
data['Spectral_Class']= le.fit_transform(data['Spectral_Class'])

In [7]:
data['Type'].unique()

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [8]:
data

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.002400,0.1700,16.12,8,5,0
1,3042,0.000500,0.1542,16.60,8,5,0
2,2600,0.000300,0.1020,18.70,8,5,0
3,2800,0.000200,0.1600,16.65,8,5,0
4,1939,0.000138,0.1030,20.06,8,5,0
...,...,...,...,...,...,...,...
235,38940,374830.000000,1356.0000,-9.93,0,6,5
236,30839,834042.000000,1194.0000,-10.63,0,6,5
237,8829,537493.000000,1423.0000,-10.73,9,0,5
238,9235,404940.000000,1112.0000,-11.23,9,0,5


In [9]:
x = data.drop('Type', axis=1)

In [10]:
x

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class
0,3068,0.002400,0.1700,16.12,8,5
1,3042,0.000500,0.1542,16.60,8,5
2,2600,0.000300,0.1020,18.70,8,5
3,2800,0.000200,0.1600,16.65,8,5
4,1939,0.000138,0.1030,20.06,8,5
...,...,...,...,...,...,...
235,38940,374830.000000,1356.0000,-9.93,0,6
236,30839,834042.000000,1194.0000,-10.63,0,6
237,8829,537493.000000,1423.0000,-10.73,9,0
238,9235,404940.000000,1112.0000,-11.23,9,0


In [11]:
y= data['Type']

In [12]:
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [13]:
pca = PCA()
x_1 = pca.fit_transform(x)

In [40]:
num = pca.n_components_
num

6

In [14]:
explained_variance = pca.explained_variance_ratio_
explained_variance

array([0.44989666, 0.24618639, 0.15804451, 0.05664699, 0.04625368,
       0.04297177])

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

In [16]:
classifier = RandomForestClassifier(max_depth=2, random_state=5)
classifier.fit(x_train, y_train)

In [17]:
y_pred = classifier.predict(x_test)

In [18]:
accuracy_score(y_test, y_pred)

0.9791666666666666

## With 2 Components

In [19]:
pca2 = PCA(n_components=2)
x_2 = pca2.fit_transform(x)

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x_2, y, test_size=0.2, random_state=5)

In [21]:
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(x_train, y_train)

In [22]:
y_pred = classifier.predict(x_test)

In [23]:
accuracy_score(y_test, y_pred)

0.6875

## With 3 Components

In [24]:
pca3 = PCA(n_components=3)
x_3 = pca3.fit_transform(x)

In [25]:
x_train, x_test, y_train, y_test = train_test_split(x_3, y, test_size=0.2, random_state=5)

In [26]:
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(x_train, y_train)

In [27]:
y_pred = classifier.predict(x_test)

In [28]:
accuracy_score(y_test, y_pred)

0.7708333333333334

## With 4 Components

In [29]:
pca4 = PCA(n_components=4)
x_4 = pca4.fit_transform(x)

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x_4, y, test_size=0.2, random_state=5)

In [31]:
classifier = RandomForestClassifier(max_depth=2, random_state=5)
classifier.fit(x_train, y_train)

In [32]:
y_pred = classifier.predict(x_test)

In [33]:
print('Accuracy: ' , accuracy_score(y_test, y_pred))

Accuracy:  0.8333333333333334
