In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
training = pd.read_csv("pulsar_stars.csv")
training.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [4]:
train_train, train_test = train_test_split(training, test_size = 0.2)

train_x = train_train.drop('target_class', axis = 1)
train_y = train_train['target_class']

train_test_x = train_test.drop('target_class', axis = 1)
train_test_y = train_test['target_class'] 

In [5]:
# model
tree = DecisionTreeClassifier(criterion = "entropy")

# train
tree.fit(train_x, train_y)

# predict
tree_predictions = tree.predict(train_test_x)
print(pd.crosstab(train_test_y, tree_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True))
from sklearn import metrics
accuracy = metrics.accuracy_score(train_test_y, tree_predictions, normalize=True, sample_weight=None)
print(accuracy)
pd.DataFrame({'Gain': tree.feature_importances_}, index = train_x.columns).sort_values('Gain', ascending = False)

Predicted:     0    1   All
Actual                     
0           3204   61  3265
1             53  262   315
All         3257  323  3580
0.9681564245810056


Unnamed: 0,Gain
Excess kurtosis of the integrated profile,0.780475
Standard deviation of the DM-SNR curve,0.070435
Excess kurtosis of the DM-SNR curve,0.042389
Standard deviation of the integrated profile,0.037241
Skewness of the integrated profile,0.035683
Mean of the integrated profile,0.033776


In [6]:
# model
bag = BaggingClassifier()

# train
bag.fit(train_x, train_y)

# predict
bag_predictions = bag.predict(train_test_x)

# confusion matrix
print(pd.crosstab(train_test_y, bag_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True))
from sklearn import metrics
accuracy = metrics.accuracy_score(train_test_y, bag_predictions, normalize=True, sample_weight=None)
accuracy

Predicted:     0    1   All
Actual                     
0           3240   25  3265
1             58  257   315
All         3298  282  3580


0.9768156424581006

In [7]:
forest = RandomForestClassifier(criterion = 'entropy')

# train
forest.fit(train_x, train_y)

# predict
forest_predictions = forest.predict(train_test_x)

# feature importances
print(pd.DataFrame({'Importance': forest.feature_importances_}, index = train_x.columns).sort_values('Importance', ascending = False))

# confusion matrix
print(pd.crosstab(train_test_y, forest_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True))
from sklearn import metrics
accuracy = metrics.accuracy_score(train_test_y, forest_predictions, normalize=True, sample_weight=None)
accuracy

                                               Importance
 Excess kurtosis of the integrated profile       0.336231
 Skewness of the integrated profile              0.219322
 Mean of the integrated profile                  0.175177
 Excess kurtosis of the DM-SNR curve             0.113362
 Standard deviation of the DM-SNR curve          0.099249
 Standard deviation of the integrated profile    0.056660
Predicted:     0    1   All
Actual                     
0           3244   21  3265
1             60  255   315
All         3304  276  3580


0.9773743016759776

In [8]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(fit_intercept=False)

clf.fit(train_x, train_y)

y_predicted = clf.predict(train_test_x)
from sklearn import metrics
accuracy = metrics.accuracy_score(train_test_y, y_predicted, normalize=True, sample_weight=None)
accuracy

0.9776536312849162