# Import Data


###  Derived From: https://www.kaggle.com/pavanraj159/predicting-a-pulsar-star


In [24]:
# Load scikit's random forest classifier library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
# from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict


# Finding the Target Class

In [25]:
training = pd.read_csv("pulsar_stars.csv")
# print(training)
X = training.drop("target_class", 1)
y = training["target_class"]
training.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [10]:
# Test train method
# train_train, train_test = train_test_split(training, test_size = 0.2)

# train_x = train_train.drop(["target_class"], axis = 1)
# train_y = train_train["target_class"]

# train_test_x = train_test.drop("target_class", 1)
# train_test_y = train_test["target_class"]

# Decision Trees

In [29]:
# model
tree = DecisionTreeClassifier(criterion = "entropy")

# train
tree.fit(X, y)

# predict
tree_predictions = tree.predict(X)
print(pd.crosstab(y, tree_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True))

# Accuracy
from sklearn import metrics
accuracy = metrics.accuracy_score(y, tree_predictions, normalize=True, sample_weight=None)
print(accuracy)

# Find Gains
pd.DataFrame({'Gain': tree.feature_importances_}, index = X.columns).sort_values('Gain', ascending = False)

Predicted:      0     1    All
Actual                        
0           16259     0  16259
1               0  1639   1639
All         16259  1639  17898
1.0


Unnamed: 0,Gain
Excess kurtosis of the integrated profile,0.780943
Standard deviation of the DM-SNR curve,0.065022
Mean of the integrated profile,0.038374
Skewness of the integrated profile,0.032142
Skewness of the DM-SNR curve,0.022566
Excess kurtosis of the DM-SNR curve,0.022541
Standard deviation of the integrated profile,0.02099
Mean of the DM-SNR curve,0.017421


# Bagging

In [30]:
# model
bag = BaggingClassifier()

# train
bag.fit(train_x, train_y)

# predict
bag_predictions = bag.predict(train_test_x)

# confusion matrix
pd.crosstab(train_test_y, bag_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True)


Predicted:,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3243,32,3275
1,43,262,305
All,3286,294,3580


In [31]:
from sklearn import metrics
accuracy = metrics.accuracy_score(train_test_y, bag_predictions, normalize=True, sample_weight=None)
accuracy

0.979050279329609

# Random Forests

In [33]:
# model
forest = RandomForestClassifier(n_estimators=20)

# use cross_val_score() to get ths scores for each train/test split
scores = cross_val_score(forest, X, y, cv = 10)
print(scores)
print()
print('Average score:', np.mean(scores))

# use cross_val_predict() to get predictions from each trani/test split if you want to see a confusion matrix
cv_predictions = cross_val_predict(forest, X, y, cv = 10)

# confusion matrix
pd.crosstab(y, cv_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True)

forest = RandomForestClassifier(criterion = 'entropy')

# train
forest.fit(X, y)

# predict
forest_predictions = forest.predict(X)

# confusion matrix
pd.crosstab(y, forest_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True)

# View a list of the features and their importance scores
# list(zip(train[features], clf.feature_importances_))

# model -- the only change is to set the 'oob_score' argument to 'True'
oob_forest = RandomForestClassifier(oob_score = True, n_estimators = 100)

# train
oob_forest.fit(X, y)

# Out of bag score
oob_forest.oob_score_

[0.97821229 0.97597765 0.97877095 0.98268156 0.98212291 0.97932961
 0.97486034 0.97877095 0.98324022 0.97147651]

Average score: 0.9785442990514042


0.9798301486199575

In [35]:
# Accuracy

from sklearn import metrics
accuracy = metrics.accuracy_score(y, forest_predictions, normalize=True, sample_weight=None)
accuracy

0.9976533690915186

# Out of Bag Error on a Random Forest

In [36]:
# model -- the only change is to set the 'oob_score' argument to 'True'
oob_forest = RandomForestClassifier(oob_score = True, n_estimators = 100)

# train
oob_forest.fit(X, y)

# Out of bag score
oob_forest.oob_score_

0.9803329981003464

# Logistic Regression

In [38]:
# Logistic Regression can be validated
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(fit_intercept=False)

clf.fit(X, y)

y_predicted = clf.predict(X)

In [39]:
# Accuracy

from sklearn import metrics
accuracy = metrics.accuracy_score(y, y_predicted, normalize=True, sample_weight=None)
accuracy

0.9786568331657168

# Optimize for Specificity

# Optimize for Sensivity