In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
training = pd.read_csv("pulsar_stars.csv")
training.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [20]:
x = training.drop('target_class', axis = 1)
y = training['target_class']

In [4]:
train_train, train_test = train_test_split(training, test_size = 0.3)

train_x = train_train.drop('target_class', axis = 1)
train_y = train_train['target_class']

train_test_x = train_test.drop('target_class', axis = 1)
train_test_y = train_test['target_class'] 

In [5]:
# model
tree = DecisionTreeClassifier(criterion = "entropy")

# train
tree.fit(train_x, train_y)

# predict
tree_train_predictions = tree.predict(train_x)
tree_predictions = tree.predict(train_test_x)

from sklearn import metrics
accuracy = metrics.accuracy_score(train_test_y, tree_predictions, normalize=True, sample_weight=None)
print(accuracy)
pd.DataFrame({'Gain': tree.feature_importances_}, index = train_x.columns).sort_values('Gain', ascending = False)

0.9657355679702049


Unnamed: 0,Gain
Excess kurtosis of the integrated profile,0.781162
Standard deviation of the DM-SNR curve,0.063976
Mean of the integrated profile,0.0363
Skewness of the integrated profile,0.034826
Standard deviation of the integrated profile,0.026145
Mean of the DM-SNR curve,0.024999
Skewness of the DM-SNR curve,0.01808
Excess kurtosis of the DM-SNR curve,0.014512


In [6]:
pd.crosstab(train_test_y, tree_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True)

Predicted:,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4769,102,4871
1,82,417,499
All,4851,519,5370


In [7]:
# model
bag = BaggingClassifier()

# train
bag.fit(train_x, train_y)

# predict
bag_predictions = bag.predict(train_test_x)

# confusion matrix

from sklearn import metrics
accuracy = metrics.accuracy_score(train_test_y, bag_predictions, normalize=True, sample_weight=None)
accuracy

Predicted:     0    1   All
Actual                     
0           4829   42  4871
1             86  413   499
All         4915  455  5370


0.9761638733705773

In [14]:
pd.crosstab(train_test_y, bag_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True)

Predicted:,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4829,42,4871
1,86,413,499
All,4915,455,5370


In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

# model
forest = RandomForestClassifier(n_estimators=100)

# use cross_val_score() to get ths scores for each train/test split
scores = cross_val_score(forest, x, y, cv = 10)
print(scores)
print()
print('Average score:', np.mean(scores))

# use cross_val_predict() to get predictions from each trani/test split if you want to see a confusion matrix
cv_predictions = cross_val_predict(forest, x, y, cv = 10)
# 
# confusion matrix
pd.crosstab(y, cv_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True)

[0.97821229 0.97541899 0.98044693 0.98547486 0.98379888 0.98212291
 0.97374302 0.97932961 0.98435754 0.97203579]

Average score: 0.9794940822116406


Predicted:,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,16155,104,16259
1,263,1376,1639
All,16418,1480,17898


In [24]:
mean_cv_predictions = cv_predictions.mean()

In [25]:
# confusion matrix
pd.crosstab(y, mean_cv_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True)

Predicted:,0.08269080344172533,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,16259,16259
1,1639,1639
All,17898,17898


In [16]:
from sklearn import metrics
oob_forest = RandomForestClassifier(n_estimators = 83, criterion = 'entropy', oob_score = True,)

# train
oob_forest.fit(train_x, train_y)

# predict
forest_predictions = oob_forest.predict(train_test_x)

# feature importances
print(pd.DataFrame({'Importance': oob_forest.feature_importances_}, index = train_x.columns).sort_values('Importance', ascending = False))


# Out of bag score
print(oob_forest.oob_score_)

accuracy = metrics.accuracy_score(train_test_y, forest_predictions, normalize=True, sample_weight=None)
print(accuracy)


                                               Importance
 Excess kurtosis of the integrated profile       0.356480
 Mean of the integrated profile                  0.190171
 Skewness of the integrated profile              0.161771
 Standard deviation of the DM-SNR curve          0.074647
 Mean of the DM-SNR curve                        0.070882
 Skewness of the DM-SNR curve                    0.055130
 Excess kurtosis of the DM-SNR curve             0.045596
 Standard deviation of the integrated profile    0.045323
0.980683269476373
0.9776536312849162


In [22]:
# confusion matrix
pd.crosstab(train_test_y, forest_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True)

Predicted:,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4825,46,4871
1,74,425,499
All,4899,471,5370


In [18]:
# feature importances
pd.DataFrame({'Importance': oob_forest.feature_importances_}, index = train_x.columns).sort_values('Importance', ascending = False)

Unnamed: 0,Importance
Excess kurtosis of the integrated profile,0.35648
Mean of the integrated profile,0.190171
Skewness of the integrated profile,0.161771
Standard deviation of the DM-SNR curve,0.074647
Mean of the DM-SNR curve,0.070882
Skewness of the DM-SNR curve,0.05513
Excess kurtosis of the DM-SNR curve,0.045596
Standard deviation of the integrated profile,0.045323


In [10]:
# scores = []
# for i in range(1, 101):
#     # model -- the only change is to set the 'oob_score' argument to 'True'
#     oob_forest = RandomForestClassifier(n_estimators = i, criterion = 'entropy', oob_score = True, )

#     # train
#     oob_forest.fit(x, y)

#     # Out of bag score
#     scores.append(oob_forest.oob_score_)

# scores = pd.Series(scores, index = range(1, 101))

In [11]:
# plt.plot(range(1, 101), scores)
# plt.xlabel('num estimators')
# plt.ylabel('out of bag score')
# plt.title('optimal number of estimators = ' + '83') #str(scores.idxmax()))
# plt.show()

In [12]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(fit_intercept=False)

clf.fit(train_x, train_y)

y_predicted = clf.predict(train_test_x)
from sklearn import metrics
accuracy = metrics.accuracy_score(train_test_y, y_predicted, normalize=True, sample_weight=None)
accuracy

0.9757914338919925

In [13]:
pd.crosstab(train_test_y, y_predicted, rownames=['Actual'], colnames = ['Predicted:'], margins = True)

Predicted:,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4831,40,4871
1,90,409,499
All,4921,449,5370
