In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, precision_score, fbeta_score
from sklearn.ensemble import RandomForestClassifier

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

**Data Understanding**

In [None]:
# Load the pulsar candidate table from the Kaggle input directory.
pulsar_detection_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/predicting-a-pulsar-star/pulsar_stars.csv'
)

In [None]:
# Preview the first five rows to see the column names and value scales.
pulsar_detection_data.head(n=5)

In [None]:
# Class balance: number of rows per value of the `target_class` label
# (1 = pulsar, 0 = no pulsar).
pulsar_detection_data.loc[:, 'target_class'].value_counts()

In [None]:
# Data type pandas inferred for every column (features and target)
pulsar_detection_data.dtypes

In [None]:
# Count missing values per column (`isna` is the canonical name of `isnull`).
pulsar_detection_data.isna().sum()

In [None]:
# Mean of the integrated profile for the pulsar rows only, re-indexed from 0.
# The leading space in the column name is part of the CSV header.
pulsar_detection_data.loc[
    pulsar_detection_data['target_class'] == 1,
    ' Mean of the integrated profile',
].reset_index(drop=True)

In [None]:
# Side-by-side histograms of the mean integrated profile, one panel per class.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
for panel, (class_value, panel_title) in zip(ax, [(1, 'Pulsar'), (0, 'No Pulsar')]):
    in_class = pulsar_detection_data['target_class'] == class_value
    # The row index does not influence a histogram, so no re-indexing is needed.
    pulsar_detection_data.loc[in_class, ' Mean of the integrated profile'].plot.hist(
        ax=panel, title=panel_title, bins=100
    )

In [None]:
# Separate the feature matrix (every column except the label) from the label.
X = pulsar_detection_data.drop(columns=['target_class'])
y = pulsar_detection_data.loc[:, 'target_class']

In [None]:
# Stratified 70/30 hold-out split, so both parts keep the class ratio of `y`.
# random_state is pinned: without it the rows assigned to each part, and so
# every metric reported below, change on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# Report the size and class balance of the training part, then the test part.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape)
    print(labels.value_counts())

In [None]:
# Baseline: a small random forest without any hyper-parameter tuning.
# random_state is pinned because bootstrap sampling and per-split feature
# selection are random; unseeded, the predictions differ from run to run.
model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
def model_evaluation(y_test, y_pred):
    """Print confusion-matrix counts and summary metrics for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted labels, using the same encoding as ``y_test``.

    Returns
    -------
    None
        Every result is written to stdout.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, inputs
    # that contain only one class give a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    print('True positives: ', tp)
    print('True negatives: ', tn)
    print('False positives: ', fp)
    print('False negatives: ', fn)
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Precision: ", precision_score(y_test, y_pred))
    print("Recall: ", recall_score(y_test, y_pred))
    print("f1_score: ", f1_score(y_test, y_pred))
    # beta < 1 weights precision more heavily than recall.
    print("fbeta with beta of 0.5: ", fbeta_score(y_test, y_pred, beta=0.5))


In [None]:
# Score the untuned baseline forest on the held-out test part.
model_evaluation(y_test=y_test, y_pred=y_pred)

In [None]:
# Hyper-parameter grid for the random-forest grid search.
# Other forest options (min_weight_fraction_leaf, max_leaf_nodes,
# min_impurity_decrease, oob_score) are not tuned here and keep their defaults.
parameters = {
    'n_estimators': [10, 20, 50],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 1, 3, 5],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    # 'auto' is left out: for classifiers it was an alias of 'sqrt', so it only
    # repeated the same fits, and scikit-learn >= 1.3 rejects it.
    'max_features': ['sqrt', 'log2', None],
    'n_jobs': [-1],  # single value, not tuned: use all cores for each fit
    'class_weight': [None, 'balanced', 'balanced_subsample'],
}

In [None]:
# Exhaustive 3-fold cross-validated search over every combination in `parameters`.
# The forest is seeded so the cross-validation scores, and therefore the
# selected configuration, are the same on every run.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# classifier's default score (accuracy). Consider a metric such as 'f1' if
# pulsar detection quality on the minority class is what matters.
model_tuned = GridSearchCV(RandomForestClassifier(random_state=42), parameters, cv=3)
model_tuned.fit(X_train, y_train)
# With the default refit=True, predict() uses the best configuration refitted
# on the whole training part.
y_pred_tuned = model_tuned.predict(X_test)