In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
%matplotlib inline
# Evaluations
from sklearn.metrics import classification_report,confusion_matrix
# Random Forest
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Read the pulsar-star CSV from the Kaggle input directory into a DataFrame.
pulsar_Data = pd.read_csv('/kaggle/input/predicting-a-pulsar-star/pulsar_stars.csv')

In [None]:
# Display the first rows of the loaded data as a quick sanity check.
pulsar_Data.head()

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
pulsar_Data.info()

In [None]:
# For each column, report whether it contains any missing value.
pulsar_Data.isnull().any()

In [None]:
# Correlation plot: annotated heatmap of the pairwise column correlations.
fig, ax = plt.subplots(figsize=(12, 5))
sns.heatmap(pulsar_Data.corr(), annot=True, ax=ax)

In [None]:
# Pairwise plots of all columns, coloured by the value of target_class.
sns.pairplot(pulsar_Data,hue='target_class')

In [None]:
# Count plot of target_class: one bar per class, height = number of rows in that class.
sns.countplot(x='target_class',data=pulsar_Data)


In [None]:
# Exact number of rows for each target_class value.
pulsar_Data['target_class'].value_counts()


In [None]:
# Creating a data frame of features by dropping the last column (assumed to be the target).

df=pulsar_Data.copy() # deep copy, so pulsar_Data itself is left untouched

def data_prep(df):
    """Return a new frame holding every column of ``df`` except the last one.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; its last column is left out of the result.

    Returns
    -------
    pandas.DataFrame
        Frame with the same rows as ``df`` and all but its last column.
    """
    kept_columns = df.columns[:-1]
    return df.reindex(columns=kept_columns)

In [None]:
# Build the feature frame (all columns of df except the last) and preview it.
df_features=data_prep(df)
df_features.head()

In [None]:
# Splitting the data into train and test sets for the model.
X = df_features.copy()
y = df['target_class'].copy()
# A fixed random_state gives the same split on every run; stratify=y keeps the
# proportion of each target_class value the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=22, stratify=y
)

In [None]:
# Standardising

def standardScaling(feature, scaler=None):
    """Standardise ``feature`` and return the result as a DataFrame.

    Parameters
    ----------
    feature : pandas.DataFrame
        Feature values to scale.
    scaler : fitted scaler, optional
        When given, it is only used to transform ``feature`` (no refitting), so
        data can be scaled with statistics learned elsewhere, e.g. on the
        training set. When omitted, a new StandardScaler is fitted on
        ``feature`` itself, which is the original behaviour of this function.

    Returns
    -------
    pandas.DataFrame
        Scaled values with a fresh default integer index.
    """
    if scaler is None:
        scaler = StandardScaler().fit(feature)
    scaled_feature = scaler.transform(feature)
    # Prefer the input's own column names; fall back to the notebook-level
    # feature frame when the input carries none (e.g. a plain array).
    columns = feature.columns if hasattr(feature, "columns") else df_features.columns
    return pd.DataFrame(data=scaled_feature, columns=columns)

# Fit the scaler on the training data only and reuse it for the test data, so
# both sets are expressed in the same scale and no test statistics are used.
train_scaler = StandardScaler().fit(X_train)
X_train_scaled = standardScaling(X_train, scaler=train_scaler)
X_test_scaled = standardScaling(X_test, scaler=train_scaler)

# To avoid index mismatch error, reset the index of the target class
# (the scaled feature frames above have a fresh 0..n-1 index).

y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Applying Random Forest

To find good hyperparameters across a wide range of candidate values, we can randomly sample combinations of parameter values and score each combination with K-fold cross-validation.

In [None]:
# Instantiate a random forest with a fixed seed so its parameters can be listed.
rf = RandomForestClassifier(random_state=22)
from pprint import pprint

# printing out the parameters of the RFC (names and current values)

pprint(rf.get_params())

Creating a parameter grid to use with RandomizedSearchCV. Here we randomly sample from a wide range of values.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]

# Number of features to consider at every split.
# 'auto' was removed in scikit-learn 1.3 and was identical to 'sqrt' for
# classifiers, so it only duplicated a candidate; 'log2' is searched instead.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree (None = no depth limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid: hyperparameter name -> list of candidate values
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Seed the base estimator as well: random_state on the search only fixes which
# parameter combinations are sampled, not the randomness inside each forest.
rf = RandomForestClassifier(random_state=22)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=22,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(X_train_scaled, y_train)

In [None]:
# Best parameter combination found by the randomized search fitted above.
rf_random.best_params_

From the above result, we can narrow down the range of values for each hyperparameter. We can now search exhaustively around the values indicated by the random search, using GridSearchCV.

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search
param_grid_1 = {
    'bootstrap': [True],
    'max_depth': [35,55,70],
    'max_features': [2, 3],
    'min_samples_leaf': [1,3],
    'min_samples_split': [8,12,15],
    'n_estimators': [ 250,320,750]
}

# Seeded base estimator so repeated runs of the grid search are comparable.
rf = RandomForestClassifier(random_state=22)
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid_1,cv = 3, n_jobs = -1, verbose = 2)
# Fit the grid search itself; the original re-fitted `rf_random`, so the grid
# defined above was never evaluated.
grid_search.fit(X_train_scaled, y_train)

In [None]:
# NOTE(review): `rf_random` is the RandomizedSearchCV object, so this shows the
# random-search result. If the grid-search result is what is wanted here, this
# should probably read `grid_search.best_params_` once `grid_search` has been
# fitted — confirm.
rf_random.best_params_

Now we can plug the selected parameter values into the model and use it for prediction.

# Prediction and Evaluation

In [None]:
# Final model built from hard-coded hyperparameter values.
rfc = RandomForestClassifier(
    n_estimators=1788,
    min_samples_split=10,
    min_samples_leaf=1,
    # 'auto' meant sqrt(n_features) for classifiers and was removed in
    # scikit-learn 1.3; 'sqrt' is the equivalent, still-supported spelling.
    max_features='sqrt',
    max_depth=110,
    bootstrap=True,
    # Fixed seed so the fitted forest and its predictions are repeatable.
    random_state=22,
)

rfc.fit(X_train_scaled, y_train)
# Hard class predictions for the held-out test features.
rfc_predict = rfc.predict(X_test_scaled)

Evaluation

In [None]:


from sklearn.metrics import confusion_matrix

# confusion_matrix puts true labels on rows; plotting the transpose puts the
# true labels on the x-axis and the predictions on the y-axis.
mat = confusion_matrix(y_test, rfc_predict)
ax = sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label');



In [None]:
from sklearn import metrics
# classification_report expects (y_true, y_pred); with the arguments reversed,
# precision and recall are reported swapped for each class.
print(metrics.classification_report(y_test, rfc_predict))

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated ROC AUC of the final model on the full feature set.
rfc_cv_score = cross_val_score(estimator=rfc, X=X, y=y, scoring='roc_auc', cv=10)

# Per-fold scores first, then their mean.
print("=== All AUC Scores ===", rfc_cv_score, sep='\n')
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

# Precision-Recall Curve

Precision and recall can be calculated across decision thresholds using the precision_recall_curve() function, which takes the true output values and the predicted probabilities for the positive class as input and returns the precision, recall and threshold values.

In [None]:
from sklearn.metrics import precision_recall_curve

# Predicted class probabilities for the test set (one column per class).
ypred = rfc.predict_proba(X_test_scaled)

# keep probabilities for the positive outcome only
# (column 1 — assumes the classes are ordered [0, 1]; confirm via rfc.classes_)
probs = ypred[:, 1]

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt

# calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, probs)

# calculate F1 score (from the hard class predictions)
f1 = f1_score(y_test, rfc_predict)

# calculate precision-recall AUC
# Stored as `pr_auc` so the imported auc() function is not overwritten;
# assigning to `auc` made this cell fail when run a second time.
pr_auc = auc(recall, precision)

# calculate average precision score
ap = average_precision_score(y_test, probs)
print('f1=%.3f auc=%.3f ap=%.3f' % (f1, pr_auc, ap))

# plot no skill: a classifier with no skill has precision equal to the share
# of positive samples, which is not 0.5 unless the classes are balanced
no_skill = (y_test == 1).mean()
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='no skill')
# plot the precision-recall curve for the model, labelled with the computed AUC
plt.plot(recall, precision, marker='.', label='auc=%.3f' % pr_auc)
# show the plot
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='best')
plt.show()

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold on one figure.

    Parameters
    ----------
    precisions, recalls : sequences of scores; the last element of each is
        dropped before plotting so the rest is paired with ``thresholds``.
    thresholds : sequence of decision thresholds used as the x-axis.
    """
    plt.figure(figsize=(8, 8))
    plt.title("Precision and Recall Scores as a function of the decision threshold")

    # (scores, line format, legend label) for each curve, drawn in this order.
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for scores, line_format, legend_label in curves:
        plt.plot(thresholds, scores[:-1], line_format, label=legend_label)

    plt.ylabel("Score")
    plt.xlabel("Decision Threshold")
    plt.legend(loc='best')

### Another way to view the trade-off between precision and recall is to plot them together as a function of the decision threshold. This helps in tuning the threshold until there are 0 false negatives.

In [None]:
# Draw precision and recall against the thresholds returned by precision_recall_curve.
plot_precision_recall_vs_threshold(precisions=precision, recalls=recall, thresholds=thresholds)