# Random Forest Classification

## Importing the libraries

In [32]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [33]:
# Load the phishing dataset from a CSV file in the current working directory.
dataset = pd.read_csv('dataset_phishing.csv')

# Features: every column except the first and the last.
# Target:   the last column.
# `.to_numpy()` is used instead of `.values` because it always yields a NumPy
# ndarray (`.values` may return an extension array for some dtypes), and later
# cells call ndarray methods such as `.reshape` on the labels.
X = dataset.iloc[:, 1:-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

## Splitting the dataset into the Training set and Test set

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [35]:
# Inspect the training feature matrix produced by the split (NumPy abbreviates
# large arrays with "..." when printing).
print(X_train)

[[27. 19.  0. ...  0.  0.  1.]
 [64. 17.  0. ...  0.  0.  4.]
 [76. 21.  0. ...  0.  0.  5.]
 ...
 [77.  8.  0. ...  0.  1.  1.]
 [48. 20.  0. ...  0.  1.  3.]
 [47. 22.  0. ...  0.  1.  2.]]


In [36]:
# Inspect the training labels produced by the split.
print(y_train)

['legitimate' 'legitimate' 'legitimate' ... 'phishing' 'phishing'
 'phishing']


In [37]:
# Inspect the held-out test feature matrix produced by the split.
print(X_test)

[[34. 25.  0. ...  0.  0.  4.]
 [54. 12.  0. ...  0.  1.  0.]
 [25. 14.  0. ...  0.  0.  0.]
 ...
 [21. 12.  0. ...  0.  0.  4.]
 [34. 25.  0. ...  0.  0.  2.]
 [34. 17.  0. ...  0.  1.  4.]]


In [38]:
# Inspect the held-out test labels produced by the split.
print(y_test)

['legitimate' 'phishing' 'phishing' ... 'legitimate' 'legitimate'
 'legitimate']


## Feature Scaling

In [39]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply the
# same fitted scaler to both splits so the test set is transformed with the
# training-set parameters.
sc = StandardScaler().fit(X_train)

# NOTE: X_train / X_test are rebound to their scaled versions here, so
# re-running this cell without re-running the split cell would scale data
# that has already been scaled.
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [40]:
# Inspect the training features again; at this point in the notebook X_train
# has been rebound to the output of the StandardScaler.
print(X_train)

[[-0.65322082 -0.20518402 -0.41973804 ... -0.14266997 -1.07388446
  -0.85869309]
 [ 0.06253906 -0.38963553 -0.41973804 ... -0.14266997 -1.07388446
   0.31914729]
 [ 0.2946774  -0.0207325  -0.41973804 ... -0.14266997 -1.07388446
   0.71176076]
 ...
 [ 0.31402226 -1.21966733 -0.41973804 ... -0.14266997  0.93119887
  -0.85869309]
 [-0.24697873 -0.11295826 -0.41973804 ... -0.14266997  0.93119887
  -0.07346617]
 [-0.26632359  0.07149325 -0.41973804 ... -0.14266997  0.93119887
  -0.46607963]]


In [41]:
# Inspect the test features again; at this point in the notebook X_test has
# been rebound to the output of the scaler fitted on the training set.
print(X_test)

[[-0.51780679  0.34817052 -0.41973804 ... -0.14266997 -1.07388446
   0.31914729]
 [-0.13090956 -0.85076431 -0.41973804 ... -0.14266997  0.93119887
  -1.25130656]
 [-0.69191054 -0.66631279 -0.41973804 ... -0.14266997 -1.07388446
  -1.25130656]
 ...
 [-0.76928999 -0.85076431 -0.41973804 ... -0.14266997 -1.07388446
   0.31914729]
 [-0.51780679  0.34817052 -0.41973804 ... -0.14266997 -1.07388446
  -0.46607963]
 [-0.51780679 -0.38963553 -0.41973804 ... -0.14266997  0.93119887
   0.31914729]]


## Training the Random Forest Classification model on the Training set

In [42]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees using the entropy split criterion; random_state is
# fixed so repeated runs build the same forest.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)

# Fit on the training split. Left as the cell's last expression so the
# notebook displays the fitted estimator.
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [43]:
# Predict a label for every row of the test set.
y_pred = classifier.predict(X_test)

# Show predictions next to the true labels: column 0 is the predicted label,
# column 1 is the actual label.
print(np.column_stack((y_pred, y_test)))

[['legitimate' 'legitimate']
 ['phishing' 'phishing']
 ['legitimate' 'phishing']
 ...
 ['legitimate' 'legitimate']
 ['legitimate' 'legitimate']
 ['legitimate' 'legitimate']]


## Making the Confusion Matrix

In [44]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true vs. predicted test labels. Per scikit-learn's
# convention, rows correspond to the true class and columns to the predicted
# class.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test rows classified correctly; kept as the last expression so
# the notebook displays the value.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1387   43]
 [  75 1353]]


0.9587123862841148