## Import the data

Use Pandas to import the data and prepare it for the models

In [4]:
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import ensemble
from sklearn.model_selection import cross_val_score


df = pd.read_csv('Dataset/dataset_phishing.csv')

#print null
print(df.isnull().sum())

#remove all null
df = df.dropna()

#describe statistical data. To stdout
with pd.option_context('display.max_columns', 40):
    print(df.describe(include='all'))

#describe statistical data to text file: out.txt
with open('out.txt', 'w') as f:
    with pd.option_context('display.max_columns', 40):
        print(df.describe(include='all'),file=f)
        
#shape of data
print('Number of rows are',df.shape[0], 'and number of columns are ',df.shape[1])

#look at data types of columns
print(df.info())

#Pair plot (takes way too long, we need to decide on important features first)
# plt.figure(figsize=(20,20))
# sns.pairplot(df)
# plt.savefig('./Figures/pairplot.png')
# plt.show()

#encode last column 
dummy_data = pd.get_dummies(df, columns = ['status'])


#Get X and y 
X = dummy_data.iloc[:,1:-1].values
y = dummy_data.iloc[:,-1].values

#try printing last column to make sure it's binary
print(y)
print(X)
print(X.shape, y.shape)

#Doing test_train split for now, later cross validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

print(f"X_train shape = {X_train.shape}, X_test shape = {X_test.shape}")
print(f"y_train shape = {y_train.shape}, y_test shape = {y_test.shape}")

url                0
length_url         0
length_hostname    0
ip                 0
nb_dots            0
                  ..
web_traffic        0
dns_record         0
google_index       0
page_rank          0
status             0
Length: 89, dtype: int64
                                                      url    length_url  \
count                                               11430  11430.000000   
unique                                              11429           NaN   
top     http://e710z0ear.du.r.appspot.com/c:/users/use...           NaN   
freq                                                    2           NaN   
mean                                                  NaN     61.126684   
std                                                   NaN     55.297318   
min                                                   NaN     12.000000   
25%                                                   NaN     33.000000   
50%                                                   NaN     47.0000

Number of rows are 11430 and number of columns are  89
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11430 entries, 0 to 11429
Data columns (total 89 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   url                         11430 non-null  object 
 1   length_url                  11430 non-null  int64  
 2   length_hostname             11430 non-null  int64  
 3   ip                          11430 non-null  int64  
 4   nb_dots                     11430 non-null  int64  
 5   nb_hyphens                  11430 non-null  int64  
 6   nb_at                       11430 non-null  int64  
 7   nb_qm                       11430 non-null  int64  
 8   nb_and                      11430 non-null  int64  
 9   nb_or                       11430 non-null  int64  
 10  nb_eq                       11430 non-null  int64  
 11  nb_underscore               11430 non-null  int64  
 12  nb_tilde                    11430

## Random Forest Model

In [2]:


#Feature scaling. Do we need this??
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


#Cross Validation (later)




#Random Forest


estimators = [2,5,10,100,1000,5000]
max_leaf = [5,10,20,50,100,1000]
random_state = [0,10,20,30,42]
bestModel = 0
bestScore = 0
bestParams = [0,0,0]


#will probably want to do some learning curves here 
for e in estimators:
    for m in max_leaf:
        for r in random_state:
            model = ensemble.RandomForestClassifier(n_estimators=e, max_leaf_nodes=m, random_state=r)
            model.fit(X_train,y_train.flatten())
            # it makes predictions using X_test under the hood and uses those predictions to calculate accuracy score
            score = model.score(X_test,y_test)
            print(f"Model: n_estimators = {e}, max_leaf = {m} , random = {r}, Score = {score}")
            if score > bestScore:
                bestModel = model
                bestScore = score
                bestParams = [e,m,r]
                
            

print(f"Best Model: n_estimators = {bestParams[0]}, max_leaf = {bestParams[1]} , random = {bestParams[2]}, Score = {bestScore}")


url                0
length_url         0
length_hostname    0
ip                 0
nb_dots            0
                  ..
web_traffic        0
dns_record         0
google_index       0
page_rank          0
status             0
Length: 89, dtype: int64
                                                      url    length_url  \
count                                               11430  11430.000000   
unique                                              11429           NaN   
top     http://e710z0ear.du.r.appspot.com/c:/users/use...           NaN   
freq                                                    2           NaN   
mean                                                  NaN     61.126684   
std                                                   NaN     55.297318   
min                                                   NaN     12.000000   
25%                                                   NaN     33.000000   
50%                                                   NaN     47.0000

Number of rows are 11430 and number of columns are  89
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11430 entries, 0 to 11429
Data columns (total 89 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   url                         11430 non-null  object 
 1   length_url                  11430 non-null  int64  
 2   length_hostname             11430 non-null  int64  
 3   ip                          11430 non-null  int64  
 4   nb_dots                     11430 non-null  int64  
 5   nb_hyphens                  11430 non-null  int64  
 6   nb_at                       11430 non-null  int64  
 7   nb_qm                       11430 non-null  int64  
 8   nb_and                      11430 non-null  int64  
 9   nb_or                       11430 non-null  int64  
 10  nb_eq                       11430 non-null  int64  
 11  nb_underscore               11430 non-null  int64  
 12  nb_tilde                    11430

Model: n_estimators = 5, max_leaf = 5 , random = 10, Score = 0.9070428696412949
Model: n_estimators = 5, max_leaf = 5 , random = 20, Score = 0.9017935258092739
Model: n_estimators = 5, max_leaf = 5 , random = 30, Score = 0.9825021872265967
Model: n_estimators = 5, max_leaf = 5 , random = 42, Score = 0.9897200349956256
Model: n_estimators = 5, max_leaf = 10 , random = 0, Score = 0.9864391951006124
Model: n_estimators = 5, max_leaf = 10 , random = 10, Score = 0.9321959755030621
Model: n_estimators = 5, max_leaf = 10 , random = 20, Score = 0.9639107611548556
Model: n_estimators = 5, max_leaf = 10 , random = 30, Score = 0.9932195975503062
Model: n_estimators = 5, max_leaf = 10 , random = 42, Score = 0.992344706911636
Model: n_estimators = 5, max_leaf = 20 , random = 0, Score = 0.9886264216972879
Model: n_estimators = 5, max_leaf = 20 , random = 10, Score = 0.9702537182852143
Model: n_estimators = 5, max_leaf = 20 , random = 20, Score = 0.973753280839895
Model: n_estimators = 5, max_leaf = 

Model: n_estimators = 1000, max_leaf = 50 , random = 10, Score = 1.0
Model: n_estimators = 1000, max_leaf = 50 , random = 20, Score = 1.0
Model: n_estimators = 1000, max_leaf = 50 , random = 30, Score = 1.0
Model: n_estimators = 1000, max_leaf = 50 , random = 42, Score = 1.0
Model: n_estimators = 1000, max_leaf = 100 , random = 0, Score = 1.0
Model: n_estimators = 1000, max_leaf = 100 , random = 10, Score = 1.0
Model: n_estimators = 1000, max_leaf = 100 , random = 20, Score = 1.0
Model: n_estimators = 1000, max_leaf = 100 , random = 30, Score = 1.0
Model: n_estimators = 1000, max_leaf = 100 , random = 42, Score = 1.0
Model: n_estimators = 1000, max_leaf = 1000 , random = 0, Score = 1.0
Model: n_estimators = 1000, max_leaf = 1000 , random = 10, Score = 1.0
Model: n_estimators = 1000, max_leaf = 1000 , random = 20, Score = 1.0
Model: n_estimators = 1000, max_leaf = 1000 , random = 30, Score = 1.0
Model: n_estimators = 1000, max_leaf = 1000 , random = 42, Score = 1.0
Model: n_estimators =

## Cross Validation on Random Forest Model

In [7]:
for e in estimators[:-2]:
    for m in max_leaf[:-2]:
        for r in random_state[:-2]:
            model = ensemble.RandomForestClassifier(n_estimators=e, max_leaf_nodes=m, random_state=r)
            model.fit(X_train,y_train.flatten())
            # it makes predictions using X_test under the hood and uses those predictions to calculate accuracy score
            score = cross_val_score(model, X, y.flatten(), cv=5)
            print(f"model - estimators = {e}; max_leaf = {m}; random_state = {r}:{score}")

model - estimators = 2; max_leaf = 5; random_state = 0:[1. 1. 1. 1. 1.]
model - estimators = 2; max_leaf = 5; random_state = 10:[0.90069991 0.88276465 0.89151356 0.88363955 0.88101487]
model - estimators = 2; max_leaf = 5; random_state = 20:[0.88232721 0.93088364 0.93307087 0.89720035 0.8368329 ]
model - estimators = 2; max_leaf = 10; random_state = 0:[1. 1. 1. 1. 1.]
model - estimators = 2; max_leaf = 10; random_state = 10:[0.92300962 0.91338583 0.94006999 0.93832021 0.95844269]
model - estimators = 2; max_leaf = 10; random_state = 20:[0.9208224  0.97331584 0.96631671 0.95713036 0.95800525]
model - estimators = 2; max_leaf = 20; random_state = 0:[1.         0.99912511 1.         1.         1.        ]
model - estimators = 2; max_leaf = 20; random_state = 10:[0.93657043 0.93569554 0.9536308  0.97812773 0.98775153]
model - estimators = 2; max_leaf = 20; random_state = 20:[0.9720035  0.98950131 0.98293963 0.96675416 0.96675416]
model - estimators = 2; max_leaf = 50; random_state = 0:[1. 