In [1]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import learning_curve, train_test_split

In [None]:
# Load the dataset from the CSV file in the working directory.
# index_col=False tells pandas not to use the first column as the index.
csv_path = 'Phishing_Legitimate_full.csv'
df = pd.read_csv(csv_path, index_col=False)
df.head()

In [None]:
# Inspect the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Feature selection: keep the 30 columns with the highest ANOVA F-score
# against the class label.
X = df.drop(['CLASS_LABEL','id'], axis=1)
y = df['CLASS_LABEL']

# Bind the selector to its own name. Rebinding `f_classif` would shadow the
# imported scoring function, so running this cell a second time would pass a
# SelectKBest instance as `score_func` and fail.
selector = SelectKBest(score_func=f_classif, k=30)
features = selector.fit_transform(X, y)
cols = selector.get_support(indices=True)

# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split, so held-out rows influence which features are kept.
# Consider fitting it on the training split only.

# `cols` holds positions within X (which lacks 'id' and 'CLASS_LABEL'), so
# index X rather than df to show the columns that were actually selected.
X.iloc[:, cols]

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(features, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Train a seeded Random Forest on the training split (fit returns the
# estimator itself), then predict labels for the held-out split.
modelo = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = modelo.predict(X_test)

In [None]:
# Results: per-class precision, recall and F1 on the held-out split.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Learning curve: 3-fold cross-validated accuracy as a function of the number
# of training samples. `plt` and `learning_curve` are imported in the imports
# cell at the top of the notebook.
# The estimator uses the same seed as the model trained earlier so that
# re-running the cell reproduces the same curve.
train_sizes, train_scores, test_scores = learning_curve(
    RandomForestClassifier(random_state=42), X_train, y_train,
    cv=3, scoring='accuracy', n_jobs=-1,
    train_sizes=np.linspace(0.01,1,50), verbose=1)

# Aggregate across the CV folds (axis=1) for each training size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_mean, label='Training Score')
ax.plot(train_sizes, test_mean, label='Cross-validation Score')

# Shaded bands show +/- one standard deviation across folds.
ax.fill_between(train_sizes, train_mean-train_std, train_mean+train_std, color='#DDDDDD')
ax.fill_between(train_sizes, test_mean-test_std, test_mean+test_std, color='#DDDDDD')

ax.set_title('Learning Curve')
ax.set_xlabel('Training Size')
ax.set_ylabel('Accuracy Score')
ax.legend(loc = 'lower right');