In [39]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import neighbors
from sklearn import metrics
from sklearn import tree
from sklearn.tree import export_graphviz
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, roc_auc_score

import statsmodels.api as sm

In [40]:
pd.set_option("display.max_rows", 50)

# Dropping the id column is mandatory: per the original author's note, the model
# keys its predictions on this column and then reports metrics equal to 1.
df = pd.read_csv('Phishing_Legitimate_full.csv').drop(columns=['id'])

In [41]:
# Start from every column name, then take out the dependent (target) variable
features = df.columns.tolist()
del features[features.index('CLASS_LABEL')]
features

['NumDots',
 'SubdomainLevel',
 'PathLevel',
 'UrlLength',
 'NumDash',
 'NumDashInHostname',
 'AtSymbol',
 'TildeSymbol',
 'NumUnderscore',
 'NumPercent',
 'NumQueryComponents',
 'NumAmpersand',
 'NumHash',
 'NumNumericChars',
 'NoHttps',
 'RandomString',
 'IpAddress',
 'DomainInSubdomains',
 'DomainInPaths',
 'HttpsInHostname',
 'HostnameLength',
 'PathLength',
 'QueryLength',
 'DoubleSlashInPath',
 'NumSensitiveWords',
 'EmbeddedBrandName',
 'PctExtHyperlinks',
 'PctExtResourceUrls',
 'ExtFavicon',
 'InsecureForms',
 'RelativeFormAction',
 'ExtFormAction',
 'AbnormalFormAction',
 'PctNullSelfRedirectHyperlinks',
 'FrequentDomainNameMismatch',
 'FakeLinkInStatusBar',
 'RightClickDisabled',
 'PopUpWindow',
 'SubmitInfoToEmail',
 'IframeOrFrame',
 'MissingTitle',
 'ImagesOnlyInForm',
 'SubdomainLevelRT',
 'UrlLengthRT',
 'PctExtResourceUrlsRT',
 'AbnormalExtFormActionR',
 'ExtMetaScriptLinkRT',
 'PctExtNullSelfRedirectHyperlinksRT']

In [42]:
# Right-hand side of the model formula: all explanatory variables joined by "+"
term_separator = "+"
rhs = term_separator.join(features)
rhs

'NumDots+SubdomainLevel+PathLevel+UrlLength+NumDash+NumDashInHostname+AtSymbol+TildeSymbol+NumUnderscore+NumPercent+NumQueryComponents+NumAmpersand+NumHash+NumNumericChars+NoHttps+RandomString+IpAddress+DomainInSubdomains+DomainInPaths+HttpsInHostname+HostnameLength+PathLength+QueryLength+DoubleSlashInPath+NumSensitiveWords+EmbeddedBrandName+PctExtHyperlinks+PctExtResourceUrls+ExtFavicon+InsecureForms+RelativeFormAction+ExtFormAction+AbnormalFormAction+PctNullSelfRedirectHyperlinks+FrequentDomainNameMismatch+FakeLinkInStatusBar+RightClickDisabled+PopUpWindow+SubmitInfoToEmail+IframeOrFrame+MissingTitle+ImagesOnlyInForm+SubdomainLevelRT+UrlLengthRT+PctExtResourceUrlsRT+AbnormalExtFormActionR+ExtMetaScriptLinkRT+PctExtNullSelfRedirectHyperlinksRT'

In [43]:
# Cross-validation wrapper for the GLM model
def CVTest(nFolds = 5, randomState=2020, debug=False, data=None, formula=None,
           target="CLASS_LABEL"):
    """Cross-validate a binomial GLM with shuffled K-fold and score folds by ROC AUC.

    Parameters
    ----------
    nFolds : int
        Number of folds, forwarded to ``KFold(n_splits=...)``.
    randomState : int
        Seed forwarded to ``KFold(random_state=...)``.
    debug : bool
        When true, print the train and validation AUC after every fold.
    data : pandas.DataFrame, optional
        Frame to cross-validate on. When omitted, the notebook-level ``df``
        is used (the original behaviour).
    formula : str, optional
        Full model formula. When omitted it is built as
        ``target + " ~ " + rhs`` from the notebook-level ``rhs``
        (the original behaviour).
    target : str
        Name of the label column in ``data``; rows equal to 1 are treated as
        the positive class when computing AUC. It must match the response
        used in ``formula`` when a custom formula is supplied.

    Returns
    -------
    tuple of four lists, one entry per fold:
        trainResults  - AUC on the training part of the fold,
        testResults   - AUC on the held-out part of the fold,
        predictions   - list of predicted values for the held-out rows,
        indices       - index labels of the held-out rows in ``data``.
    """
    # Fall back to the notebook globals so existing calls keep working unchanged
    if data is None:
        data = df
    if formula is None:
        formula = target + " ~ " + rhs

    kf = KFold(n_splits=nFolds, shuffle=True, random_state=randomState)

    # Binary ground truth, computed once instead of once per fold
    isPositive = data[target] == 1

    # Per-fold result accumulators
    testResults = []
    trainResults = []
    predictions = []
    indices = []

    for train, test in kf.split(data.index.values):
        trainData = data.iloc[train]
        testData = data.iloc[test]

        # Fit the GLM on the training part of the fold
        mod = sm.GLM.from_formula(formula=formula,
                                  data=trainData, family=sm.families.Binomial())
        res = mod.fit()
        # predict() with no argument is evaluated on the data the model was fit on
        predsTrain = res.predict()
        preds = res.predict(testData)

        # Keep the held-out predictions for this fold ...
        predictions.append(preds.tolist())

        # ... together with their index labels in the original data frame
        indices.append(testData.index.tolist())

        # Fit quality on both parts of the fold
        trainScore = roc_auc_score(isPositive.iloc[train], predsTrain)
        testScore = roc_auc_score(isPositive.iloc[test], preds)

        # Store the per-fold scores
        trainResults.append(trainScore)
        testResults.append(testScore)

        # Optional progress report for each fold
        if debug:
            print("Train AUC:", trainScore,
                  "Valid AUC:", testScore)

    return trainResults, testResults, predictions, indices

In [44]:
# Run the cross-validation and keep the logit model's results in a summary dict
trainResults, testResults, predictions, indices = CVTest(nFolds = 5, randomState=2020)
# Mean AUC over folds: training first, then validation
print(*(np.mean(foldScores) for foldScores in (trainResults, testResults)))
modelLogit = dict(
    name="Logit",
    description="Model Logit ",
    specification='nFolds= 5 randomState = 2020',
    # Shallow copies so later changes to the working lists do not alter the record
    trainResults=list(trainResults),
    testResults=list(testResults),
    predictions=list(predictions),
    indices=list(indices),
)


0.9855539816948031 0.9838801178113986
