In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, OrdinalEncoder
from sklearn.impute import SimpleImputer


In [4]:
# Read the anomaly-probability table from a CSV located in the working directory.
data_1 = pd.read_csv('anomaly_probs.csv')

In [5]:
# Display the column labels of data_1 to see which fields are available.
data_1.columns

Index(['Unnamed: 0', 'day', 'dns_qdomainname', 'num_clients', 'num_responses',
       'anomaly', 'shieldid_count', 'rank', 'num_shieldid', 'num_dns_qname',
       'client_sum_group', 'responses_sum_group', 'shield_sum_group',
       'prob_obs_num_clients', 'prob_obs_num_responses', 'prob_obs_num_shield',
       'total_prob', 'month', '0_x', 'normalized_prob_responses', '0_y',
       'normalized_prob_clients', '0', 'normalized_prob_shields',
       'normalized_total_prob'],
      dtype='object')

In [6]:
# Read the second table from a CSV located in the working directory; it is
# merged with the probability columns of data_1 further down.
data_2 = pd.read_csv('data_small_update.csv')

In [7]:
# Row count of the probability table.
data_1.shape[0]

781111

In [8]:
# Row count of the second table.
data_2.shape[0]

8267214

In [9]:
# Keep only the two merge keys plus the four normalized probability columns.
probs = data_1.loc[:, [
    'day',
    'dns_qdomainname',
    'normalized_prob_responses',
    'normalized_prob_clients',
    'normalized_prob_shields',
    'normalized_total_prob',
]]

In [10]:
# Right join on (day, dns_qdomainname): every row of data_2 is kept, and the
# probability columns are attached wherever the key pair also exists in probs.
data = probs.merge(
    data_2,
    how='right',
    on=['day', 'dns_qdomainname'],
)

In [20]:
#data.to_csv('Reg_data')

## RF Regression Pipeline

In [11]:
# Display the column labels of the merged frame before selecting features.
data.columns

Index(['day', 'dns_qdomainname', 'normalized_prob_responses',
       'normalized_prob_clients', 'normalized_prob_shields',
       'normalized_total_prob', 'Unnamed: 0', 'shieldid', 'dns_qname',
       'num_clients', 'num_responses', 'timefirstseen', 'timelastseen',
       'anomaly', 'shieldid_count', 'rank', 'domain', 'created', 'updated',
       'expires', 'whois', 'timeseen', 'anomaly_2', 'tld', 'mal_tld',
       'days_existed', 'month_firstseen', 'day_firstseen', 'hour_firstseen',
       'month_lastseen', 'day_lastseen', 'hour_lastseen', 'day_created',
       'month_created', 'day_expires', 'month_expires'],
      dtype='object')

In [12]:
# Build the feature matrix X and the regression target y from the merged frame.
#
# `data` is the result of a right merge onto data_2, so any data_2 row whose
# (day, dns_qdomainname) pair is absent from `probs` carries NaN in
# 'normalized_total_prob'. scikit-learn regressors refuse NaN targets, so only
# rows that actually have a target value are used. When every row found a
# match this mask is all-True and X / y are exactly what they were before.
has_target = data['normalized_total_prob'].notna()

# Columns removed from X. This includes every normalized_* probability column
# so that the target (and the components it is presumably derived from --
# TODO confirm) cannot leak into the features.
X = data.loc[has_target].drop(columns=[
    'day', 'anomaly', 'anomaly_2', 'dns_qname', 'dns_qdomainname',
    'timefirstseen', 'timelastseen', 'domain', 'created', 'updated',
    'expires', 'Unnamed: 0',
    'normalized_prob_responses', 'normalized_prob_clients',
    'normalized_prob_shields', 'normalized_total_prob',
])
y = data.loc[has_target, 'normalized_total_prob']

In [13]:
# Display the remaining feature columns; each one should appear in exactly one
# of the numerical / categorical feature lists defined in the next cell.
X.columns

Index(['shieldid', 'num_clients', 'num_responses', 'shieldid_count', 'rank',
       'whois', 'timeseen', 'tld', 'mal_tld', 'days_existed',
       'month_firstseen', 'day_firstseen', 'hour_firstseen', 'month_lastseen',
       'day_lastseen', 'hour_lastseen', 'day_created', 'month_created',
       'day_expires', 'month_expires'],
      dtype='object')

In [14]:
# Columns sent through the numeric preprocessing pipeline.
# The order is significant: it fixes the column order of the transformed matrix.
numerical_feats = [
    'num_clients',
    'num_responses',
    'shieldid_count',
    'rank',
    'timeseen',
    'days_existed',
    'day_firstseen',
    'hour_firstseen',
    'month_firstseen',
    'month_lastseen',
    'day_lastseen',
    'hour_lastseen',
    'day_created',
    'month_created',
    'day_expires',
    'month_expires',
]

# Columns sent through the categorical preprocessing pipeline.
categorical_feats = [
    'shieldid',
    'whois',
    'tld',
    'mal_tld',
]

In [15]:
# Numeric columns: replace missing values with a constant, then standardize.
# No fill_value is given, so SimpleImputer uses its default constant
# (0 for numerical data according to the scikit-learn documentation).
num_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with a constant, then one-hot
# encode. handle_unknown='ignore' makes the encoder emit an all-zero row for a
# category that was not present when it was fitted; with the default ('error')
# any unseen category in the held-out split would make predict()/score() raise.
cat_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [16]:
# Send each feature group through its own preprocessing pipeline. Columns that
# appear in neither list are discarded (remainder='drop').
feat_processor = ColumnTransformer(
    transformers=[
        ('num_pipe', num_processor, numerical_feats),
        ('cat_pipe', cat_processor, categorical_feats),
    ],
    remainder='drop',
)

In [17]:
# Random forest with otherwise default hyper-parameters.
# random_state pins the bootstrap samples and feature sub-sampling so that a
# re-run reproduces the same model; n_jobs=-1 builds the trees on all available
# cores, which does not change the fitted result.
Regressor = RandomForestRegressor(random_state=42, n_jobs=-1)

In [18]:
# Full model: preprocessing first, then the regressor, so the transformers are
# fitted only on whatever data is passed to pipe.fit().
pipe = Pipeline([
    ('feature_processor', feat_processor),
    ('reg', Regressor),
])
pipe

In [19]:
# Hold out part of the data for evaluation (train_test_split's default test
# fraction is used). random_state fixes the shuffle so the same rows land in
# the train and test sets on every run, keeping results comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Fit the preprocessing steps and the random forest on the training split.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# apparently had not finished (or had not been run) when the file was saved.
pipe.fit(X_train,y_train)