In [2]:
import pandas as pd
import numpy as np

In [36]:
data = pd.read_parquet('data/processed/train.parquet')
data['label'].value_counts()

 1    300000
 0    250000
-1    200000
Name: label, dtype: int64

In [37]:
# fill malware class with malicious, where avclass is available
data.loc[(data['avclass']!='') & (data['label']==-1),'label'] = 1
data['label'].value_counts()

 1    396433
 0    250000
-1    103567
Name: label, dtype: int64

In [38]:
malware_df = data[data['label']==1]
malware_df.loc[malware_df['avclass']== '', 'avclass'] = 'unknown'

In [39]:
malware_df['avclass'].value_counts()

xtrat             22284
installmonster    21864
zusy              18597
vtflooder         16132
zbot              14363
                  ...  
killnotes             1
bagif                 1
magnat                1
ciyaamgnrsg           1
predatorthief         1
Name: avclass, Length: 3069, dtype: int64

In [29]:

# get avlcasses with more than 10,000 samples
top_avclasses = malware_df['avclass'].value_counts()
top_avclasses = top_avclasses[top_avclasses>10000]
# change all other avclasses to 'other'
malware_df.loc[:,'avclass'] = malware_df['avclass'].apply(lambda x: x if x in top_avclasses else 'other')
malware_df['avclass'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malware_df.loc[:,'avclass'] = malware_df['avclass'].apply(lambda x: x if x in top_avclasses else 'other')


other             220086
xtrat              22284
installmonster     21864
zusy               18597
vtflooder          16132
zbot               14363
fareit             13701
ramnit             13679
sality             12700
adposhel           11375
unknown            10754
emotet             10720
high               10178
Name: avclass, dtype: int64

In [23]:
NUM_OF_SAMPLES = 80000
NUM_OF_OTHER = 20000
FRAC_OF_AVCLASS = (NUM_OF_SAMPLES - NUM_OF_OTHER) / malware_df[malware_df.avclass!='other'].shape[0]
# get random samples of malicios samples, sampling by avclass
# get 20000 samples of other

other_samples = malware_df[malware_df.avclass=='other'].sample(NUM_OF_OTHER)
sampled_avc_df = malware_df[malware_df.avclass!='other'].groupby('avclass').sample(frac=FRAC_OF_AVCLASS, random_state=42)
# concat the samples
sampled_malware_df = pd.concat([sampled_avc_df, other_samples])
# sample random benign samples
benign_df = data[data['label']==0].sample(NUM_OF_SAMPLES)
# concat the samples
sampled_df = pd.concat([sampled_malware_df, benign_df])
sampled_df.head(3)

Unnamed: 0,sha256,md5,appeared,label,avclass,size,vsize,has_debug,exports,imports,...,has_tls,symbols,numstrings,avlength,printables,entropy,paths,urls,registry,MZ
4445,054af4c048ba0543e3a6d92fc9ff12b2bd576e218da3a2...,88beacae36d147ac22b792392de6b62b,2018-01,1,adposhel,1790976,1810432,0,0,189,...,1,0,4019,7.757402,31177,6.399078,0,0,0,11
76037,32d12d801ead04cbf83e64e0ab5b6520cc11ec145b3939...,3f26757f25fc2c4422d6365ad47b7fc2,2018-04,1,adposhel,1117184,1130496,1,0,209,...,0,0,1084,8.457565,9168,6.257506,0,0,0,4
91675,6c2220de78d4264db38484ec5007da738555e4bbd0a621...,644603d2778c85f1ea3bc700abf3178c,2018-04,1,adposhel,1117184,1130496,1,0,209,...,0,0,1089,8.417815,9167,6.253843,0,0,0,6


In [19]:
sampled_df.to_csv('data/processed/sample_train.csv', index=False)

# Test Sampling

In [21]:
test_df = pd.read_parquet('data/processed/test.parquet')
test_df.label.value_counts()

1    100000
0    100000
Name: label, dtype: int64

In [24]:
test_df_benign = test_df[test_df.label==0]
test_df_malware = test_df[test_df.label==1].sample(20000)
test_df_sampled = pd.concat([test_df_benign, test_df_malware])
test_df_sampled.label.value_counts()

0    100000
1     20000
Name: label, dtype: int64

In [25]:
test_df_sampled.to_csv('data/processed/sample_test.csv', index=False)

In [42]:
test_df.iloc[0]

sha256             163ced46c18ef09d8e2f0ee4b16decf74a533f22ba3b59...
md5                                 c747678f13ec94deffeab0c44481a988
appeared                                                     2018-11
label                                                              1
avclass                                                        xtrat
size                                                          966594
vsize                                                        2072576
has_debug                                                          0
exports                                                            0
imports                                                            2
has_relocations                                                    0
has_resources                                                      1
has_signature                                                      0
has_tls                                                            0
symbols                           