In [1]:
%pylab inline
import os
from collections import Counter

import pandas

Populating the interactive namespace from numpy and matplotlib


### Read in the whole dataset:

In [2]:
# Load the feature table and report its size (rows are reported as networks,
# columns as features) before previewing the first few rows.
df = pandas.read_csv("../CND-features.csv")
n_networks, n_columns = df.shape
print(n_networks, "network x", n_columns, "feature")
df.head()

4729 network x 210 feature


Unnamed: 0,Id,AverageDegreeConnectivity_max,AverageDegreeConnectivity_max-1,AverageDegreeConnectivity_min,AverageDegreeConnectivity_min-1,AverageDegreeConnectivity_mom1,AverageDegreeConnectivity_mom2,AverageDegreeConnectivity_mom3,AverageDegreeConnectivity_mom4,BetweenessNodeCentrality_max,...,RichClubCoef_max,RichClubCoef_max-1,RichClubCoef_min,RichClubCoef_min-1,RichClubCoef_mom1,RichClubCoef_mom2,RichClubCoef_mom3,RichClubCoef_mom4,ShanonEntropy,CND
0,Norwegian_Board_of_Directors_net1mode_2007-12-01,15.296296,13.625,4.0,6.6,0.621266,0.407503,0.281137,0.203637,86807.878666,...,1.0,1.0,0.008294,0.008311,0.172207,0.082552,0.065968,0.062088,2.476406,affiliation
1,Norwegian_Board_of_Directors_net1mode_2006-08-01,17.728395,17.545455,3.0,6.088235,0.528306,0.319825,0.221987,0.17387,47816.047583,...,0.468254,0.384946,0.01226,0.012298,0.339844,0.180662,0.117902,0.087885,2.420699,affiliation
2,Norwegian_Board_of_Directors_net1mode_2006-09-01,18.261438,17.545455,3.0,6.088235,0.518756,0.307042,0.207665,0.158355,57491.777778,...,0.468254,0.384946,0.012319,0.012356,0.319307,0.162343,0.104315,0.078009,2.4407,affiliation
3,Norwegian_Board_of_Directors_net1mode_2006-10-01,18.37963,17.285714,3.0,6.575,0.543699,0.328027,0.218016,0.158337,59455.229336,...,0.438424,0.342803,0.0,0.0,0.275211,0.142274,0.090978,0.066803,2.512424,affiliation
4,Norwegian_Board_of_Directors_net1mode_2006-11-01,18.307018,15.055556,6.457143,6.842105,0.543538,0.317351,0.200886,0.138364,47108.001407,...,0.438424,0.342803,0.0,0.0,0.267398,0.136614,0.087487,0.064722,2.482343,affiliation


### Drop networks for which fewer than 80% of the features could be calculated:

In [3]:
# Keep only the rows in which at least 80% of the *feature* columns are present.
# The first column (Id) and the last column (CND) are not features, so they are
# excluded via `subset`; otherwise their always-present values would count
# towards the threshold and let rows with slightly fewer than 80% of the
# features slip through.
feature_cols = df.columns[1:-1]
# Integer ceiling of 0.8 * n_features (dropna expects an integer threshold).
min_present = (4 * len(feature_cols) + 4) // 5
df = df.dropna(axis=0, thresh=min_present, subset=feature_cols).reset_index(drop=True)
print("New shape: ", df.shape)

New shape:  (4368, 210)


### Merge small groups ( $<10$ ) into one big group named "other":

In [4]:
# Relabel every category that has fewer than 10 rows as "other".
group_sizes = df["CND"].value_counts()
small_groups = group_sizes[group_sizes < 10].index
df.loc[df["CND"].isin(small_groups), "CND"] = "other"

### Number of elements in each group:

In [5]:
# Tally how many rows carry each CND label.
Counter(df["CND"])

Counter({'affiliation': 111,
         'animal': 547,
         'brain': 29,
         'cheminformatics': 514,
         'other': 48,
         'digital': 37,
         'economic': 27,
         'facebook': 51,
         'genetic': 35,
         'metabolic': 2634,
         'protein': 73,
         'retweet': 30,
         'road': 17,
         'social': 102,
         'software': 16,
         'tissue': 32,
         'transport': 51,
         'web': 14})

### Drop the row containing an infinite value (there is only one):

In [6]:
# Count infinite entries per row across the feature columns (everything except
# the first and last column), then keep only the rows that have none.
dinf = isinf(df.iloc[:, 1:-1]).sum(axis=1)
df = df.loc[dinf == 0].reset_index(drop=True)

### Handling missing values:
1. For each group, save the names of the features that are missing in 20% or more of the group's networks (these features will be ignored later for that group)
2. Impute missing values with random values drawn between the first and third quartiles of the values observed within the same group
3. If the observed values are all integers, then impute them with random integers between those quartiles

In [7]:
def IsIntegerArray(arr):
    """Tell whether every element of `arr` is a whole number.

    The input is converted to float first, so integer-valued floats such as
    2.0 count as integers, while NaN and infinities do not. An empty array
    yields True because there is no element that violates the condition.
    """
    as_float = arr.astype("float")
    # One flag per element: True when the value has no fractional part.
    whole_flags = array([value.is_integer() for value in as_float])
    return len(as_float) == whole_flags.sum()

# Seed NumPy's global random generator (same %pylab namespace that provides
# uniform/randint) so the random imputation is repeatable on a fresh run.
seed(42)

# savetxt does not create missing directories, so make sure the target exists.
os.makedirs("deletefeatures", exist_ok=True)

for CND in unique(df.CND):
    dfg = df[df.CND == CND]
    indgroup = list(dfg.index)
    featcols = dfg.columns[1:-1]

    # Percentage of missing values per feature within this group.
    nulls = dfg.isnull().sum()[1:-1] / len(dfg) * 100

    # Step 1: record the features that are missing in >= 20% of the group.
    todelete = where(nulls >= 20)[0]
    if len(todelete) > 0:
        savetxt("deletefeatures/" + CND + ".txt", list(featcols[todelete]), fmt="%s")

    # Steps 2-3: fill the gaps of every feature that has any missing value.
    for feat in featcols[where(nulls > 0)[0]]:
        column = df.loc[indgroup, feat]
        missing = column.isnull()
        vals = column[~missing]

        if len(vals) == 0:
            # Nothing was observed for this feature in this group, so there are
            # no quartiles to draw from. Leave the cells as NaN: casting the
            # NaN quartiles to int would otherwise fill them with a meaningless
            # huge integer.
            continue

        nmissing = int(missing.sum())
        quartiles = vals.quantile([0.25, 0.75]).values

        if not IsIntegerArray(vals.values):
            # Real-valued feature: draw uniformly between Q1 and Q3.
            rndnums = uniform(low=quartiles[0], high=quartiles[1], size=nmissing)
        else:
            # Integer-valued feature: draw integers from [Q1, Q3] inclusive.
            quartiles = unique(quartiles.astype(int))
            if len(quartiles) == 2:
                rndnums = randint(low=quartiles[0], high=quartiles[1] + 1, size=nmissing)
            else:
                # Q1 and Q3 coincide after truncation: use that single value.
                rndnums = ones(nmissing) * quartiles[0]

        df.loc[missing[missing].index, feat] = rndnums

In [8]:
# Persist the cleaned and imputed table without writing the row index.
df.to_csv(path_or_buf="CND-features_processed.csv", index=False)