In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from pymfe.mfe import MFE
import math

In [2]:
df=pd.read_csv('dataset.csv')

In [3]:
df.bug.value_counts()

0    579
1    166
Name: bug, dtype: int64

In [4]:
df=df[:720]

In [5]:
group_size=120

In [6]:
lst = [df.iloc[i:i+group_size] for i in range(0,len(df)-group_size+1,group_size)]

In [7]:
def random_sample(df):
    return df.sample(n=50)

In [8]:
def systematic_sampling(df, step=3):
    
    indexes = np.arange(0,len(df),step=step)
    systematic_sample = df.iloc[indexes]
    return systematic_sample

In [9]:
def cluster_sampling(df, number_of_clusters=6):
    
    try:
        # Divide the units into cluster of equal size
        df['cluster_id'] = np.repeat([range(1,number_of_clusters+1)],len(df)/number_of_clusters)

        # Create an empty list
        indexes = []

        # Append the indexes from the clusters that meet the criteria
        # For this formula, clusters id must be an even number
        for i in range(0,len(df)):
            if df['cluster_id'].iloc[i]%2 == 0:
                indexes.append(i)
        cluster_sample = df.iloc[indexes]
        return(cluster_sample.drop('cluster_id',axis=1))
    
    except:
        print("The population cannot be divided into clusters of equal size!")
        


In [10]:
sm=[random_sample,systematic_sampling,cluster_sampling]

In [11]:
def rank_score(lst,sm,numfolds=2):
    m=len(lst)
    for i in range(0,m):
        dff=lst[i]
        dff=dff.sample(frac=1)
        n=len(sm)
        RankScore=[[0]*n for i in range(0,m)]
        for j in range(0,n):
            RankScore[i][j]=0
            bin_size=60
            bins = [dff.iloc[i:i+bin_size] for i in range(0,len(dff)-bin_size+1,bin_size)]
            for k in range(0,numfolds-1):
                test=bins[k]
                train=dff.drop(test.index)
                BalTrain=sm[j](train)
                BalTrain_x=BalTrain.drop('bug',axis=1)
                BalTrain_y=BalTrain['bug']
                test_x=test.drop('bug',axis=1)
                test_y=test['bug']
                model=GaussianNB()
                model.fit(BalTrain_x,BalTrain_y)
                y_pred=model.predict(test_x)
                accuracy=accuracy_score(test_y,y_pred)*100
                RankScore[i][j]=RankScore[i][j]+accuracy
            RankScore[i][j]=RankScore[i][j]/numfolds
    return RankScore

In [12]:
def cacl_dist(a,b):
    l=len(a)
    sum1=0
    for i in range(0,l):
        if pd.isna(a[i]-b[i])==False:
            sum1+=((a[i]-b[i])**2)
    return math.sqrt(sum1)

In [13]:
def simi_score(lst,nd):
    m=len(lst)
    simiscore=[0]*m
    for i in range(0,m):
        dff=lst[i]
        dff_maj=dff[dff.bug==0]
        dff_min=dff[dff.bug==1]
        nd_maj=nd[nd.bug==0]
        nd_min=nd[nd.bug==1]
        mfe_maj=MFE()
        mfe_maj.fit(np.array(dff_maj.drop('bug',axis=1)),np.array(dff_maj['bug']))
        mf_maj=np.array(mfe_maj.extract()[1])
        mf_maj=np.nan_to_num(mf_maj)
        mfe_min=MFE()
        mfe_min.fit(np.array(dff_min.drop('bug',axis=1)),np.array(dff_min['bug']))
        mf_min=np.array(mfe_min.extract()[1])
        mf_min=np.nan_to_num(mf_min)
        mf_dff=mf_maj+mf_min
        print(mf_dff)
        mfe_nd_maj=MFE()
        mfe_nd_maj.fit(np.array(nd_maj.drop('bug',axis=1)),np.array(nd_maj['bug']))
        mf_nd_maj=np.array(mfe_nd_maj.extract()[1])
        mf_nd_maj=np.nan_to_num(mf_nd_maj)
        mfe_nd_min=MFE()
        mfe_nd_min.fit(np.array(nd_min.drop('bug',axis=1)),np.array(nd_min['bug']))
        mf_nd_min=np.array(mfe_nd_min.extract()[1])
        mf_nd_min=np.nan_to_num(mf_nd_min)
        mf_nd=mf_nd_maj+mf_nd_min
        print(mf_nd)
        print(len(mf_dff),len(mf_nd))

        ed=cacl_dist(mf_dff,mf_nd)
#         temp=mf_dff-mf_nd
        simiscore[i]=ed
    
        

    return simiscore


In [14]:
rs=rank_score(lst[0:4],sm,2)

In [15]:
rs

[[0, 0, 0],
 [0, 0, 0],
 [0, 0, 0],
 [40.833333333333336, 36.666666666666664, 37.5]]

In [16]:
ss=simi_score(lst[0:4],lst[5])

 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').
 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').


[ 2.58580538e-01  2.72428030e-01  2.66071407e+00  6.37459020e-01
  1.11317254e+00  2.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.49999604e-02  1.11803412e-01
  0.00000000e+00  6.33117831e-01  4.84864671e-01  1.81484042e+03
  1.65851380e+04  2.90553435e+04  1.20763548e+05  2.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  2.50753477e+01  8.76826157e+01  0.00000000e+00  9.21419655e+00
  2.48977186e+01  6.00000000e+00  5.68025841e+01  1.65311287e+02
  2.66071409e+00  6.37459029e-01  1.79741460e+01  3.07271032e+01
  2.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e+00
  0.00000000e+00            -inf  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  3.49955703e+01  9.91391583e+01  3.82779072e+02  9.73023553e+02
  6.28730875e+01  1.69368906e+02  3.66578754e+01  1.00996505e+02
  1.42634736e+00  2.54854765e+00 -1.12775342e-08  1.10371199e-08
  2.00000000e+00  0.00000

 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').
 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').


[ 2.92369346e-01  3.28740544e-01  2.61292319e+00  6.45564141e-01
  1.07575078e+00  2.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.49999573e-02  1.11803415e-01
  0.00000000e+00  5.99810485e-01  4.77229381e-01  9.10175754e+02
  7.59610070e+03  1.59516187e+04  6.07762985e+04  2.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  2.64395699e+01  8.74640685e+01  0.00000000e+00  1.19252467e+01
  3.35164950e+01  6.00000000e+00  5.05887278e+01  1.45899081e+02
  2.61292320e+00  6.45564150e-01  2.02131443e+01  2.85734418e+01
  2.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e+00
  0.00000000e+00            -inf  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  3.28170528e+01  9.47504531e+01  3.41222353e+02  8.44640356e+02
  5.58725866e+01  1.49338610e+02  3.75149370e+01  1.08450880e+02
  1.74842447e+00  3.31401874e+00 -1.09947342e-08  1.32342653e-08
  2.00000000e+00  0.00000

 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').
 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').


[ 2.37946589e-01  2.49757851e-01  3.15305224e+00  7.09857053e-01
  8.52272727e-01  2.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -3.88978156e-08  2.14882447e-08
  0.00000000e+00  5.19182527e-01  4.67697569e-01  1.93997463e+03
  1.68896617e+04  3.89563672e+04  1.40855233e+05  2.00000000e+00
  0.00000000e+00 -0.00000000e+00  2.00000000e+00  0.00000000e+00
  2.49707791e+01  8.79505005e+01  0.00000000e+00  1.25034057e+01
  4.12371541e+01  6.00000000e+00  5.67723369e+01  1.65989397e+02
  3.15305226e+00  7.09857064e-01  1.57138053e+01  2.42847694e+01
  2.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e+00
  0.00000000e+00            -inf  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  3.02164786e+01  8.94906230e+01  5.72605863e+02  1.48314258e+03
  6.72220273e+01  1.78611337e+02  3.70240042e+01  1.09038984e+02
  1.33370949e+00  3.87573924e+00 -2.30174795e-08  1.53902935e-08
  2.00000000e+00  0.00000

 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').
 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').


[ 2.59091130e-01  2.83020609e-01  2.61292319e+00  6.45564141e-01
  1.07575078e+00  2.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.49999573e-02  1.11803415e-01
  0.00000000e+00  5.99810485e-01  4.77229381e-01  9.10175754e+02
  7.59610070e+03  1.59516187e+04  6.07762985e+04  2.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  2.64395699e+01  8.74640685e+01  0.00000000e+00  1.19252467e+01
  3.35164950e+01  6.00000000e+00  5.05887278e+01  1.45899081e+02
  2.61292320e+00  6.45564150e-01  2.02131443e+01  2.85734418e+01
  2.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e+00
  0.00000000e+00            -inf  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  3.28170528e+01  9.47504531e+01  3.41222353e+02  8.44640356e+02
  5.58725866e+01  1.49338610e+02  3.75149370e+01  1.08450880e+02
  1.74842447e+00  3.31401874e+00 -1.09947342e-08  1.32342653e-08
  2.00000000e+00  0.00000

 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').
 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').


[ 2.33980589e-01  2.42401877e-01  2.63864574e+00  6.65037607e-01
  1.11317254e+00  2.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.49999582e-02  1.11803423e-01
  0.00000000e+00  5.54470793e-01  4.48510464e-01  8.46646512e+02
  6.92509938e+03  4.83249383e+04  2.12637539e+05  2.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  3.00887382e+01  1.05144282e+02  0.00000000e+00  1.99127611e+01
  6.62586110e+01  6.00000000e+00  5.22150044e+01  1.44828913e+02
  2.63864575e+00  6.65037615e-01  1.29533326e+01  1.97192348e+01
  2.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e+00
  0.00000000e+00            -inf  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  3.56429917e+01  1.07685697e+02  3.78269350e+02  1.28846104e+03
  6.63597588e+01  2.01337597e+02  4.64171740e+01  1.43899243e+02
  5.35534963e+00  1.70035118e+01 -1.04568488e-08  1.10029247e-08
  2.00000000e+00  0.00000

 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').
 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').


[ 2.75585476e-01  2.72018130e-01  2.61292319e+00  6.45564141e-01
  1.07575078e+00  2.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.49999573e-02  1.11803415e-01
  0.00000000e+00  5.99810485e-01  4.77229381e-01  9.10175754e+02
  7.59610070e+03  1.59516187e+04  6.07762985e+04  2.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  2.64395699e+01  8.74640685e+01  0.00000000e+00  1.19252467e+01
  3.35164950e+01  6.00000000e+00  5.05887278e+01  1.45899081e+02
  2.61292320e+00  6.45564150e-01  2.02131443e+01  2.85734418e+01
  2.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e+00
  0.00000000e+00            -inf  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  3.28170528e+01  9.47504531e+01  3.41222353e+02  8.44640356e+02
  5.58725866e+01  1.49338610e+02  3.75149370e+01  1.08450880e+02
  1.74842447e+00  3.31401874e+00 -1.09947342e-08  1.32342653e-08
  2.00000000e+00  0.00000

 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').
 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').


[ 2.49517706e-01  2.52136411e-01  2.68202531e+00  5.25812569e-01
  1.01052632e+00  2.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -4.34799678e-08  2.55050383e-08
  0.00000000e+00  6.06939466e-01  5.15582166e-01  5.66310850e+03
  4.36162259e+04  1.17748110e+05  4.64915014e+05  2.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  4.07296308e+01  1.44532105e+02  0.00000000e+00  2.49121243e+01
  8.36829987e+01  6.00000000e+00  7.85098223e+01  2.19445926e+02
  2.68202533e+00  5.25812574e-01  1.39491216e+01  2.11257806e+01
  2.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e+00
  0.00000000e+00            -inf  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  5.38915961e+01  1.59025627e+02  6.49571829e+02  1.84320570e+03
  9.64341620e+01  2.53348887e+02  6.35814006e+01  1.94618047e+02
  5.78040167e+00  1.72676597e+01 -1.09307229e-08  1.09824824e-08
  2.00000000e+00  0.00000

 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').
 Exception message: ValueError('attempt to get argmin of an empty sequence').
 Exception message: ZeroDivisionError('float division by zero').


[ 2.47149832e-01  3.13990498e-01  2.61292319e+00  6.45564141e-01
  1.07575078e+00  2.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.49999573e-02  1.11803415e-01
  0.00000000e+00  5.99810485e-01  4.77229381e-01  9.10175754e+02
  7.59610070e+03  1.59516187e+04  6.07762985e+04  2.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  2.64395699e+01  8.74640685e+01  0.00000000e+00  1.19252467e+01
  3.35164950e+01  6.00000000e+00  5.05887278e+01  1.45899081e+02
  2.61292320e+00  6.45564150e-01  2.02131443e+01  2.85734418e+01
  2.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e+00
  0.00000000e+00            -inf  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  3.28170528e+01  9.47504531e+01  3.41222353e+02  8.44640356e+02
  5.58725866e+01  1.49338610e+02  3.75149370e+01  1.08450880e+02
  1.74842447e+00  3.31401874e+00 -1.09947342e-08  1.32342653e-08
  2.00000000e+00  0.00000



In [17]:
ss

[2401370.180319371, 389661760.44755787, 5052663.879792994, 6548716.840648406]

In [18]:
def SamplingMethodsRecommendation(rs,ss,sm):
    m=len(ss)
    n=len(sm)
    RecScore=[0]*n
    
    for i in range(0,n):
        sum1=0
        for j in range(0,m):
            sum1+=ss[j]*rs[j][i]
        RecScore[i]=sum1
    RankedN=dict(zip(sm,RecScore))
    RankedN=sorted(RankedN.items(),key=lambda x: x[1],reverse=True)
    return RankedN
        

In [19]:
SamplingMethodsRecommendation(rs,ss,sm)

[(<function __main__.random_sample(df)>, 267405937.65980995),
 (<function __main__.cluster_sampling(df, number_of_clusters=6)>,
  245576881.52431524),
 (<function __main__.systematic_sampling(df, step=3)>, 240119617.49044153)]