In [5]:
import pandas as pd
import datetime
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore") 
from matplotlib import pyplot as plt
from sklearn.svm import SVR
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
from scipy import stats
from sklearn.metrics import precision_score

In [6]:
Filepath = 'C:/Users/ChowdhKa/Downloads/Kakoli Masters/MRP/All File/'

In [7]:
#read csv
df=pd.read_csv(Filepath+'OctDataSmall.csv')
df.columns = ['StartDate', 'EndDate', 'Channel', 'Pwr_dbm','Snr', 'Occupancy']
df['StartDate'] = pd.to_datetime(df['StartDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])
df['StartDate'] = df['StartDate'].values.astype('<M8[h]')
df['EndDate'] = df['EndDate'].values.astype('<M8[h]')
df.head()

Unnamed: 0,StartDate,EndDate,Channel,Pwr_dbm,Snr,Occupancy
0,2016-10-06,2016-10-06 01:00:00,56303,-83.293709,21.397917,100.0
1,2016-10-06,2016-10-06 01:00:00,56304,-63.193748,41.497952,100.0
2,2016-10-06,2016-10-06 01:00:00,56305,-31.322813,73.445999,100.0
3,2016-10-06,2016-10-06 01:00:00,56306,-64.096245,41.568588,100.0
4,2016-10-06,2016-10-06 01:00:00,56307,-83.440269,21.249647,100.0


In [8]:
#putting dataframe in another dataframe 
dataf = df

In [9]:
# Take the data frame from R file
df_merged=pd.read_csv(Filepath+'testcsv.csv',delimiter="\t")
#print(df_merged)
df_merged.columns = ['Channel','StartDate', 'EndDate', 'Pwr_dbm','Snr', 'Occupancy','per_read']
df_merged['StartDate'] = pd.to_datetime(df_merged['StartDate'])
df_merged['EndDate'] = pd.to_datetime(df_merged['EndDate'])
df_merged['StartDate'] = df_merged['StartDate'].values.astype('<M8[h]')
df_merged['EndDate'] = df_merged['EndDate'].values.astype('<M8[h]')
df_merged.head()

Unnamed: 0,Channel,StartDate,EndDate,Pwr_dbm,Snr,Occupancy,per_read
1,10,2016-10-07 08:00:00,2016-10-07 09:00:00,-91.5,11.7,0.074493,82.82504
2,10,2016-10-21 21:00:00,2016-10-21 22:00:00,-90.774193,12.806452,0.466165,82.82504
3,10,2016-10-11 01:00:00,2016-10-11 02:00:00,-91.59259,11.444445,0.203912,82.82504
4,10,2016-10-21 12:00:00,2016-10-21 13:00:00,-91.340424,11.851064,0.3528,82.82504
5,10,2016-10-20 23:00:00,2016-10-21 00:00:00,-92.064514,11.580646,0.237639,82.82504


In [10]:
# Taking only the required columns
df_merged = df_merged[['Channel','StartDate', 'EndDate', 'Occupancy']]

In [11]:
#Getting the final channel list

channel = df_merged.Channel.unique().tolist()
channelList = np.random.choice(channel, 300, replace=False)

In [12]:
#channelList.tolist()

In [13]:
#channelList = [20750,22320,23200,28430,34680,34690,50267,15220,15530,15570,15820,16010,16130]#,
               #21340,24220,25410,53970,54006,54494,54620,56097,56293,56305,56306,56307]

In [14]:
#channelList = [20750,22320,23200,28430]
               #,34680,34690,50267,15220,15530,15570,15820,16010,16130,
               #21340,24220,25410,53970,54006,54494,54620,56097,56293,56305,56306,56307]

In [15]:
OriginalMergedDF = df_merged

In [16]:
def ProcessDF(channel, threshold):
    InterDF = OriginalMergedDF[(OriginalMergedDF.Channel == channel)][['StartDate', 'EndDate', 'Channel', 'Occupancy']]
    InterDF = InterDF.set_index('StartDate')
    InterDF = InterDF.sort_index()
    indexStart = InterDF.index[0]
    indexEnd = InterDF.index[-1]
    InterDF= InterDF.reindex(pd.date_range(start=indexStart, end=indexEnd, freq='1h'))
    shiftVales = 2
    InterDF['Occupancy1'] = InterDF['Occupancy'].shift()
    while shiftVales <=6:
        InterDF['Occupancy'+str(shiftVales)] = InterDF['Occupancy'+str(shiftVales-1)].shift()
        shiftVales=shiftVales+1
    # Convert index back to dataframe column
    InterDF.reset_index(level=0, inplace=True)
    InterDF.drop(InterDF.columns[[0,1]], axis=1, inplace=True)
    InterDF = InterDF.dropna(how='any', axis = 0)
    #InterDF['OccupancyLabel'] = np.where(InterDF['Occupancy'] > threshold, 1,0)
    InterDF['OccupancyLabel'] = pd.cut(InterDF['Occupancy'], [-np.inf, threshold, np.inf], labels=[0, 1])
    return InterDF

In [17]:
def GausNB(X_train, y_train, X_test, y_test):
    model = GaussianNB()
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    confusion_NB = confusion_matrix(y_test, predicted, labels=[0,1])
    # Accuracy
    scoreNB = accuracy_score(y_test, predicted)
    # Sensitivity
    senseNB = confusion_NB[1,1]/(confusion_NB[1,0]+confusion_NB[1,1])
    # Specificity
    specificNB = confusion_NB[0,0]/(confusion_NB[0,0]+confusion_NB[0,1])
    # F-measure
    fmeasureNB = f1_score(y_test, predicted,average='weighted')
    # Precision
    precisionscoreNB = precision_score(y_test, predicted, average='weighted')


    return scoreNB, senseNB, specificNB, fmeasureNB, precisionscoreNB

In [18]:
def LogReg(X_train, y_train, X_test, y_test):
    reg = LogisticRegression()
    modfit = reg.fit(X_train, y_train)
    yPred = modfit.predict(X_test)
    confusion_LR = confusion_matrix(y_test, yPred, labels=[0,1])
    # Accuracy
    # scoreLR = modfit.score(X_test, y_test)
    scoreLR = (confusion_LR[0,0]+confusion_LR[1,1])/(confusion_LR[0,0]+confusion_LR[0,1]+confusion_LR[1,0]+confusion_LR[1,1])
    # Sensitivity
    senseLR = confusion_LR[1,1]/(confusion_LR[1,0]+confusion_LR[1,1])
    # Specificity
    specificLR = confusion_LR[0,0]/(confusion_LR[0,0]+confusion_LR[0,1])
    # F-measure
    fmeasureLR = f1_score(y_test, yPred, average='weighted')
    # Precision
    precisionscoreLR = precision_score(y_test, yPred, average='weighted')

    return scoreLR, senseLR, specificLR, fmeasureLR, precisionscoreLR

In [19]:
#print(k[1])

In [20]:
# Defining the threshold for class labeling
SaveDataDF = pd.DataFrame([])
t = 70
#list_threshold = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
#list_threshold = [40, 50, 60, 70]
#for t in list_threshold:
for channel in channelList:
    #print('Value of Threshold: ' + str(t))
    #print('Current channel is: '+ str(channel))

    ProcessedDF = ProcessDF(channel, t)
    ProcessedDFx = ProcessedDF[(ProcessedDF.Channel == channel)][['Channel', 'Occupancy1', 'Occupancy2', 'Occupancy3', 'Occupancy4', 'Occupancy5', 'Occupancy6']]
    ProcessedDFy = ProcessedDF[(ProcessedDF.Channel == channel)][['OccupancyLabel']] 
    #print(ProcessedDFy.OccupancyLabel.unique())
    #print(ProcessedDFy['OccupancyLabel'].value_counts())
    k = ProcessedDFy.groupby('OccupancyLabel').size()
    #print('k[0]: '+ str(k[0]))
    #print('k[1]: '+ str(k[1]))
    if (k[0] > 5 and k[1] > 5):
        RunIter = 1
        while RunIter <= 10:
            foldCounter = 1
            kc = KFold(n_splits = 10, shuffle=True)
            for train_x, test_x in kc.split(ProcessedDFx, ProcessedDFy):
                X_trainDF, X_testDF = ProcessedDFx.iloc[train_x], ProcessedDFx.iloc[test_x]
                y_trainDF, y_testDF = ProcessedDFy.iloc[train_x], ProcessedDFy.iloc[test_x]


                # Naive Bayes
                scoreNB, sensitivityNB, specificityNB, fmeasureNB, precisionNB = GausNB(X_trainDF, y_trainDF, X_testDF, y_testDF)
                #if len(y_trainDF.OccupancyLabel.unique()) > 1:
                # Logistic Regression
                scoreLR, sensitivityLR, specificityLR, fmeasureLR, precisionLR = LogReg(X_trainDF, y_trainDF, X_testDF, y_testDF)

                # PRINT ROW VALUE IN DATAFRAME- KEEP APPENDING
                SaveDataDF = SaveDataDF.append(pd.DataFrame({'Channel': channel, 'Threshold': t,
                                                             'RunCount': RunIter,'FoldCount': foldCounter,
                                                             'AccuracyLR': scoreLR, 'SensitivityLR': sensitivityLR,
                                                             'SpecificityLR': specificityLR, 'AccuracyNB': scoreNB,
                                                             'SensitivityNB': sensitivityNB, 'SpecificityNB': specificityNB,
                                                             'fmeasureNB': fmeasureNB, 'fmeasureLR': fmeasureLR,
                                                             'precisionLR': precisionLR, 'precisionNB': precisionNB
                                                            }, 
                                                            index=[0]), ignore_index=True)
            foldCounter = foldCounter + 1
            RunIter = RunIter + 1


In [21]:
#fmeasureLR

In [22]:
#ProcessedDFy.head()

In [23]:
ProcessedDFx.head()

Unnamed: 0,Channel,Occupancy1,Occupancy2,Occupancy3,Occupancy4,Occupancy5,Occupancy6
6,16150.0,100.0,100.0,100.0,100.0,100.0,100.0
7,16150.0,100.0,100.0,100.0,100.0,100.0,100.0
8,16150.0,100.0,100.0,100.0,100.0,100.0,100.0
9,16150.0,100.0,100.0,100.0,100.0,100.0,100.0
10,16150.0,100.0,100.0,100.0,100.0,100.0,100.0


In [24]:
SaveDataDF.head()

Unnamed: 0,AccuracyLR,AccuracyNB,Channel,FoldCount,RunCount,SensitivityLR,SensitivityNB,SpecificityLR,SpecificityNB,Threshold,fmeasureLR,fmeasureNB,precisionLR,precisionNB
0,0.836735,0.836735,51401,1,1,0.789474,0.789474,0.866667,0.866667,70,0.836735,0.836735,0.836735,0.836735
1,0.938776,0.857143,51401,1,1,0.842105,0.842105,1.0,0.866667,70,0.937609,0.857764,0.944341,0.859113
2,0.795918,0.77551,51401,1,1,0.823529,0.882353,0.78125,0.71875,70,0.799857,0.78089,0.814383,0.817653
3,0.836735,0.816327,51401,1,1,0.772727,0.863636,0.888889,0.777778,70,0.835763,0.816786,0.83765,0.823367
4,0.836735,0.693878,51401,1,1,0.777778,0.666667,0.870968,0.709677,70,0.836735,0.697869,0.836735,0.706997


In [25]:
SaveDataDF.to_csv(Filepath+'OutputData.csv', header=False,sep='\t', encoding='utf-8', index=False)

In [26]:
readResults = pd.read_csv(Filepath+'OutputData.csv', header=1, sep ='\t')

In [27]:
readResults.columns = ['AccuracyLR', 'AccuracyNB', 'Channel', 'FoldCount', 'RunCount', 
                    'SensitivityLR', 'SensitivityNB','SpecificityLR', 
                    'SpecificityNB','Threshold','FmeasureLR', 'FmeasureNB', 'PrecisionLR', 'PrecisionNB']

In [28]:
readResults.head()

Unnamed: 0,AccuracyLR,AccuracyNB,Channel,FoldCount,RunCount,SensitivityLR,SensitivityNB,SpecificityLR,SpecificityNB,Threshold,FmeasureLR,FmeasureNB,PrecisionLR,PrecisionNB
0,0.795918,0.77551,51401,1,1,0.823529,0.882353,0.78125,0.71875,70,0.799857,0.78089,0.814383,0.817653
1,0.836735,0.816327,51401,1,1,0.772727,0.863636,0.888889,0.777778,70,0.835763,0.816786,0.83765,0.823367
2,0.836735,0.693878,51401,1,1,0.777778,0.666667,0.870968,0.709677,70,0.836735,0.697869,0.836735,0.706997
3,0.795918,0.653061,51401,1,1,0.866667,0.733333,0.764706,0.617647,70,0.80305,0.666631,0.833819,0.723163
4,0.791667,0.791667,51401,1,1,0.631579,0.736842,0.896552,0.827586,70,0.786132,0.791667,0.792677,0.791667


In [29]:
pVal = stats.ttest_ind(readResults['AccuracyLR'],readResults['AccuracyNB'])
pValue = pVal[1]

In [30]:
readResults = readResults.drop(['FoldCount', 'RunCount'], 1)

In [31]:
PerThresPerChannelDF = readResults.groupby(['Channel','Threshold']).mean().add_suffix('_mean').reset_index()

In [32]:
PerThresPerChannelDF.head()

Unnamed: 0,Channel,Threshold,AccuracyLR_mean,AccuracyNB_mean,SensitivityLR_mean,SensitivityNB_mean,SpecificityLR_mean,SpecificityNB_mean,FmeasureLR_mean,FmeasureNB_mean,PrecisionLR_mean,PrecisionNB_mean
0,830,70,0.95307,0.916475,0.341429,0.629577,0.984005,0.930191,0.947168,0.932312,0.948626,0.955398
1,3860,70,0.904154,0.849826,0.638723,0.672474,0.960224,0.885313,0.899629,0.856558,0.904221,0.870664
2,7810,70,0.988873,0.97199,0.737708,0.758958,0.997871,0.979766,0.986896,0.97455,0.987942,0.981223
3,10670,70,0.955536,0.921199,0.966908,0.899781,0.943413,0.943551,0.955515,0.921194,0.957217,0.924729
4,11810,70,0.919349,0.893002,0.924831,0.881012,0.912559,0.903866,0.91928,0.892964,0.922165,0.896856


In [33]:
PerThresPerChannelDF['CountChan'] = 1
aggregator = {'AccuracyLR_mean' : 'mean',
              'AccuracyNB_mean' : 'mean',
              'SensitivityLR_mean': 'mean',
              'SensitivityNB_mean' : 'mean',
              'SpecificityLR_mean': 'mean',
              'SpecificityNB_mean' : 'mean',
              'FmeasureLR_mean':'mean',
              'FmeasureNB_mean':'mean',
              'PrecisionLR_mean': 'mean',
              'PrecisionNB_mean': 'mean',
              'CountChan': 'sum'}
FinalResultsDF = PerThresPerChannelDF.groupby('Threshold', as_index=False)['AccuracyLR_mean', 'AccuracyNB_mean', 
                                                                 'SensitivityLR_mean', 'SensitivityNB_mean', 
                                                                 'SpecificityLR_mean', 'SpecificityNB_mean',
                                                                 'PrecisionLR_mean','PrecisionNB_mean',
                                                                 'FmeasureLR_mean', 'FmeasureNB_mean'].agg(aggregator)


In [34]:
#PerThresPerChannelDF = PerThresPerChannelDF.drop(['Channel'], 1)
#FinalResultsDF = PerThresPerChannelDF.groupby(['Threshold']).mean().reset_index()
FinalResultsDF['Cluster'] = '0'
FinalResultsDF['pValue'] = pValue

In [35]:
FinalResultsDF

Unnamed: 0,Threshold,AccuracyLR_mean,AccuracyNB_mean,SensitivityLR_mean,SensitivityNB_mean,SpecificityLR_mean,SpecificityNB_mean,FmeasureLR_mean,FmeasureNB_mean,PrecisionLR_mean,PrecisionNB_mean,CountChan,Cluster,pValue
0,70,0.953279,0.912492,0.758168,0.84203,0.872748,0.872794,0.950252,0.92131,0.951986,0.937681,53,0,4.85416e-252


In [36]:
# Write the results to a CSV
FinalResultsDF.to_csv(Filepath+'Output/FinalResults.csv', header=False,sep='\t', index=False, mode = 'a')

In [43]:
Result = pd.read_csv(Filepath+'Output/FinalResults.csv', header=None, sep ='\t')

In [44]:
Result

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,70,0.953279,0.912492,0.758168,0.84203,0.872748,0.872794,0.950252,0.92131,0.951986,0.937681,53,0,4.85416e-252
1,70,0.939533,0.901293,0.910757,0.891423,0.940299,0.904008,0.939007,0.903074,0.942067,0.91132,96,14,3.8997080000000005e-52
2,70,0.948458,0.878009,0.462043,0.731838,0.972968,0.878027,0.943943,0.899268,0.943976,0.93607,81,2,1.28052e-127


In [45]:
Result.columns = ['Threshold', 'LRaccuracy', 'NBaccuracy', 
                       'LRsensitivity', 'NBsensitivity', 'LRspecificity', 
                       'NBspecificity', 'LRfmeasure', 'NBfmeasure', 'LRprecision', 
                       'NBprecision', 'CountChan', 'Cluster', 'pValue']

In [46]:
Result = Result[(Result['Threshold'] == 70)]

In [47]:
Result

Unnamed: 0,Threshold,LRaccuracy,NBaccuracy,LRsensitivity,NBsensitivity,LRspecificity,NBspecificity,LRfmeasure,NBfmeasure,LRprecision,NBprecision,CountChan,Cluster,pValue
0,70,0.953279,0.912492,0.758168,0.84203,0.872748,0.872794,0.950252,0.92131,0.951986,0.937681,53,0,4.85416e-252
1,70,0.939533,0.901293,0.910757,0.891423,0.940299,0.904008,0.939007,0.903074,0.942067,0.91132,96,14,3.8997080000000005e-52
2,70,0.948458,0.878009,0.462043,0.731838,0.972968,0.878027,0.943943,0.899268,0.943976,0.93607,81,2,1.28052e-127


In [48]:
#Result.to_csv(Filepath+'Output/FinalResultstoCompare.csv', header=False,sep='\t', index=False, mode = 'a')