In [2]:
import pandas as pd
import numpy as np
import math
from sklearn.svm import SVC
from skmultiflow.data import DataStream

In [3]:
def make_sparse(label):
    i = np.random.randint(1,10)
    if (i<=8):
        label = np.nan # cannot convert float NaN to integer
        return label
    else:
        return label[0]

In [4]:
data_orig = pd.read_csv("./Data/package_data/sea.csv")#, nrows = 25000)
#data_orig = pd.read_pickle("./Data//own_data/own_concept_drift_small_2")#, nrows = 25000)
data_orig.describe()

Unnamed: 0,feat1,feat2,feat3,label
count,60000.0,60000.0,60000.0,60000.0
mean,4.997519,5.007497,5.012034,0.626933
std,2.883807,2.883343,2.887424,0.483624
min,2.9e-05,0.000187,0.000208,0.0
25%,2.502614,2.514903,2.515473,0.0
50%,5.012329,5.015645,5.020778,1.0
75%,7.500016,7.504855,7.510801,1.0
max,9.999989,9.999788,9.999893,1.0


In [819]:
df_sparse = data_orig.copy()
data_sparse = [make_sparse(label) for label in zip(df_sparse['label'])]
df_sparse['label'] = data_sparse

Transductive Learning

In [820]:
def self_learning(sparse_samples_df, clf, threshold_prob, threshold_amount):
    # predicte die konfidenz für die ungelabelten und sortiere nach konfidenz
    probas = pd.DataFrame(clf.predict_proba(sparse_samples_df), columns=['probas_class1', 'probas_class2'])
    probas['max'] = probas.max(axis=1)
    probas['pred_class'] = clf.predict(sparse_samples_df)

    sparse_samples_df = sparse_samples_df.reset_index(drop=True)
    probas = probas.reset_index(drop=True)

    sparse_samples_df['probas_class1'] = probas['probas_class1']
    sparse_samples_df['probas_class2'] = probas['probas_class2']
    sparse_samples_df['max'] = probas['max']
    sparse_samples_df['pred_class'] = probas['pred_class']

    # assign new labels to most confident tuples
    sparse_samples_df = sparse_samples_df.sort_values(by=['max'], ascending=False)
    sparse_samples_df = sparse_samples_df.reset_index(drop=True)
    sparse_samples_df["label"] = sparse_samples_df.apply(lambda row: row["pred_class"] if row["max"] > threshold_prob and row.name < threshold_amount else np.nan, axis=1)
    
    new_labeled_df = sparse_samples_df[sparse_samples_df['label'].isnull() != True]
    new_labeled_df = new_labeled_df.drop(['probas_class1', 'probas_class2', 'max', 'pred_class'], axis=1)
    
    sparse_samples_df = sparse_samples_df[sparse_samples_df['label'].isnull()]
    sparse_samples_df = sparse_samples_df.drop(['probas_class1', 'probas_class2', 'max', 'pred_class', 'label'], axis=1)

    return (sparse_samples_df, new_labeled_df)

In [833]:
clf = SVC(probability=True)
stream = DataStream(data = df_sparse)
stream.prepare_for_use()

X, y = stream.next_sample(1000)

# unterscheide zwischen labelled und unlabeled samples, bzw. ziehe die ungelabelten raus
df_mixed = pd.DataFrame(X, columns=['F1','F2', 'F3'])#, 'F4', 'concept'])
df_mixed['label'] = y

sparse_samples_df = df_mixed[df_mixed['label'].isnull()]
sparse_samples_df = sparse_samples_df.drop(columns=['label'], axis=1)

df_labeled = df_mixed[df_mixed['label'].isnull() != True]
clf.fit(df_labeled.iloc[:,:-1], df_labeled.iloc[:,-1]) 

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [834]:
i = 0
while (len(df_labeled) < 900 and i <200):
    i+=1
    # füge gelabelte Instanzen iterativ zu dataframe hinzu und trainiere dann neu
    sparse_samples_df, new_labeled_df = self_learning(sparse_samples_df, clf, 0.93, 400)
    df_labeled = pd.concat([df_labeled, new_labeled_df])
    clf.fit(df_labeled.iloc[:,:-1], df_labeled.iloc[:,-1])
    #print('remaining:', len(sparse_samples_df), ' newly labelled:', len(new_labeled_df))

In [835]:
len(df_labeled)

877

Vergleichen

In [836]:
X_test = data_orig.iloc[1000:2000,:-1]
y_test = data_orig.iloc[1000:2000,-1]

clf1 = SVC()#class_weight parameter -> minority class stärker gewichten!
clf2 = SVC()

clf1.fit(df_labeled.iloc[:,:-1], df_labeled.iloc[:,-1])
clf2.fit(data_orig.iloc[:1000,:-1], data_orig.iloc[:1000,-1])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [837]:
score1 = clf1.score(X_test, y_test)
score2 = clf2.score(X_test, y_test)

data_orig = data_orig.iloc[:1000,:]
df_labeled.columns = ['feat1', 'F2', 'F3', 'label_gen']
df_labeled.columns = ['feat1', 'F2', 'F3', 'label_gen']
df_merged = pd.merge(data_orig, df_labeled, on='feat1', how='right')

df_diff = df_merged[df_merged['label']!=df_merged['label_gen']]

In [838]:
print((1-(len(df_mixed[df_mixed['label'].isnull()])/len(df_mixed)))*100, '% labels available')
print((len(df_labeled)/1000)*100, '%', 'or', len(df_labeled), 'labels of missing labels classified.')
print((len(df_diff)/len(df_labeled))*100, '% of missing labels wrongly classified.')
print((score1 - score2)*100, '% accuracy loss by using self-learned labels')

9.799999999999997 % labels available
87.7 % or 877 labels of missing labels classified.
11.060433295324971 % of missing labels wrongly classified.
-4.200000000000004 % accuracy loss by using self-learned labels


In [827]:
df_merged

Unnamed: 0,feat1,feat2,feat3,label,F2,F3,label_gen
0,9.874437,8.817701,4.786266,1,8.817701,4.786266,1.0
1,7.118725,2.990575,1.964403,0,2.990575,1.964403,0.0
2,6.128244,8.449696,2.604408,1,8.449696,2.604408,1.0
3,1.255071,5.924060,8.626920,1,5.924060,8.626920,1.0
4,7.816036,1.068024,8.051717,1,1.068024,8.051717,1.0
...,...,...,...,...,...,...,...
867,8.113069,2.883250,1.464119,0,2.883250,1.464119,0.0
868,1.383559,7.634090,5.912263,0,7.634090,5.912263,1.0
869,7.469916,3.201049,0.177718,0,3.201049,0.177718,0.0
870,4.401972,4.421945,6.598413,1,4.421945,6.598413,1.0


In [828]:
df_merged[df_merged['label']!=df_merged['label_gen']]

Unnamed: 0,feat1,feat2,feat3,label,F2,F3,label_gen
10,2.371031,8.468529,7.585784,0,8.468529,7.585784,1.0
22,1.818147,0.838802,6.343037,0,0.838802,6.343037,1.0
25,4.042532,2.495767,6.475234,0,2.495767,6.475234,1.0
28,2.696145,7.646451,5.735073,0,7.646451,5.735073,1.0
32,0.484697,8.138923,9.671927,0,8.138923,9.671927,1.0
...,...,...,...,...,...,...,...
814,8.028746,6.464602,5.289002,0,6.464602,5.289002,1.0
819,5.188537,1.346923,6.062836,0,1.346923,6.062836,1.0
832,9.024073,1.862522,7.640088,0,1.862522,7.640088,1.0
857,8.215626,7.375809,2.869523,0,7.375809,2.869523,1.0
