In [28]:
import arff
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os

# Transform ARFF to CSV

In [29]:
def load_arff(arff_file, n_label):
    """Read an ARFF file into a pandas DataFrame.

    Parameters
    ----------
    arff_file : str
        Path of the ARFF file; it is opened in text mode.
    n_label : int
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        Built from the parsed ``'data'`` entry, with columns named after the
        first element of each item in the parsed ``'attributes'`` entry.
    """
    with open(arff_file, 'r') as handle:
        parsed = arff.load(handle)

    # Each attribute entry is indexable; its first element is the column name.
    column_names = []
    for attribute in parsed['attributes']:
        column_names.append(attribute[0])

    return pd.DataFrame(parsed['data'], columns=column_names)

def get_Xy(data, n_label):
    """Split a table into feature columns and trailing label columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last ``n_label`` columns are the labels.
    n_label : int
        Number of trailing label columns (expected to be >= 0).

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        ``X`` holds every column before the split point, ``y`` the last
        ``n_label`` columns. With ``n_label == 0`` ``X`` is the whole table
        and ``y`` has no columns; if ``n_label`` exceeds the number of
        columns, ``y`` is the whole table and ``X`` has no columns.
    """
    # Use an explicit split index rather than negative slicing: `-0` equals
    # `0`, so `iloc[:, -n_label:]` would return ALL columns as labels (and
    # `iloc[:, :-n_label]` no features) when n_label is 0.
    split = max(data.shape[1] - n_label, 0)
    X = data.iloc[:, :split]
    y = data.iloc[:, split:]

    return X, y

# get label
def get_label(xml_file):
    """Collect the label names declared in an XML file.

    Parameters
    ----------
    xml_file : str
        Path of the XML file; it is read in text mode and parsed with
        BeautifulSoup using the ``'lxml'`` parser.

    Returns
    -------
    (n_label, label_list) : (int, list)
        The number of ``label`` elements found and the value of each
        element's ``name`` attribute, in document order.
    """
    with open(xml_file, 'r') as handle:
        markup = handle.read()

    soup = BeautifulSoup(markup, 'lxml')
    label_tags = soup.find_all('label')

    label_list = [tag.attrs['name'] for tag in label_tags]
    n_label = len(label_tags)

    return n_label, label_list

In [42]:
# Convert one dataset: read its label definitions and ARFF table, then write
# the full table, the feature columns and the label columns as CSV files
# inside the dataset's own directory.
fpath = "/Volumes/Samsung_T5/research/data/large_datasets/"
dataset = 'rcv1subset1'
dataset_dir = fpath + dataset + "/"

n_label, label_list = get_label(dataset_dir + dataset + ".xml")
df = load_arff(dataset_dir + dataset + ".arff", n_label)
X, y = get_Xy(df, n_label)

# Same three outputs, in the same order: <dataset>.csv, X.csv, y.csv
for frame, csv_name in ((df, dataset + ".csv"), (X, "X.csv"), (y, "y.csv")):
    frame.to_csv(dataset_dir + csv_name, index=False)

In [15]:
# Convert each listed dataset: read its label definitions and ARFF table,
# then write <dataset>.csv, X.csv and y.csv inside the dataset's directory.
fpath = "/Volumes/Samsung_T5/research/data/large_datasets/"
datasets_to_convert = ['rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5','tmc2007','mediamill']

for dataset in datasets_to_convert:
    dataset_dir = fpath + dataset + "/"

    n_label, label_list = get_label(dataset_dir + dataset + ".xml")
    df = load_arff(dataset_dir + dataset + ".arff", n_label)
    X, y = get_Xy(df, n_label)

    for frame, csv_name in ((df, dataset + ".csv"), (X, "X.csv"), (y, "y.csv")):
        frame.to_csv(dataset_dir + csv_name, index=False)

# feature selection

In [73]:
def read_data(dataPath):
    """Load the feature and label tables stored in a dataset directory.

    Parameters
    ----------
    dataPath : str
        Directory containing ``X.csv`` and ``y.csv``, e.g.
        '/Volumes/Samsung_T5/research/data/ABC_news_data/obesity/'.

    Returns
    -------
    (data, label) : tuple of pandas.DataFrame
        Contents of ``X.csv`` and ``y.csv`` respectively.
    """
    features_csv, labels_csv = (
        os.path.join(dataPath, file_name) for file_name in ('X.csv', 'y.csv')
    )
    data = pd.read_csv(features_csv)
    label = pd.read_csv(labels_csv)
    return data, label

In [186]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

def _top_scoring_columns(label_scores, num_features):
    """Return sorted column positions of the `num_features` best MaxCS scores."""
    scores = np.array(label_scores, dtype=float)  # shape: (labels, features)
    scores[np.isnan(scores)] = 0                  # NaN scores count as 0
    max_scores = scores.max(axis=0)               # MaxCS: best score over labels

    # Never ask for more columns than exist (the original indexing raised
    # IndexError when the data had <= num_features features).
    k = min(num_features, max_scores.shape[0])

    # A stable sort makes ties deterministic: among equal scores the earlier
    # column wins, so exactly k columns are always returned.
    ranked = np.argsort(-max_scores, kind='stable')
    return np.sort(ranked[:k])  # restore the original column order


def feature_selection(data_path, num_features=1500):
    """Keep the top chi-squared features of a dataset and save them as CSV.

    Reads ``X.csv`` / ``y.csv`` from ``data_path`` via ``read_data``, scores
    every feature against each label column with ``chi2``, ranks features by
    their maximum score over all labels (MaxCS) and writes the selected
    columns of ``X`` to ``X_<num_features>.csv`` in the same directory.

    Parameters
    ----------
    data_path : str
        Directory containing ``X.csv`` and ``y.csv``; the output is written
        there as well.
    num_features : int, optional
        Number of features to keep (default 1500, which produces
        ``X_1500.csv``). If the data has fewer features, all are kept.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``num_features`` is negative or ``y`` has no label columns.
    """
    if num_features < 0:
        raise ValueError("num_features must be non-negative")

    print(data_path)
    X, y = read_data(data_path)  # read data
    print(X.shape)
    print(y.shape)

    if len(y.columns) == 0:
        raise ValueError("y has no label columns to score features against")

    # One row of chi-squared scores per label column.
    label_scores = []
    for label in y.columns:
        selector = SelectKBest(chi2, k='all')
        selector.fit(X, y[label])
        label_scores.append(selector.scores_)

    # MaxCS ranking (a MeanCS variant would average the rows instead).
    keep = _top_scoring_columns(label_scores, num_features)

    # Positional selection, so duplicate column names cannot pull in extras.
    X_selected = X.iloc[:, keep]
    out_name = "X_{}.csv".format(num_features)
    X_selected.to_csv(os.path.join(data_path, out_name), index=False)
    
    

In [187]:
# Run feature selection on every listed dataset directory.
fpath = "/Volumes/Samsung_T5/research/data/large_datasets/"
datasets_to_select = ['rcv1subset2','rcv1subset3','rcv1subset4','rcv1subset5','tmc2007']

for dataset in datasets_to_select:
    data_path = fpath + dataset
    feature_selection(data_path)

/Volumes/Samsung_T5/research/data/large_datasets/rcv1subset2
(6000, 47236)
(6000, 101)
/Volumes/Samsung_T5/research/data/large_datasets/rcv1subset3
(6000, 47236)
(6000, 101)
/Volumes/Samsung_T5/research/data/large_datasets/rcv1subset5
(6000, 47235)
(6000, 101)
/Volumes/Samsung_T5/research/data/large_datasets/tmc2007
(28596, 49060)
(28596, 22)


In [188]:
# Run feature selection on a single dataset directory.
fpath = "/Volumes/Samsung_T5/research/data/large_datasets/"
dataset = 'rcv1subset4'
data_path = fpath + dataset
feature_selection(data_path)

/Volumes/Samsung_T5/research/data/large_datasets/rcv1subset4
(6000, 47229)
(6000, 101)
