In [1]:
import numpy as np
import os
import pandas as pd
from sklearn import svm
import time

# Input locations, named once so they are easy to spot and change.
GENOTYPE_PATH = '/datasets/cs284-sp21-A00-public/ps2/precomputed/ps2_pca.genotypes.tab'
POPULATION_LABEL_PATH = 'pop_6.npy'

# Genotype matrix: (28622 SNPs, 2504 people)
geno = np.loadtxt(GENOTYPE_PATH)
# Labels loaded from pop_6.npy; sliced below in step with the columns of `geno`.
pop_6 = np.load(POPULATION_LABEL_PATH)

In [2]:
# Positional hold-out split: the first `n_train` columns (people) are used for
# training, the remaining columns for testing. Labels are sliced the same way.
n_train = 2000
train, test = geno[:, :n_train], geno[:, n_train:]
train_Y, test_Y = pop_6[:n_train], pop_6[n_train:]

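The dataset is split roughly 0.8/0.2 into training and test sets; other classification algorithms could be tried here as well. Running directly on the full dataset takes a very long time (possibly because of DataHub or the SVM itself).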

In [3]:
def simple_SVM(X,Y,test_X,test_Y):
    """Fit a linear-kernel ``svm.SVC`` on (X, Y) and print a short report.

    Three lines are printed: the wall-clock seconds spent building and
    fitting the classifier, then the value of ``score`` on the training
    data and on the held-out data. Nothing is returned.

    Parameters
    ----------
    X, Y : training samples and their labels, passed to ``fit``/``score``.
    test_X, test_Y : held-out samples and labels, passed to ``score`` only.
    """
    fit_started = time.time()

    classifier = svm.SVC(kernel='linear')
    classifier.fit(X, Y)

    # Only construction + fit are timed; scoring happens after the timer is read.
    elapsed = time.time() - fit_started
    print('Train time:', elapsed)

    report = (
        ('train acc:', X, Y),
        ('test acc:', test_X, test_Y),
    )
    for caption, samples, labels in report:
        print(caption, classifier.score(samples, labels))

In [4]:
# simple_SVM(train.T,train_Y,test.T,test_Y)

### chi2 selection

In [5]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Score every SNP with a chi-squared test fitted on the training samples only,
# keep the 5000 best-scoring rows, then train/evaluate the SVM on that subset.
time1 = time.time()
chi2_selection = SelectKBest(chi2, k=5000)
chi2_selection.fit(train.T, np.array(train_Y))
X_select_index = chi2_selection.get_support(indices=True)
print('Time:',time.time()-time1)

train_selected = train[X_select_index,:].T
test_selected = test[X_select_index,:].T
simple_SVM(train_selected,train_Y,test_selected,test_Y)

Time: 1.5748379230499268
Train time: 4.332159519195557
train acc: 1.0
test acc: 0.9027777777777778


### pca comparison

In [6]:
from sklearn.decomposition import PCA

# Baseline: reduce all SNPs to 4 principal components. The projection is
# fitted on the training samples and then applied unchanged to the test samples.
time1 = time.time()

train_samples, test_samples = train.T, test.T
pca = PCA(n_components=4)
pca.fit(train_samples)
train_pca = pca.transform(train_samples)
test_pca = pca.transform(test_samples)

print('pca time:',time.time()-time1)

simple_SVM(train_pca,train_Y,test_pca,test_Y)

pca time: 34.71433424949646
Train time: 0.14259600639343262
train acc: 0.9555
test acc: 0.9226190476190477


In [7]:
# chi2 pre-filtering followed by PCA: for each number of retained SNPs, time the
# preprocessing (selection + projection) and report the SVM accuracy on 4 PCs.
for thres in (5000, 10000, 15000):
    time1 = time.time()

    chi2_selection = SelectKBest(chi2, k=thres)
    chi2_selection.fit(train.T, np.array(train_Y))
    X_select_index = chi2_selection.get_support(indices=True)

    # Same row subset for both splits, transposed to samples x features.
    train_selected = train[X_select_index].T
    test_selected = test[X_select_index].T

    pca = PCA(n_components=4)
    pca.fit(train_selected)
    train_pca = pca.transform(train_selected)
    test_pca = pca.transform(test_selected)

    print('preprocess time:',time.time()-time1)

    simple_SVM(train_pca,train_Y,test_pca,test_Y)

preprocess time: 10.013997554779053
Train time: 0.24165749549865723
train acc: 0.9495
test acc: 0.9126984126984127
preprocess time: 15.121902465820312
Train time: 0.14634490013122559
train acc: 0.9525
test acc: 0.9206349206349206
preprocess time: 31.267665147781372
Train time: 0.222670316696167
train acc: 0.9535
test acc: 0.9186507936507936


### InfoGain

In [None]:
from collections import defaultdict
from math import log

def calcShannonEnt(Y):
    """Return the Shannon entropy, in bits, of the labels in ``Y``.

    Parameters
    ----------
    Y : sized iterable of hashable labels.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is each
        label's relative frequency. An empty ``Y`` yields ``0.0``.
    """
    numEntries = len(Y)
    labelCounts = defaultdict(int)
    for label in Y:
        labelCounts[label] += 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

def calcInfoGain(dataset,Y):
    """Score every feature (row) of ``dataset`` by its information gain ratio.

    Despite the name, the value computed per row is the gain *ratio*:
    ``(H(Y) - H(Y | feature)) / split_info(feature)``.

    Parameters
    ----------
    dataset : 2-D array-like, one row per feature and one column per sample.
        Entries are truncated to integers and must then be 0, 1 or 2.
    Y : array-like of labels, indexed by sample (column) position.

    Returns
    -------
    list of float
        Exactly one score per row of ``dataset``, in row order, so that
        ``np.argsort`` of the result yields valid row indices. A feature that
        takes a single value across all samples has zero split information
        and is scored ``0.0``.

    Raises
    ------
    ValueError
        If a row contains a code outside {0, 1, 2}.
    """
    dataset = np.asarray(dataset)
    Y = np.asarray(Y)
    numSamples = dataset.shape[1]
    baseEntropy = calcShannonEnt(Y)
    InfoGain_list = []
    for i in range(dataset.shape[0]):
        # Truncate toward zero, like int() on each entry.
        codes = dataset[i].astype(int)
        if codes.size and (codes.min() < 0 or codes.max() > 2):
            # Negative codes used to wrap around to bucket 2 via negative list
            # indexing; reject them explicitly instead of mis-bucketing.
            raise ValueError(
                'row %d contains a genotype code outside {0, 1, 2}' % i)
        newEntropy = 0.0
        splitInfo = 0.0
        for value in (0, 1, 2):
            # Ascending sample positions that carry this genotype value.
            members = np.flatnonzero(codes == value)
            if members.size == 0:
                continue
            prob = members.size / float(numSamples)
            newEntropy += prob * calcShannonEnt(Y[members])
            splitInfo += -prob * log(prob, 2)
        if splitInfo == 0:
            # Constant feature: no split, hence no gain. It must still occupy
            # its slot, otherwise every later score shifts and the positions
            # no longer correspond to row indices of ``dataset``.
            InfoGain_list.append(0.0)
            continue
        infoGain = baseEntropy - newEntropy
        InfoGain_list.append(infoGain / splitInfo)
    return InfoGain_list

In [None]:
# Score the training rows, then sort positions by ascending score so the
# highest-scoring entries sit at the end of `rank_infoGain`.
InfoGain_list = calcInfoGain(train,train_Y)
rank_infoGain = np.argsort(np.array(InfoGain_list))

In [None]:
# Train/evaluate the SVM on the i top-ranked rows (the tail of the ascending ranking).
for i in (100, 500, 1000, 5000, 10000):
    top_rows = rank_infoGain[-i:]
    simple_SVM(train[top_rows,:].T,train_Y,test[top_rows,:].T,test_Y)

## Random selection can even work well!!!

In [None]:
import random

# Draw 5000 distinct SNP rows uniformly at random as a baseline.
# The population is taken from the data itself rather than a hard-coded 28622,
# so the cell keeps working if the genotype matrix changes size. A dedicated,
# seeded generator makes the reported accuracy reproducible without touching
# the global `random` state; change the seed to look at a different draw.
rl = random.Random(0).sample(range(train.shape[0]), 5000)
simple_SVM(train[rl,:].T,train_Y,test[rl,:].T,test_Y)