In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.optim import SGD
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

seed = 3047
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

In [2]:
!gdown 1o0m3jyfmetUOJ146TqHuEGUWwQyC7JXV
!gdown 1B5OC3R0yM8F7yjoYOKu3t08QZalcr7DC
!gdown 1THvOuf_EOn6c_6TLy0Bqs23BP2NraBR2

Downloading...
From: https://drive.google.com/uc?id=1o0m3jyfmetUOJ146TqHuEGUWwQyC7JXV
To: /content/train.csv
100% 6.54M/6.54M [00:00<00:00, 93.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1B5OC3R0yM8F7yjoYOKu3t08QZalcr7DC
To: /content/val.csv
100% 665k/665k [00:00<00:00, 87.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1THvOuf_EOn6c_6TLy0Bqs23BP2NraBR2
To: /content/X_test
100% 3.57M/3.57M [00:00<00:00, 106MB/s]


In [28]:
class SVM(nn.Module):
  def __init__(self):
    super(SVM, self).__init__() 
    self.w = nn.Parameter(torch.randn((1, 4)).to(torch.float32))
    self.f = nn.Sequential(
                  nn.Linear(43, 4),
                  nn.Dropout(0.5),
                )
  def transform(self, x):
    x = self.f(x)
    return x
  def kernel(self, x):
    pass
  def forward(self, x):
    f = torch.matmul(self.transform(x), self.w.T)
    #f = torch.matmul(x, self.w.T)
    return f

In [29]:
class HingeLoss(nn.Module):
  def __init__(self, C):
    super(HingeLoss, self).__init__()  
    self.C = C
  def forward(self, y, f):
    # define Hinge loss
    hinge_loss = torch.mean(nn.functional.relu(-y*f))
    return hinge_loss


In [5]:
# X = pd.read_csv("X_train.csv")  
# y = pd.read_csv("Y_train.csv", header = None).values.reshape(-1)
# selector = SelectKBest(chi2, k=60)
# selector.fit(X, y)
# cols = selector.get_support(indices=True)
# X_new = X.iloc[:,cols]
# KEYS = X_new.keys()
# KEYS=KEYS.tolist()
# print(KEYS)
# dele = [' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th',' 10th',' 11th',' 12th', ' Bachelors', ' Doctorate', ' HS-grad', ' Masters', ' Preschool', ' Prof-school', ' Some-college' \
#         ,' Divorced', ' Married-civ-spouse', ' Married-spouse-absent', ' Never-married', ' Separated', ' Widowed']
# for word in dele: 
#     KEYS.remove(word)
# KEYS.append('edu_degree')
# KEYS.append('marriage')
KEYS =['age', 'fnlwgt', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', ' Federal-gov', ' Local-gov', ' Private', ' Self-emp-inc', ' Self-emp-not-inc', '?_workclass', ' Adm-clerical', ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners', ' Machine-op-inspct', ' Other-service', ' Priv-house-serv', ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support', ' Transport-moving', '?_occupation', ' Husband', ' Not-in-family', ' Other-relative', ' Own-child', ' Unmarried', ' Wife', ' Amer-Indian-Eskimo', ' Black', ' Other', ' White', ' Columbia', ' Dominican-Republic', ' El-Salvador', ' India', ' Mexico', 'edu_degree', 'marriage']

In [30]:
class TrainDataset(Dataset):
  def __init__(self, split, mu=None, std=None):
    X = pd.read_csv(f"{split}.csv")
    Y = X['y'].values.reshape(-1) * 2 - 1
    X.pop('y')
    X = self.education_redesign(X)
    X = self.marriage_redesign(X)
    X,non,non = self.normalize(X,True,mu,std)
    X = self.selection(X,KEYS)
    X = pd.DataFrame.from_dict({k: X[k] for k in KEYS})
    X = np.concatenate((X, np.ones((X.shape[0], 1))), 1)
    self.Y = torch.from_numpy(Y).to(torch.float32)
    print('Y',self.Y.size())
    self.X = torch.from_numpy(X).to(torch.float32)
    print('X',self.X.size())
  def normalize(self,X,flag=False, mu_x=None, std_x=None):
    #pass
    if flag==False:
        mu_x= []
        std_x =[]
    else:
        count=0
    for key in ['age', 'fnlwgt', 'hours_per_week', 'capital_gain', 'capital_loss',"edu_degree"]:
        X[key]=(X[key]-X[key].mean())/X[key].std()
#         if  flag==False:
#             mu_x.append(X[key].mean())
#             std_x.append(X[key].std())
#             X[key]=(X[key]-X[key].mean())/X[key].std()
#         else:
#             X[key]=(X[key]-mu_x[count])/std_x[count]           
        #count+=1
    return X, mu_x, std_x
  def education_redesign(self,X):
    X["edu_degree"]= X[" HS-grad"]*1+ X[" Bachelors"]*2+ X[" Some-college"]*2+ X[" Assoc-acdm"]*3+ X[" Assoc-voc"]*3 \
    + X[" Masters"]*4+ X[" Prof-school"]*5+ X[" Doctorate"]*5 
    return X
  def marriage_redesign(self,X):
      X["marriage"]= X[' Married-civ-spouse']*1+ X[' Married-spouse-absent']*1+ X[' Married-AF-spouse']*1+ X[" Divorced"]*0+ X[" Separated"]*0 \
      + X[" Widowed"]*0+ X[" Never-married"]*-1 
      return X
  def selection(self,X,keys):
    selX = pd.DataFrame.from_dict({k: X[k] for k in keys})
    return selX
  def __len__(self):
    return self.X.size(0)

  def __getitem__(self, idx):
    return self.X[idx], self.Y[idx]

class TestDataset(Dataset):
  def __init__(self, mu= None, std=None):
    X = pd.read_csv("X_test")
    X = self.education_redesign(X)
    X = self.marriage_redesign(X)
    X = self.normalize(X)
    X = self.selection(X,KEYS)
    X = pd.DataFrame.from_dict({k: X[k] for k in KEYS})
    X = np.concatenate((X, np.ones((X.shape[0], 1))), 1)
    self.X = torch.from_numpy(X).to(torch.float32)

  def normalize(self,X,flag=False):
    #pass
    if flag==False:
        mu_x= []
        std_x =[]
    else:
        count=0
    for key in ['age', 'fnlwgt', 'hours_per_week', 'capital_gain', 'capital_loss',"edu_degree"]:
        X[key]=(X[key]-X[key].mean())/X[key].std()
#         if  flag==False:
#             mu_x.append(X[key].mean())
#             std_x.append(X[key].std())
#             X[key]=(X[key]-X[key].mean())/X[key].std()
#         else:
#             X[key]=(X[key]-mu_x[count])/std_x[count   
        #count+=1
    return X
  def education_redesign(self,X):
    X["edu_degree"]= X[" HS-grad"]*1+ X[" Bachelors"]*2+ X[" Some-college"]*2+ X[" Assoc-acdm"]*3+ X[" Assoc-voc"]*3 \
    + X[" Masters"]*4+ X[" Prof-school"]*5+ X[" Doctorate"]*5 
    return X
  def marriage_redesign(self,X):
      X["marriage"]= X[' Married-civ-spouse']*1+ X[' Married-spouse-absent']*1+ X[' Married-AF-spouse']*1+ X[" Divorced"]*0+ X[" Separated"]*0 \
      + X[" Widowed"]*0+ X[" Never-married"]*-1 
      return X
  def selection(self,X,keys):
    selX = pd.DataFrame.from_dict({k: X[k] for k in keys})
    return selX
  def __len__(self):
    return self.X.size(0)

  def __getitem__(self, idx):
    return self.X[idx]

In [31]:
def train(train_data, val_data, model, optim, C, device='cuda:0'):
    epoch = 20
    objective = HingeLoss(C)
    objective = objective.to(device)
    steps = 0
    best = 0
    preds1 = None

    for e in range(epoch):
      # TODO : try to implement gradient descent
      #acct = []
      for tr in train_data:
        steps += 1
        x_train, y_train = tr
        x_train, y_train = x_train.to(device), y_train.to(device)
        pred = model(x_train).squeeze(1)
        loss = objective(pred, y_train) + 1 / 2 * torch.sum(model.w[:-1] ** 2)
        optim.zero_grad()
        loss.backward()
        optim.step()

        
        if steps % 100 == 0:
          model.eval()
          with torch.no_grad():
            acc = []
            preds = []
            for val in val_data:
              x_val, y_val = val
              x_val , y_val = x_val.to(device), y_val.to(device)
              pred = model(x_val).squeeze(1)
              pred = (pred > 0) * 2 - 1
              result = (y_val.item() == pred)
              #preds += [pred.item()]
              acc += [(float(result.sum()) / result.size(0))]
            #if preds1 != preds:
            #  print('change')
            #preds1 = preds 
            #print(preds)
            acc = max(sum(acc) / len(acc),1-sum(acc) / len(acc))
            #Acct = sum(acct) / len(acct)
            print(f'Steps {steps}| Train Loss = {loss.item()}| Val acc = {acc}')
            if acc > best:
              torch.save(model.state_dict(), 'best_n.ckpt')
              best = acc        
          model.train()
    return model

In [32]:
lr = 0.001
batch = 32
C = 1#1
device = 'cpu'

In [33]:
trainset = TrainDataset('train')
devset = TrainDataset('val')
testset = TestDataset()

  X["edu_degree"]= X[" HS-grad"]*1+ X[" Bachelors"]*2+ X[" Some-college"]*2+ X[" Assoc-acdm"]*3+ X[" Assoc-voc"]*3 \
  X["marriage"]= X[' Married-civ-spouse']*1+ X[' Married-spouse-absent']*1+ X[' Married-AF-spouse']*1+ X[" Divorced"]*0+ X[" Separated"]*0 \


Y torch.Size([29561])
X torch.Size([29561, 43])
Y torch.Size([3000])
X torch.Size([3000, 43])


In [34]:
trainset = TrainDataset('train')
devset = TrainDataset('val')
testset = TestDataset()

train_dataloader = DataLoader(trainset, batch, True, drop_last=False)
val_dataloader = DataLoader(devset, 1, False)
test_dataloader = DataLoader(testset, 1, False)

model = SVM().to(device) 
#model.load_state_dict(torch.load('best_n.ckpt'))
optim = SGD(model.parameters(), lr)
model = train(train_dataloader, val_dataloader, model, optim, C, device)

  X["edu_degree"]= X[" HS-grad"]*1+ X[" Bachelors"]*2+ X[" Some-college"]*2+ X[" Assoc-acdm"]*3+ X[" Assoc-voc"]*3 \
  X["marriage"]= X[' Married-civ-spouse']*1+ X[' Married-spouse-absent']*1+ X[' Married-AF-spouse']*1+ X[" Divorced"]*0+ X[" Separated"]*0 \


Y torch.Size([29561])
X torch.Size([29561, 43])
Y torch.Size([3000])
X torch.Size([3000, 43])
Steps 100| Train Loss = 0.11479207873344421| Val acc = 0.6196666666666667
Steps 200| Train Loss = 0.0829077735543251| Val acc = 0.679
Steps 300| Train Loss = 0.054673463106155396| Val acc = 0.7133333333333334
Steps 400| Train Loss = 0.10333210229873657| Val acc = 0.7376666666666667
Steps 500| Train Loss = 0.034306082874536514| Val acc = 0.749
Steps 600| Train Loss = 0.09487834572792053| Val acc = 0.7613333333333333
Steps 700| Train Loss = 0.07967253774404526| Val acc = 0.7643333333333333
Steps 800| Train Loss = 0.10057765245437622| Val acc = 0.772
Steps 900| Train Loss = 0.0489596351981163| Val acc = 0.7773333333333333
Steps 1000| Train Loss = 0.06218215450644493| Val acc = 0.778
Steps 1100| Train Loss = 0.030485931783914566| Val acc = 0.7813333333333333
Steps 1200| Train Loss = 0.03902481123805046| Val acc = 0.784
Steps 1300| Train Loss = 0.08035322278738022| Val acc = 0.7863333333333333
Step

In [35]:
best_model = model
best_model.load_state_dict(torch.load('best_n.ckpt'))
best_model = best_model.eval()
# TODO: predict x_test
y_test = []
for x in test_dataloader:
  x = x.to(device)
  y = best_model(x)
  y_test.append(((y > 0) * 1).item())



In [36]:
import csv
with open('predict_4.csv', 'w', newline='') as csvf:
    # 建立 CSV 檔寫入器
    writer = csv.writer(csvf)
    writer.writerow(['id','label'])
    for i in range(len(y_test)):
      writer.writerow( [i + 1, int(y_test[i])] )