In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import random
import torch.nn.functional as F
import copy

In [1]:
import pandas as pd

data = pd.read_csv('train.csv')
data['SalePrice'].describe()

D:\Anaconda\envs\harryqi\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
D:\Anaconda\envs\harryqi\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll


count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [3]:
class FcNet(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, num_classes, dropout=0.2):
        super(FcNet, self).__init__()
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(embedding_size*74, hidden_size) # 74 feature num
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.act_func = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = self.embedding(x)
        out = out.view(-1, 22*self.embedding_size)
        out = self.fc1(out)
        out = self.act_func(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.act_func(out)
        out = self.dropout(out)
        out = self.fc3(out)
        out = F.softmax(out, dim=-1)
        return out

In [14]:
def load_data(x_path, y_path=None):
    x_df = pd.read_csv(x_path)
    X = []
    for idx, row in x_df.iterrows():
        # if idx == 1000:
        #     break
        feature = []
        for col_key in row.keys():
            if col_key != "index":
                feature.append(row[col_key])
        X.append(feature)
    if y_path is None:
        return X, None

    y_df = pd.read_csv(y_path)
    Y = []
    for idx, row in y_df.iterrows():
        # if idx == 1000:
        #     break
        Y.append(row["SalePrice"])
    return X, Y

In [5]:
def word_count(X):
    dic = {}
    for x in X:
        for word in x:
            if(word in dic):
                dic[word] += 1
            else:
                dic[word] = 1
    word_count_sorted = sorted(dic.items(), key=lambda item:item[1], reverse=True)
    return  word_count_sorted

In [6]:
def word_index(X, vocab_size):
    word_count_sorted = word_count(X)
    word2index = {}
    word2index["<unk>"] = 0
    word2index["<pad>"] = 1
    vocab_size = min(len(word_count_sorted), vocab_size)
    for i in range(vocab_size):
        word = word_count_sorted[i][0]
        word2index[word] = i + 2
    return word2index, vocab_size + 2

In [7]:
def get_feature(X, word2index):
    features = []
    for x in X:
        feature = []
        for word in x:
            if word in word2index:
                feature.append(word2index[word])
            else:
                feature.append(word2index["<unk>"])
        features.append(feature)
    return features

In [8]:
def create_dataset(X, Y=None, batch_size=128, do_shuffle=True):
    num_batchs = int((len(X) - 1) / batch_size) + 1
    indices = [idx for idx in range(len(X))]
    if do_shuffle:
        random.shuffle(indices)
    batchs = []
    for batch_num in range(num_batchs):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, len(X))
        batch_x = []
        batch_y = []
        for idx in range(start_index, end_index):
            batch_x.append(X[indices[idx]])
            if Y is not None:
                label = [0, 0]
                label[Y[indices[idx]]] = 1 # one hot label
                batch_y.append(label)
        batchs.append((batch_x, batch_y))
    return batchs

In [9]:
def split_dev(dataset):
    train_set = []
    dev_set = []
    for data in dataset:
        if random.random() > 0.2:
            train_set.append(data)
        else:
            dev_set.append(data)
    return train_set, dev_set

In [10]:
def test(model, dev_set, loss_func):
    loss_val = 0.
    data_size = 0
    corrects = 0
    for batch_x, batch_y in train_set:
        datas = torch.LongTensor(batch_x)
        labels = torch.FloatTensor(batch_y)

        preds = model(datas)
        loss = loss_func(preds, labels)

        loss_val += loss.item() * datas.size(0)
        data_size += datas.size(0)
        
        preds = torch.argmax(preds, dim=1)
        labels = torch.argmax(labels, dim=1)
        corrects += torch.sum(preds == labels).item()

    dev_loss = loss_val / (data_size + (1e-10))
    dev_acc = corrects / (data_size + (1e-10))
    print("Dev Loss: {}, Dev Acc: {}".format(dev_loss, dev_acc))
    return dev_acc

In [11]:
def train(train_set, dev_set, vocab_size):
    model = FcNet(vocab_size, 16, 32, 2)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)
    loss_func = nn.BCELoss()

    best_val_acc = 0.
    best_model_params = copy.deepcopy(model.state_dict())
    for epoch in range(1000):
        loss_val = 0.
        data_size = 0
        corrects = 0
        for batch_x, batch_y in train_set:
            datas = torch.LongTensor(batch_x)
            labels = torch.FloatTensor(batch_y)

            preds = model(datas)
            loss = loss_func(preds, labels)
            l2_lambda = 0.0001
            l2_reg = torch.tensor(0.)
            for param in model.parameters():
                l2_reg += torch.norm(param)
            loss += l2_lambda * l2_reg

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            loss_val += loss.item() * datas.size(0)
            data_size += datas.size(0)
            
            preds = torch.argmax(preds, dim=1)
            labels = torch.argmax(labels, dim=1)
            corrects += torch.sum(preds == labels).item()

        train_loss = loss_val / (data_size + (1e-10))
        train_acc = corrects / (data_size + (1e-10))
        if epoch % 100 == 0:
            print("Train Loss: {}, Train Acc: {}".format(train_loss, train_acc))
            test_acc = test(model, dev_set, loss_func)
            if(best_val_acc < test_acc):
                best_val_acc = test_acc
                best_model_params = copy.deepcopy(model.state_dict()) # 更新最优参数
    model.load_state_dict(best_model_params)
    return model

In [12]:
def infer(model, test_set):
    results = [] 
    for batch_x, _ in test_set:
        datas = torch.LongTensor(batch_x)
        preds = model(datas)
        preds = torch.argmax(preds, dim=1)
        results += preds.tolist()
    return results

def post_process(file_path, results):
    df = pd.read_csv(file_path)
    df["pred"] = results
    df.to_csv("./results.csv")

In [15]:
if __name__ == "__main__":
    do_train = True
    if do_train:
        X, Y = load_data("train_x.csv", "train_y.csv")
        word2index, vocab_size = word_index(X, 2000)
        X = get_feature(X, word2index)
        dataset = create_dataset(X, Y, 64)
        train_set, dev_set = split_dev(dataset)
        model = train(train_set, dev_set, vocab_size)
        torch.save(model.state_dict(), "./fc_net.pkl")
        print("train_size: ", len(train_set))
        print("dev size: ", len(dev_set))
    else:
        X_train, _ = load_data("./x_train.csv")
        word2index, vocab_size = word_index(X_train, 2000)
        X_test, _ = load_data("./test_x.csv")
        X_test = get_feature(X_test, word2index)
        test_set = create_dataset(X_test, do_shuffle=False)
        model = FcNet(vocab_size, 64, 128, 2)
        model.load_state_dict(torch.load("./fc_net.pkl"))
        results = infer(model, test_set)
        post_process("./test_x.csv", results)

IndexError: list assignment index out of range