In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn

torch.manual_seed(0)

<torch._C.Generator at 0x1257b2650>

In [2]:
data = pd.read_csv("YearPredictionMSD.csv", nrows = 50000, header = None)
print(data.head())
print(data.shape)

     0         1         2         3         4         5         6         7   \
0  2001  49.94357  21.47114  73.07750   8.74861 -17.40628 -13.09905 -25.01202   
1  2001  48.73215  18.42930  70.32679  12.94636 -10.32437 -24.83777   8.76630   
2  2001  50.95714  31.85602  55.81851  13.41693  -6.57898 -18.54940  -3.27872   
3  2001  48.24750  -1.89837  36.29772   2.58776   0.97170 -26.21683   5.05097   
4  2001  50.97020  42.20998  67.09964   8.46791 -15.85279 -16.81409 -12.48207   

         8         9   ...        81         82        83        84        85  \
0 -12.23257   7.83089  ...  13.01620  -54.40548  58.99367  15.37344   1.11144   
1  -0.92019  18.76548  ...   5.66812  -19.68073  33.04964  42.87836  -9.90378   
2  -2.35035  16.07017  ...   3.03800   26.05866 -50.92779  10.93792  -0.07568   
3 -10.34124   3.55005  ...  34.57337 -171.70734 -16.96705 -46.67617 -12.51516   
4  -9.37636  12.63699  ...   9.92661  -55.95724  64.92712 -17.72522  -1.49237   

         86         87    

In [3]:
cols = data.columns  # data數據集的所有行數
num_cols = data._get_numeric_data().columns  # data數據集中所有為數值型態的行數
print(list(set(cols) - set(num_cols)))

[]


In [4]:
print(data.isnull().sum().sum())

0


In [5]:
outliers = []
for i in range(data.shape[1]):
    min_t = data[data.columns[i]].mean() - (3 * data[data.columns[i]].std())
    max_t = data[data.columns[i]].mean() + (3 * data[data.columns[i]].std())
    count = 0
    for j in data[data.columns[i]]:
        if j < min_t or j > max_t:
            count += 1
    percentage = count / data.shape[0]
    #print(i, percentage)
    if percentage > 0.05:
        outliers.append(i)

print(outliers)

[]


In [6]:
x = data.iloc[:,2:]
y = data.iloc[:,0]

In [7]:
x = (x - x.mean()) / x.std()
print(x.head())

         2         3         4         5         6         7         8   \
0  0.382437  1.841985  0.459652 -0.480074 -0.282606 -1.590785 -1.300854   
1  0.321953  1.763666  0.717085 -0.165507 -1.188896  0.777905  0.122576   
2  0.588929  1.350579  0.745944  0.000857 -0.703401 -0.066747 -0.057380   
3 -0.082240  0.794774  0.081829  0.336246 -1.295366  0.517369 -1.062869   
4  0.794806  1.671781  0.442438 -0.411071 -0.569426 -0.712128 -0.941459   

         9         10        11  ...        81        82        83        84  \
0  0.378336 -0.683719  0.791667  ... -0.086005  0.099339  0.148291 -0.255625   
1  1.420531  0.401198  0.541262  ... -0.316635  0.301448 -0.063611  0.031855   
2  1.163637 -0.090081  0.658570  ... -0.399185  0.567666 -0.749508 -0.301984   
3 -0.029679 -1.282306  1.543411  ...  0.590596 -0.583396 -0.472129 -0.904164   
4  0.836414 -0.160630  0.402680  ... -0.182976  0.090307  0.196753 -0.601570   

         85        86        87        88        89        90  
0  0

In [8]:
# 拆分數據成 2 個子集，x_new: x_test = 70: 30
x_new, x_test, y_new, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

# 再拆分數據集 x_new 成 2 個子集, x_train: x_dev = 75: 25
x_train, x_dev, y_train, y_dev = train_test_split(x_new, y_new, test_size = 0.25, random_state = 0)

print(x_train.shape, x_dev.shape, x_test.shape)

(26250, 89) (8750, 89) (15000, 89)


In [9]:
x_train_torch = torch.tensor(x_train.values).float()
y_train_torch = torch.tensor(y_train.values).float().unsqueeze(1)
x_dev_torch = torch.tensor(x_dev.values).float()
y_dev_torch = torch.tensor(y_dev.values).float().unsqueeze(1)
x_test_torch = torch.tensor(x_test.values).float()
y_test_torch = torch.tensor(y_test.values).float().unsqueeze(1)

print(x_train_torch.shape, y_train_torch.shape)

torch.Size([26250, 89]) torch.Size([26250, 1])


In [10]:
model = nn.Sequential(
    nn.Linear(x_train.shape[1], 200),
    nn.ReLU(),
    nn.Linear(200, 50),
    nn.ReLU(),
    nn.Linear(50, 25),
    nn.ReLU(),
    nn.Linear(25, 1)
)

In [11]:
device = "mps"
model = model.to(device)
x_train_torch = x_train_torch.to(device)
y_train_torch = y_train_torch.to(device)
x_dev_torch = x_dev_torch.to(device)
y_dev_torch = y_dev_torch.to(device)

In [12]:
myloss = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

In [13]:
epochs = 8001
for epoch in range(epochs):
    model.train()
    y_pred = model(x_train_torch)
    train_loss = myloss(y_pred, y_train_torch)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    if epoch % 400 == 0:
        with torch.no_grad():
            model.eval()
            y_pred2 = model(x_dev_torch)
            valid_loss = myloss(y_pred2, y_dev_torch)

        print(f"epoch = {epoch}, train_loss: {train_loss.item():.3f}, valid_loss: {valid_loss.item():.3f}")

        if train_loss.item() < 81:
            break


epoch = 0, train_loss: 3993293.750, valid_loss: 3993412.250
epoch = 400, train_loss: 169243.766, valid_loss: 181055.047
epoch = 800, train_loss: 13761.353, valid_loss: 22696.143
epoch = 1200, train_loss: 1468.064, valid_loss: 4172.907
epoch = 1600, train_loss: 496.691, valid_loss: 1878.264
epoch = 2000, train_loss: 245.756, valid_loss: 1048.438
epoch = 2400, train_loss: 145.169, valid_loss: 644.123
epoch = 2800, train_loss: 104.798, valid_loss: 464.467
epoch = 3200, train_loss: 107.529, valid_loss: 429.766
epoch = 3600, train_loss: 86.021, valid_loss: 369.196
epoch = 4000, train_loss: 82.220, valid_loss: 348.441
epoch = 4400, train_loss: 79.452, valid_loss: 333.373


In [14]:
model = model.to("cpu")
pred = model(x_test_torch)
test_loss = myloss(pred,y_test_torch)
print(f"test_loss: {test_loss.item():.3f}")

test_loss: 486.630


In [15]:
for i in range(100,110):
    print(f"truth: {y_test_torch[i].item():.0f}, pred: {pred[i].item():.0f}")

truth: 2006, pred: 2000
truth: 2003, pred: 1998
truth: 2002, pred: 1998
truth: 2005, pred: 2149
truth: 2007, pred: 2000
truth: 2008, pred: 1994
truth: 2001, pred: 2007
truth: 2006, pred: 1998
truth: 2008, pred: 1999
truth: 1966, pred: 1992


In [16]:
def accuracy():
    temp = 0
    for i in range(len(pred)):
        truth_value = y_test_torch[i].item()
        if truth_value - 2 <= pred[i].item() <= truth_value + 2:
            temp += 1
            
    print(f"Accuracy: {round(temp / 5000, 2)}")

accuracy()

Accuracy: 0.54
