### 基于逻辑回归与词袋模式的文本分类器

In [49]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)

<torch._C.Generator at 0x1bcb96ba890>

In [50]:
lin = nn.Linear(5,3)
data = torch.randn(2,5)
data

tensor([[-1.1948,  0.0250, -0.7627,  1.3969, -0.3245],
        [ 0.2879,  1.0579,  0.9621,  0.3935,  1.1322]])

In [51]:
lin(data)

tensor([[ 0.1755, -0.3268, -0.5069],
        [-0.6602,  0.2260,  0.1089]], grad_fn=<AddmmBackward>)

In [52]:
data = torch.randn(2,2)
print(data)

tensor([[-0.5404, -2.2102],
        [ 2.1130, -0.0040]])


### 1.3softmax 函数的利用率

In [53]:
F.softmax(data)

  F.softmax(data)


tensor([[0.8416, 0.1584],
        [0.8925, 0.1075]])

In [54]:
F.softmax(data,dim =0)  #dim =0 是对列进行归一，dim = 1是对行进行归一化

tensor([[0.0658, 0.0992],
        [0.9342, 0.9008]])

In [55]:
F.log_softmax(data,dim=0)


tensor([[-2.7214, -2.3107],
        [-0.0680, -0.1045]])

### 1.4目标函数

#训练数据data-->前向传播(linear relu 层 网络结构)--> y predict-->y truth 损失 loss/交叉熵
                                                                            <--反向传播

### 词袋模型

In [57]:
data = [("me guesta comer en la cafeteria".split(),"SPANISH"),
       ("Give it to me".split() ,"ENGLISH"),
        ("No creo que ssea una buena idea".split(),"SPANISH"),
       ("No it is not a good idea to get lost at sea","ENGLISH")
       ]
test_data = [("Yo creo que si".split(),"SPANISH"),
            ("it is lost on me ".split(),"ENGLISH")]

In [58]:
"me guesta comer en la cafeteria".split()

['me', 'guesta', 'comer', 'en', 'la', 'cafeteria']

In [59]:
word_to_ix ={ }
for sent,_ in data+test_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)

{'me': 0, 'guesta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'ssea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'N': 16, 'o': 17, ' ': 18, 'i': 19, 't': 20, 's': 21, 'n': 22, 'a': 23, 'g': 24, 'd': 25, 'e': 26, 'l': 27, 'Yo': 28, 'si': 29, 'is': 30, 'lost': 31, 'on': 32}


In [61]:
VOCAB_SIZE=len(word_to_ix)
NUM_LABELS =2
model = BoWClassfier(VOCAB_SIZE,NUM_LABELS)
for parm in model.parameters():
    print(parm)

Parameter containing:
tensor([[-0.0924,  0.0512, -0.0503, -0.0191, -0.1674, -0.0830,  0.0945, -0.0423,
          0.1734,  0.1395, -0.0082, -0.1162,  0.1060,  0.0540, -0.1125,  0.1131,
          0.1057,  0.1544, -0.0976, -0.0287, -0.0034,  0.0254, -0.1321, -0.1235,
          0.0947, -0.0408,  0.0850,  0.0099,  0.0572,  0.0383,  0.0633,  0.0863,
         -0.1612],
        [ 0.0876, -0.1224, -0.1313,  0.0106, -0.0297,  0.1022, -0.1008, -0.1548,
          0.1267, -0.0258,  0.0979,  0.0560, -0.1305,  0.0350,  0.0418, -0.1166,
         -0.0826,  0.0594,  0.0312, -0.0740, -0.0527,  0.1594, -0.0322,  0.0981,
          0.0754, -0.1125, -0.1480,  0.1671,  0.0091,  0.1193,  0.0361,  0.0560,
          0.1300]], requires_grad=True)
Parameter containing:
tensor([ 0.1651, -0.1155], requires_grad=True)


In [62]:
model

BoWClassfier(
  (linear): Linear(in_features=33, out_features=2, bias=True)
)

In [67]:
class BoWClassifier(nn.Module):  # inheriting from nn.Module!

    def __init__(self, num_labels, vocab_size):
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super(BoWClassifier, self).__init__()

        # Define the parameters that you will need.  In this case, we need A and b,
        # the parameters of the affine mapping.
        # Torch defines nn.Linear(), which provides the affine map.
        # Make sure you understand why the input dimension is vocab_size
        # and the output is num_labels!
        self.linear = nn.Linear(vocab_size, num_labels)

        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here

    def forward(self, bow_vec):
        # Pass the input through the linear layer,
        # then pass that through log_softmax.
        # Many non-linearities and other functions are in torch.nn.functional
        return F.log_softmax(self.linear(bow_vec), dim=1)

def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence:
        vec[word_to_ix[word]] += 1
    return vec.view(1, -1)

def make_target(label, label_to_ix):
    return torch.LongTensor([label_to_ix[label]])

model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)


In [72]:
model

BoWClassifier(
  (linear): Linear(in_features=33, out_features=2, bias=True)
)

In [73]:

with torch.no_grad():
    sample = data[0]
    bow_vector = make_bow_vector(sample[0],word_to_ix)
    log_probs = model(bow_vector)
    print(log_probs)

tensor([[-0.8348, -0.5691]])


In [76]:
label_to_ix = {"SPANISH": 0, "ENGLISH": 1}

In [81]:
#模型 训练
optimizer = optim.SGD(model.parameters(),lr =0.1)
loss_function = nn.NLLLoss()

for epoch in range(100):
    for instance ,label in data:
        model.zero_grad()
        
        bow_vec = make_bow_vector(instance,word_to_ix)
        target = make_target(label,label_to_ix)
       
        log_probs = model(bow_vec)
        
        loss = loss_function(log_probs,target)
        loss.backward()
        optimizer.step()
        
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, word_to_ix)
        log_probs = model(bow_vec)
        print(log_probs)

# 对应西班牙语的指数上升，英语下降！
print(next(model.parameters())[:, word_to_ix["creo"]])

tensor([[-0.2923, -1.3724]])
tensor([[-1.6829, -0.2056]])
tensor([ 0.4375, -0.4192], grad_fn=<SelectBackward>)


In [82]:
with torch.no_grad():
    sample = test_data[0]
    bow_vec = make_bow_vector(sample[0],word_to_ix)
    log_probs = model(bow_vec)
    print(log_probs)

tensor([[-0.2923, -1.3724]])


In [83]:
print(next(model.parameters()))[:,word_to_ix['creo']]

Parameter containing:
tensor([[-0.1691,  0.6353,  0.6315,  0.6136,  0.3794,  0.3801, -0.8016, -0.6745,
         -0.5228,  0.2348,  0.4375,  0.2484,  0.1811,  0.3078,  0.4451,  0.5064,
         -0.2046, -0.5414, -0.9116, -0.1742, -0.4707, -0.1659,  0.0136, -0.3412,
         -0.3121, -0.3392, -0.3694, -0.1168, -0.1067,  0.0637,  0.0539, -0.0394,
          0.0669],
        [ 0.2472, -0.3991, -0.3882, -0.5644, -0.3353, -0.5255,  0.6903,  0.5320,
          0.5843, -0.4465, -0.4192, -0.2210, -0.4017, -0.4749, -0.2778, -0.2891,
          0.2093,  0.4891,  1.0996,  0.2262,  0.5401,  0.1509,  0.0172,  0.3328,
          0.2675,  0.2249,  0.3326,  0.2703,  0.1697,  0.1187,  0.0055, -0.1204,
          0.1360]], requires_grad=True)


TypeError: 'NoneType' object is not subscriptable