In [2]:
import pandas as pd
pd.set_option('display.max_colwidth',None)

In [3]:
import torch

In [4]:
from torch import nn

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import re

In [6]:
# nltk.download('wordnet')

In [7]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# data

In [8]:
df = pd.read_csv('./sentence_ids.csv')

In [9]:
test = pd.read_csv('./test.tsv',sep='\t')

In [10]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,64,2,"This quiet , introspective and entertaining independent is worth seeking .",4
2,82,3,"Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one .",1
3,117,4,"A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera .",3
4,157,5,Aggressive self-glorification and a manipulative whitewash .,1


In [11]:
df_train = df.drop(['PhraseId', 'SentenceId'], axis = 1).reset_index(drop = True)
df_train.head()

Unnamed: 0,Phrase,Sentiment
0,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,"This quiet , introspective and entertaining independent is worth seeking .",4
2,"Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one .",1
3,"A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera .",3
4,Aggressive self-glorification and a manipulative whitewash .,1


In [12]:
df_train['Sentiment'].value_counts()

Sentiment
1    114
3    105
2     90
4     64
0     47
Name: count, dtype: int64

0 - negative

1 - somewhat negative

2 - neutral

3 - somewhat positive

4 - positive

### clean

In [13]:
def remove_stopwords(text):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
    
def Clean(corpus):
    cleaned = []
    for line in corpus:
        clean_line = BeautifulSoup(line).get_text() # 去除HTML标签
        clean_line = re.sub("[^a-zA-Z]"," ", clean_line) # 去除除字母外（数字、符号
        # clean_line = remove_stopwords(clean_line)
        words = word_tokenize(clean_line.lower()) # 分词
        # lemma_words = [lemmatizer.lemmatize(w) for w in words] # 统一时态
        
        cleaned.append(words)
    return cleaned

In [14]:
corpus = []
corpus.extend(line for line in df_train['Phrase'])
cleaned_corpus = Clean(corpus)

  clean_line = BeautifulSoup(line).get_text() # 去除HTML标签


### df_train['Phrase_clean']

In [15]:
df_train['Phrase_clean'] = cleaned_corpus
df_train.head()

Unnamed: 0,Phrase,Sentiment,Phrase_clean
0,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1,"[a, series, of, escapades, demonstrating, the, adage, that, what, is, good, for, the, goose, is, also, good, for, the, gander, some, of, which, occasionally, amuses, but, none, of, which, amounts, to, much, of, a, story]"
1,"This quiet , introspective and entertaining independent is worth seeking .",4,"[this, quiet, introspective, and, entertaining, independent, is, worth, seeking]"
2,"Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one .",1,"[even, fans, of, ismail, merchant, s, work, i, suspect, would, have, a, hard, time, sitting, through, this, one]"
3,"A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera .",3,"[a, positively, thrilling, combination, of, ethnography, and, all, the, intrigue, betrayal, deceit, and, murder, of, a, shakespearean, tragedy, or, a, juicy, soap, opera]"
4,Aggressive self-glorification and a manipulative whitewash .,1,"[aggressive, self, glorification, and, a, manipulative, whitewash]"


### vocab

In [16]:
vocab = [word for words in cleaned_corpus for word in words]
vocab = set(vocab)
len(vocab)

2546

word and index

In [17]:
word_to_ix = {word:ix for ix, word in enumerate(vocab)}
ix_to_word = {ix:word for ix, word in enumerate(vocab)}

### train_validation

In [18]:
from sklearn.model_selection import train_test_split

In [21]:
from torch.nn import functional as F

In [None]:
x = df_train['Phrase_clean']
y = df_train['Sentiment']

In [None]:
target = df_train.Sentiment.values
y = F.one_hot(,target)

In [None]:
x_train,x_val,y_train,y_val=train_test_split(x, y, test_size=0.2, stratify=y)
# 参数 stratify 用于实现分层采样

In [None]:
y_train.shape

# model

In [None]:
def get_params(vocab_size, hidden_size):
    
    def normal(shape):
        return torch.randn(size=shape)*0.01

    # hidden layer
    W_vh = normal((vocab_size, hidden_size))
    W_hh = normal((hidden_size, hidden_size))
    b_h = torch.zeros(hidden_size)
    # output
    U_hv = normal((hidden_size, vocab_size))
    b_v = torch.zeros(vocab_size)
    
    # grad
    params = [W_vh, W_hh, b_h, U_hv, b_v]
    for p in params:
        p.requires_grad_(True)
    return params

In [None]:
def init_rnn_state(batch_size, hidden_size):
    return (torch.zeros((batch_size, hidden_size)))

In [None]:
class RNN(nn.Module):
    def __init__(self, vocab_size, hidden_size = 128):
        super(RNN, self).__init__()
        self.rnn = nn.RNN(input_size = vocab_size, hidden_size = hidden_size)
        
        # 输出层参数
        self.linear_out = nn.Linear(hidden_size, vocab_size)
        self.softmax_function = nn.LogSoftmax(dim = -1)

    def forward(self, h, x):
        # x = x.transpose(0, 1)
        outputs, h = self.rnn(x, h)
        # outputs: (timestamp, batch_size, direc(one dire = 1)*vocab_size)
        # h: (layers(=1) * direc, batch_size, hidden_size)
        
        outputs = outputs[-1]
        out = self.linear_out(outputs)
        out = self.self.softmax_function(dim = -1)
        
        return out

## train