In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import time
import os
import gc
import nltk

np.random.seed(42)

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# The training file is a header-less TSV, so the two column names are supplied here.
df = pd.read_csv(
    'products_sentiment_train.tsv',
    sep='\t',
    header=None,
    names=['Reviews', 'Target'],
)
df.head()

Unnamed: 0,Reviews,Target
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [3]:
df.isnull().values.any()

False

In [7]:
print ('Number of docs in session: {}'.format(len(df)))
np.bincount(df['Target'])

Number of docs in session: 2000


array([ 726, 1274])

In [511]:
# The sample is imbalanced with respect to the target, so at the very least we should stratify by the target in train_test_split.
# We will try an SVM.
# We will also add a naive Bayes classifier and, of course, a random forest :)
# Where possible, we will try class_weight='balanced', which weights classes inversely to their frequency so that errors on the minority class are penalized more heavily.

In [512]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [8]:
# Raw review texts and their binary sentiment labels.
X_train = df['Reviews']
y = df['Target']
# The scikit-learn cells below fit on `reviews`; define it explicitly so the notebook
# survives Restart & Run All. Unlike X_train, this name is never reassigned later,
# so it always refers to the raw, unprocessed text.
reviews = df['Reviews']

In [9]:
X_test = pd.read_csv('products_sentiment_test.tsv', sep='\t', usecols=[1])
X_test.head(5)

Unnamed: 0,text
0,"so , why the small digital elph , rather than ..."
1,3/4 way through the first disk we played on it...
2,better for the zen micro is outlook compatibil...
3,6 . play gameboy color games on it with goboy .
4,"likewise , i 've heard norton 2004 professiona..."


In [10]:
X_test = X_test['text']

In [460]:
cv = CountVectorizer()
X = cv.fit_transform(reviews)

In [203]:
LR = LogisticRegression(random_state=42, n_jobs=-1)

In [214]:
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)

In [461]:
np.mean(cross_val_score(LR, X, y, scoring = 'accuracy', cv=rskf))

0.7668011681323008

In [462]:
svm = SVC(kernel='linear', random_state=42)
np.mean(cross_val_score(svm, X, y, scoring = 'accuracy', cv=rskf))

0.7491495034343965

In [224]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB

In [463]:
MNB = MultinomialNB()
np.mean(cross_val_score(MNB, X, y, scoring = 'accuracy', cv=rskf))

0.7794979449871562

In [229]:
CNB = ComplementNB()
np.mean(cross_val_score(CNB, X, y, scoring = 'accuracy', cv=rskf))

0.7775475652972831

In [233]:
knc = KNeighborsClassifier(n_neighbors=4, n_jobs=-1)
np.mean(cross_val_score(knc, X, y, scoring = 'accuracy', cv=5))

0.6624853905336908

In [378]:
sgd = SGDClassifier(random_state=42, n_jobs=-1, loss='hinge', penalty='l2')
np.mean(cross_val_score(sgd, X, y, scoring = 'accuracy', cv=rskf))

0.7421381074256714

In [287]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(reviews)

In [235]:
MNB = MultinomialNB()
np.mean(cross_val_score(MNB, X, y, scoring = 'accuracy', cv=rskf))

0.7041493865586658

In [None]:
# As expected, naive Bayes performs worse with TF-IDF features.

In [236]:
svm = SVC(kernel='linear', random_state=42)
np.mean(cross_val_score(svm, X, y, scoring = 'accuracy', cv=rskf))

0.7688974299839373

In [288]:
np.std(cross_val_score(svm, X, y, scoring = 'accuracy', cv=rskf))

0.020652658703663684

In [237]:
np.mean(cross_val_score(LR, X, y, scoring = 'accuracy', cv=rskf))

0.7666507946924668

In [243]:
# Seed the forest like the other models so this score is reproducible between runs;
# n_jobs=-1 only parallelises tree building and does not change the result.
RF = RandomForestClassifier(random_state=42, n_jobs=-1)
np.mean(cross_val_score(RF, X, y, scoring = 'accuracy', cv=rskf))

0.7101540281501759

In [None]:
# Grid-search the SVM hyperparameters and see how much we can squeeze out of naive Bayes with CountVectorizer.

In [194]:
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.8, stratify=y)

In [459]:
PipeNB = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('mnb', MultinomialNB())
    ])

In [282]:
from nltk.corpus import stopwords

In [284]:
param_grid = [{'vectorizer__min_df':[1, 2, 3, 4, 5],
               'vectorizer__ngram_range':[(1, 2),(1, 3)]
              }]
GS = GridSearchCV(PipeNB, param_grid = param_grid, cv=5)
GS.fit(reviews, y)
GS.best_params_, GS.best_score_

({'vectorizer__min_df': 2, 'vectorizer__ngram_range': (1, 2)}, 0.7865)

In [291]:
PipeSVM = Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('svm', SVC(random_state=42))
    ])

In [292]:
param_grid = [{'vectorizer__min_df':[1, 2, 3, 4, 5],
               'vectorizer__ngram_range':[(1, 2),(1, 3)],
               'svm__C':[0.1, 0.5, 1],
               'svm__kernel':['linear', 'poly', 'rbf']
              }]
GS = GridSearchCV(PipeSVM, param_grid = param_grid, cv=5)
GS.fit(reviews, y)
GS.best_params_, GS.best_score_

({'svm__C': 1,
  'svm__kernel': 'linear',
  'vectorizer__min_df': 1,
  'vectorizer__ngram_range': (1, 3)},
 0.7835)

In [458]:
# TF-IDF over character n-grams taken inside word boundaries, fed to a linear SVM.
# `stop_words` is deliberately not passed: scikit-learn applies it only when
# analyzer == 'word', so with 'char_wb' it had no effect and was misleading.
PipeSVM = Pipeline([
        ('vectorizer', TfidfVectorizer(analyzer='char_wb', min_df=2)),
        ('svm', SVC(kernel='linear', random_state=42, C=0.5))
    ])
# Only the n-gram span is searched; 5-fold CV accuracy is the selection criterion.
param_grid = {'vectorizer__ngram_range':[(2,5), (3,5)]}
GS = GridSearchCV(PipeSVM, param_grid = param_grid, cv=5)
GS.fit(reviews, y)
GS.best_params_, GS.best_score_

({'vectorizer__ngram_range': (2, 5)}, 0.7915)

In [328]:
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [338]:
# Shared English Snowball stemmer used by the vectorizer subclass below.
En_stemmer = SnowballStemmer('english')


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer output is passed through the Snowball stemmer."""

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer and stems every item it yields.

        NOTE(review): the stemmer is applied to each string produced by the parent
        analyzer. With an ngram_range wider than (1, 1) those strings presumably
        include multi-word n-grams, which would then be stemmed as a single string
        rather than word by word -- confirm this is intended.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return list(map(En_stemmer.stem, base_analyzer(doc)))

        return stemmed_analyzer

In [339]:
PipeNB = Pipeline([
        ('vectorizer', StemmedCountVectorizer(min_df=2, analyzer="word", stop_words='english')),
        ('mnb', MultinomialNB())
    ])
PipeSVM = Pipeline([
        ('vectorizer', StemmedCountVectorizer(min_df=2, analyzer="word", stop_words='english')),
        ('svm', SVC(kernel='linear', random_state=42, C=0.5))
    ])

In [334]:
param_grid = {'vectorizer__ngram_range':[(1, 2),(1, 3)]}
GS = GridSearchCV(PipeSVM, param_grid = param_grid, cv=5)
GS.fit(reviews, y)
GS.best_params_, GS.best_score_

({'vectorizer__ngram_range': (1, 3)}, 0.7295)

In [340]:
param_grid = {'vectorizer__ngram_range':[(1, 2),(1, 3)]}
GS = GridSearchCV(PipeNB, param_grid = param_grid, cv=5)
GS.fit(reviews, y)
GS.best_params_, GS.best_score_

({'vectorizer__ngram_range': (1, 3)}, 0.7585)

In [None]:
# NOTE(review): this cell repeats the character n-gram grid search that already appears
# earlier in the notebook, and it carries no execution count or output. It is a candidate
# for removal.
# NOTE(review): scikit-learn documents `stop_words` as applying only when
# analyzer == 'word', so it should have no effect with analyzer='char_wb'.
PipeSVM = Pipeline([
        ('vectorizer', TfidfVectorizer(analyzer='char_wb', min_df=2, stop_words='english')),
        ('svm', SVC(kernel='linear', random_state=42, C=0.5))
    ])
# Search over the character n-gram span only, scored with 5-fold cross-validation.
param_grid = {'vectorizer__ngram_range':[(2,5), (3,5)]}
GS = GridSearchCV(PipeSVM, param_grid = param_grid, cv=5)
GS.fit(reviews, y)
GS.best_params_, GS.best_score_

In [410]:
# Submission model: TF-IDF over 2-5 character n-grams (the span the grid search
# selected) followed by a linear SVM.
# `stop_words` is deliberately not passed: scikit-learn applies it only when
# analyzer == 'word', so with 'char_wb' it had no effect.
PipeSVM = Pipeline([
        ('vectorizer', TfidfVectorizer(analyzer='char_wb', min_df=2, ngram_range=(2,5))),
        ('svm', SVC(kernel='linear', random_state=42, C=0.5))
    ])

In [412]:
PipeSVM.fit(reviews, y)
prediction_svm_charwb = PipeSVM.predict(X_test)

In [415]:
submission = pd.read_csv('products_sentiment_sample_submission.csv')
submission.head(5)

Unnamed: 0,Id,y
0,0,0
1,1,1
2,2,0
3,3,1
4,4,0


In [417]:
submission = pd.read_csv('products_sentiment_sample_submission.csv')
submission['y'] = prediction_svm_charwb
submission.to_csv('svm_charwb.csv', index = None)

submission.head()

Unnamed: 0,Id,y
0,0,1
1,1,0
2,2,1
3,3,1
4,4,0


In [None]:
# Good enough; now for something a bit more advanced.

In [11]:
def preprocess(data):
    """Replace punctuation and special symbols in every text with a space.

    Parameters
    ----------
    data : pandas.Series
        Texts to clean. Every value is cast to ``str`` before cleaning.

    Returns
    -------
    pandas.Series
        A new series in which each character listed in ``punct`` has been
        replaced by a single space. No other characters are changed and the
        input series is not modified.
    """
    # The backslash in front of '×' is written as '\\' on purpose: '\×' is an
    # invalid escape sequence that newer Python versions warn about. The
    # resulting characters (a backslash and '×') are unchanged.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
    # One translation table gives a single pass over each text instead of one
    # str.replace call per punctuation character.
    to_space = str.maketrans(dict.fromkeys(punct, ' '))

    return data.astype(str).apply(lambda text: text.translate(to_space))

In [12]:
X_train = preprocess(X_train)
X_test = preprocess(X_test)

In [13]:
import numpy as np
import pandas as pd
import os
import time
import gc
import random
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F

Using TensorFlow backend.


In [14]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and asks cuDNN for deterministic behaviour.

    Parameters
    ----------
    seed : int
        Seed value shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point: hash randomisation
    # of the running process is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; this is a silent no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Keep the cuDNN autotuner off so it cannot pick different kernels per run.
    torch.backends.cudnn.benchmark = False
seed_everything()

In [15]:
# Location of the pre-trained 300-d "crawl" word-vector text file. Set the
# CRAWL_EMBEDDING_PATH environment variable to override it on another machine;
# the original path stays as the default.
CRAWL_EMBEDDING_PATH = os.environ.get('CRAWL_EMBEDDING_PATH', '/home/hq/git/crawl-300d-2M.vec')
NUM_MODELS = 1  # number of independently seeded models trained in the final loop
LSTM_UNITS = 128  # hidden size of each LSTM direction
# Width of concat(max-pool, avg-pool) over a bidirectional LSTM output: 2 * (2 * LSTM_UNITS).
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
MAX_LEN = 220  # length every token sequence is padded to

In [16]:
def get_coefs(word, *arr):
    """Pair a word with its embedding vector.

    ``arr`` holds the vector components (typically the string fields of one
    embedding-file line); they are converted to a float32 NumPy array.

    Returns a ``(word, vector)`` tuple.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

def load_embeddings(path):
    """Load a space-separated text embedding file into a ``{word: vector}`` dict.

    Each line is expected to hold a word followed by its vector components.
    A leading header line consisting of exactly two integers is skipped.
    Otherwise it would be stored as a "word" with a one-element vector, which
    NumPy would broadcast across a whole embedding-matrix row on assignment.

    Parameters
    ----------
    path : str
        Path to the embedding text file.

    Returns
    -------
    dict
        Maps each word to the float32 vector produced by ``get_coefs``. If a
        word occurs more than once, the last occurrence wins.
    """
    embeddings = {}
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(tqdm(f)):
            fields = line.strip().split(' ')
            is_header = (line_no == 0 and len(fields) == 2
                         and all(field.isdigit() for field in fields))
            if is_header:
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings

def build_matrix(word_index, path):
    """Build an embedding matrix for the words in ``word_index``.

    Parameters
    ----------
    word_index : dict
        Maps each word to its integer row index in the matrix.
    path : str
        Embedding file handed to ``load_embeddings``.

    Returns
    -------
    tuple
        ``(matrix, missing)``. ``matrix`` has shape
        ``(len(word_index) + 1, 300)``; rows for words without an embedding
        stay all-zero. ``missing`` lists those words.
    """
    vectors = load_embeddings(path)
    matrix = np.zeros((len(word_index) + 1, 300))
    missing = []

    for token, row in word_index.items():
        if token in vectors:
            matrix[row] = vectors[token]
        else:
            missing.append(token)

    return matrix, missing

In [17]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise via NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def train_model(model, train, test, loss_fn, output_dim, lr=0.001,
                batch_size=512, n_epochs=4,
                enable_checkpoint_ensemble=True):
    """Train ``model`` and return its sigmoid-activated predictions on ``test``.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(*inputs)``; must return logits with ``output_dim``
        columns.
    train : torch.utils.data.Dataset
        Each item is ``(*inputs, target)``; the last element is the target.
    test : torch.utils.data.Dataset
        Each item contains the model inputs only.
    loss_fn : callable
        Called as ``loss_fn(predictions, target)``.
    output_dim : int
        Number of columns in the model output and in the returned array.
    lr : float
        Initial Adam learning rate, multiplied by 0.6 after every epoch.
    batch_size : int
        Batch size for both training and inference.
    n_epochs : int
        Number of training epochs.
    enable_checkpoint_ensemble : bool
        If true, return the average of the per-epoch test predictions,
        weighted by ``2 ** epoch``; otherwise return the last epoch's.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(test), output_dim)`` with values in (0, 1).
    """
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    all_test_preds = []
    # Later checkpoints count exponentially more in the ensemble.
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]

    for epoch in range(n_epochs):
        start_time = time.time()

        model.train()
        avg_loss = 0.

        for batch in tqdm(train_loader, disable=False):
            x_batch = batch[:-1]
            y_batch = batch[-1]

            y_pred = model(*x_batch)
            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            avg_loss += loss.item() / len(train_loader)

        # Step the scheduler after the epoch's optimizer updates. Stepping it
        # first skips the initial learning rate, so the first epoch would
        # train at 0.6 * lr instead of lr.
        scheduler.step()

        model.eval()
        test_preds = np.zeros((len(test), output_dim))

        # Inference only: do not record an autograd graph.
        with torch.no_grad():
            for i, x_batch in enumerate(test_loader):
                y_pred = sigmoid(model(*x_batch).cpu().numpy())

                test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

        all_test_preds.append(test_preds)
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
              epoch + 1, n_epochs, avg_loss, elapsed_time))

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)
    else:
        test_preds = all_test_preds[-1]

    return test_preds

In [18]:
class SpatialDropout(nn.Dropout2d):
    """Dropout2d for (N, T, K) inputs that masks along the last (feature) axis."""

    def forward(self, x):
        """Apply Dropout2d with the K axis moved into the channel position.

        ``x`` has shape (N, T, K); the result has the same shape.
        """
        # (N, T, K) -> (N, T, 1, K) -> (N, K, 1, T): features become channels
        as_channels = x.unsqueeze(2).permute(0, 3, 2, 1)
        # some features are masked
        dropped = super().forward(as_channels)
        # (N, K, 1, T) -> (N, T, 1, K) -> (N, T, K)
        return dropped.permute(0, 3, 2, 1).squeeze(2)
    
class NeuralNet(nn.Module):
    """Two stacked bidirectional LSTMs over frozen pre-trained embeddings.

    The LSTM output is max- and average-pooled over the sequence axis, passed
    through two parallel dense layers with a skip connection, and mapped to
    one main logit plus ``num_aux_targets`` auxiliary logits.

    Relies on the module-level constants ``LSTM_UNITS`` and
    ``DENSE_HIDDEN_UNITS``. The latter must equal ``4 * LSTM_UNITS`` because
    it has to match the width of the pooled vector.
    """

    def __init__(self, embedding_matrix, num_aux_targets):
        """Build the network.

        Parameters
        ----------
        embedding_matrix : 2-D array-like with a ``shape`` attribute
            Pre-trained embeddings, shape ``(vocab_size, embed_size)``. They
            are copied into the embedding layer and frozen.
        num_aux_targets : int
            Width of the auxiliary output head.
        """
        super(NeuralNet, self).__init__()
        # Size the embedding layer from the matrix itself rather than from the
        # notebook-level `max_features` global. That global is set in a later
        # cell and starts as None, so it could be unset or disagree with the
        # matrix that is actually loaded.
        num_embeddings, embed_size = embedding_matrix.shape

        self.embedding = nn.Embedding(num_embeddings, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        # Keep the pre-trained vectors fixed during training.
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)

        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)

        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)

        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)

    def forward(self, x):
        """Return logits of shape ``(batch, 1 + num_aux_targets)``.

        ``x`` holds integer token indices with the batch on the first axis
        and the sequence on the second.
        """
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)

        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)

        # global average pooling
        avg_pool = torch.mean(h_lstm2, 1)
        # global max pooling
        max_pool, _ = torch.max(h_lstm2, 1)

        h_conc = torch.cat((max_pool, avg_pool), 1)
        h_conc_linear1  = F.relu(self.linear1(h_conc))
        h_conc_linear2  = F.relu(self.linear2(h_conc))

        # Skip connection: the pooled features are added to both dense branches.
        hidden = h_conc + h_conc_linear1 + h_conc_linear2

        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        # Main logit first, auxiliary logits after it.
        out = torch.cat([result, aux_result], 1)

        return out

In [19]:
max_features = None

In [20]:
# Build one vocabulary from the train and test texts together.
tokenizer = text.Tokenizer(lower=False)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# NOTE: X_train / X_test are overwritten in place here: texts become integer
# sequences, which are then padded with maxlen=MAX_LEN. Re-running this cell on its own
# would therefore feed already-encoded data back into the tokenizer.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_train = sequence.pad_sequences(X_train, maxlen=MAX_LEN)
X_test = sequence.pad_sequences(X_test, maxlen=MAX_LEN)

In [21]:
max_features = max_features or len(tokenizer.word_index) + 1
max_features

4517

In [23]:
crawl_matrix, unknown_words_crawl = build_matrix(tokenizer.word_index, CRAWL_EMBEDDING_PATH)
print('n unknown words (crawl): ', len(unknown_words_crawl))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


n unknown words (crawl):  97


In [24]:
torch.cuda.is_available()

False

In [25]:
# Token-index matrices as int64 tensors and the labels as float32.
X_train_torch = torch.tensor(X_train, dtype=torch.long)
X_test_torch = torch.tensor(X_test, dtype=torch.long)
# The label tensor is 1-D here (the next cell reports torch.Size([2000])).
y_train_torch = torch.tensor(y, dtype=torch.float32)

In [63]:
y_train_torch.shape

torch.Size([2000])

In [74]:
y.shape[-1]

2000

In [37]:
# NOTE(review): leftover debugging cell. `model` is only created in the training
# cell further down, and the recorded output shows this line raising AttributeError
# because a NeuralNet has no `shape` attribute. Candidate for removal.
model.shape

AttributeError: 'NeuralNet' object has no attribute 'shape'

In [None]:
# The labels are a 1-D tensor; reshape them to (n_samples, 1) so they line up with
# the single-logit model output inside the loss.
train_dataset = data.TensorDataset(X_train_torch, y_train_torch.view(-1, 1))
test_dataset = data.TensorDataset(X_test_torch)

all_test_preds = []

for model_idx in range(NUM_MODELS):
    print('Model ', model_idx)
    # A different seed per model so ensemble members are not identical.
    seed_everything(42 + model_idx)

    # `y` is 1-D, so y.shape[-1] is the number of samples (2000), not a target
    # width. Using it for num_aux_targets / output_dim produced 2001 output
    # columns that matched neither the targets nor the prediction buffer.
    # This task has a single binary target and no auxiliary labels, so the
    # auxiliary head is given zero width and the model emits one logit per review.
    model = NeuralNet(crawl_matrix, num_aux_targets=0)

    test_preds = train_model(model, train_dataset, test_dataset, output_dim=1,
                             loss_fn=nn.BCEWithLogitsLoss(reduction='sum'))
    all_test_preds.append(test_preds)
    print()

Model  0


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))