<a href="https://colab.research.google.com/github/MosheWasserb/IMDb/blob/master/NgramLogisticDima.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Dataset definition (for CSV file)

# Simple logistic regression baseline on IMDb (data loaded with PyTorch / torchtext)

### Moshe Wasserblat <br><br>March 2020

### Import 

In [26]:

import torch
from torchtext import data

# Seed value used for PyTorch below.
SEED = 1234

# Seed PyTorch's RNG and request deterministic cuDNN behaviour for repeatable runs.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# NOTE(review): TEXT is defined here but is not used by any visible cell; the
# IMDB split below is given a RawField instead, so each review stays a raw string.
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

from torchtext import datasets

# Load the IMDb train/test splits: unprocessed text plus the LABEL field.
train_data, test_data = datasets.IMDB.splits(data.RawField(), LABEL)


# Sanity-check the size of each split.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

# Show the attributes stored on the first training example.
print(vars(train_data.examples[0]))


Number of training examples: 25000
Number of testing examples: 25000
{'text': 'This movie has been poorly received and badly reviewed. The book by Rebecca West was written in 1918, soon after WWI, when shell shock and trauma-induced amnesia were not clichés, as the reviewers call it many books and movies later. It is difficult to go back in time and live, as the characters lived, the realities of the time: the war and the horror of the experience of the first war to use lethal gas, the British class system the wife thought all-important, the hopeless spinster, and the lover from the past still seen with the eyes of love being as young and as beautiful as she was 20 years ago.<br /><br />Alan Bates as the amnesiac soldier who "will die" if he isn\'t allowed to see Margaret, the girl of his youthful dreams, builds on the devotion his character showed in "Far From the Madding Crowd". Having seen that performance, it is possible to sense his strong romantic attachment to the girl who didn\

In [0]:
import random as rn

# Number of examples kept for the baseline after shuffling.
N_TRAIN_SAMPLES = 100
N_TEST_SAMPLES = 1000

# A private generator seeded with the notebook-wide SEED: the subsample (and
# therefore the reported metrics) is identical on every Restart & Run All,
# and the global `random` state is left untouched.
shuffle_rng = rn.Random(SEED)


def _shuffled_texts_and_labels(dataset, rng):
    """Return ``(texts, labels)`` of ``dataset`` in one shared random order.

    Text and label are shuffled together as pairs, so they cannot drift out
    of alignment (the previous version relied on re-seeding the global RNG
    before shuffling each list separately).
    """
    pairs = [(example.text, example.label) for example in dataset]
    rng.shuffle(pairs)
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    return texts, labels


train_texts_full, train_labels_full = _shuffled_texts_and_labels(train_data, shuffle_rng)
test_texts_full, test_labels_full = _shuffled_texts_and_labels(test_data, shuffle_rng)

# Keep only a small random subsample of each split.
train_texts = train_texts_full[:N_TRAIN_SAMPLES]
train_labels = train_labels_full[:N_TRAIN_SAMPLES]

test_texts = test_texts_full[:N_TEST_SAMPLES]
test_labels = test_labels_full[:N_TEST_SAMPLES]



In [51]:
# Inspect the class of the training split returned by datasets.IMDB.splits.
type(train_data)

torchtext.datasets.imdb.IMDB

type(train_data)

In [0]:
#Xdict = vars(train_data.examples[0])
#XdictText, XdicLabel = list(Xdict.values())


**Question for Peter:** how do I read `train_texts` and `train_labels` out of `train_data`?

### Simple logistic regression baseline (n-gram counts)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier



In [73]:
# Baseline: counts of word 1- to 3-grams fed into a default logistic regression.
ngram_vectorizer = CountVectorizer(ngram_range=(1, 3))
logistic_classifier = LogisticRegression()
baseline_pipeline = make_pipeline(ngram_vectorizer, logistic_classifier)

# fit() returns the fitted pipeline itself, which is what later cells use.
baseline_model = baseline_pipeline.fit(train_texts, train_labels)



In [0]:
# Predict a label for every review in the test subsample with the fitted pipeline.
baseline_predicted = baseline_model.predict(test_texts)

In [75]:

# Per-class precision / recall / F1 of the baseline, true labels first.
baseline_report = classification_report(test_labels, baseline_predicted)
print(baseline_report)

              precision    recall  f1-score   support

         neg       0.70      0.64      0.67       489
         pos       0.68      0.73      0.70       511

   micro avg       0.69      0.69      0.69      1000
   macro avg       0.69      0.69      0.69      1000
weighted avg       0.69      0.69      0.69      1000



### BERT sentence features + logistic regression (based on Jay Alammar's tutorial)

In [2]:
!pip install transformers
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split





In [0]:
# Read the tab-separated SST2 training file straight from GitHub.
# header=None: the file has no header row, so the columns are named 0 and 1
# (later cells read column 0 as the sentence and column 1 as the label).
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [0]:
# Work on the first 200 rows only; all later cells use batch_1, not df.
batch_1 = df[:200]

In [9]:
# Preview the first rows of the full frame (column 0 and column 1).
df.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [0]:
# Which transformers classes and which pretrained checkpoint name to load.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

In [0]:
# Load the pretrained tokenizer and model, both from the same checkpoint name
# (pretrained_weights) so the token ids match what the model expects.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [0]:
def _encode_sentence(sentence):
    """Convert one sentence into a list of token ids, special tokens included."""
    return tokenizer.encode(sentence, add_special_tokens=True)


# Column 0 of batch_1 is encoded sentence by sentence.
tokenized = batch_1[0].apply(_encode_sentence)

In [85]:
tokenized

0       [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1       [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2       [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
3       [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
4       [101, 5655, 6262, 1005, 1055, 12075, 2571, 376...
                              ...                        
6915    [101, 9145, 1010, 7570, 18752, 14116, 1998, 28...
6916    [101, 2202, 2729, 2003, 19957, 2864, 2011, 103...
6917    [101, 1996, 5896, 4472, 4121, 1010, 3082, 7832...
6918    [101, 1037, 5667, 2919, 2143, 2007, 5667, 2561...
6919    [101, 1037, 12090, 2135, 2512, 5054, 19570, 23...
Name: 0, Length: 6920, dtype: object

In [0]:
# Length of the longest token-id list (0 when there are no sequences at all).
token_id_lists = tokenized.values
max_len = max((len(ids) for ids in token_id_lists), default=0)

# Right-pad every sequence with 0 up to max_len so they stack into one array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in token_id_lists])

In [14]:
# Attention mask: 1 wherever `padded` holds a non-zero id, 0 elsewhere.
# This relies on 0 being used only as the padding value written above.
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(200, 54)

In [0]:
#Run model

In [0]:
# Convert the NumPy arrays to tensors for the model.
# NOTE(review): attention_mask is rebound from a NumPy array to a tensor here,
# so re-running this cell on its own passes a tensor to torch.tensor().
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

# Forward pass only: no_grad() skips gradient tracking since nothing is trained.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [0]:
# Take the first element of the model output and keep only position 0 of every
# sequence, i.e. the vector for the leading special token added by
# add_special_tokens=True (presumably [CLS]; assumes the output is
# (batch, seq_len, hidden) -- TODO confirm). Converted to NumPy for scikit-learn.
features = last_hidden_states[0][:,0,:].numpy()

In [0]:
# Column 1 of batch_1 holds the target label for each sentence.
labels = batch_1[1]

In [0]:
## Model #2: train/test split of the BERT features
# random_state pins the split so the scores below are reproducible across runs.
# NOTE: this rebinds train_labels / test_labels, which the IMDb baseline cells
# above also use -- re-run those cells from the top rather than out of order.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=SEED
)

In [0]:
# GridSearchCV was used without ever being imported (NameError on a fresh kernel).
from sklearn.model_selection import GridSearchCV

# Cross-validated search over 20 evenly spaced values of C in [0.0001, 100].
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

# Report the best C found and its mean cross-validated score.
print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)


In [21]:
# Train a logistic regression with default hyper-parameters on the BERT features.
# NOTE(review): the grid-search result from the previous cell is not used here.
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [22]:
# Evaluate: mean accuracy of the classifier on the held-out feature split.
lr_clf.score(test_features, test_labels)

0.72