In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
features = pd.read_csv('features_prep_join.csv')


In [3]:
df = pd.read_csv('data_proc2.csv')

In [4]:
features.shape

(10876, 769)

The labels indicating which sentence is positive and negative now go into the `labels` variable

In [5]:
labels = df["target"]

In [6]:
#7613 - 36 удаленных
l = 7577
train_f = features[:l]
test_f = features[l:]

train_l = labels[:l]
test_l = labels[l:]

print(len(train_l))

7577


## Model #2: Train/Test Split
Let's now split our datset into a training set and testing set (even though we're using 2,000 sentences from the SST2 training set).

In [7]:
train_features, test_features, train_labels, test_labels = train_test_split(train_f, train_l, shuffle="False")

In [8]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947}
best scrores:  0.595390018209521


We now train the LogisticRegression model. If you've chosen to do the gridsearch, you can plug the value of C into the model declaration (e.g. `LogisticRegression(C=5.2)`).

In [9]:
lr_clf = LogisticRegression( C = grid_search.best_params_['C'])
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=5.263252631578947, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
lr_clf.score(test_features, test_labels)

0.6226912928759895

How good is this score? What can we compare it against? Let's first look at a Naive Baiyes:

In [11]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

scores = cross_val_score(clf, train_features, train_labels)
print("GNB classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

GNB classifier score: 0.585 (+/- 0.02)


In [12]:
np.array(test_labels).reshape(-1, 1).shape

(1895, 1)

In [15]:
paramC = np.linspace(0.001, 100, 10)
paramtol =  [1e-3, 1e-4]
from sklearn.svm import LinearSVC
results=[]

for c in paramC:
    for t in paramtol:
        print(c,t)
        svc = LinearSVC(dual = False, C = c, tol = t)
        scores = cross_val_score(svc, train_features, train_labels)
        results.append([svc.get_params, scores])
        print(scores.mean())

0.001 0.001
0.5828935795954265
0.001 0.0001
0.5948624685668984
11.111999999999998 0.001
0.5878223828458881
11.111999999999998 0.0001
0.5913428902349895
22.223 0.001
0.5878223828458881
22.223 0.0001
0.5931040729867332
33.333999999999996 0.001
0.5878223828458881
33.333999999999996 0.0001
0.5904636924449068
44.44499999999999 0.001
0.5878223828458881
44.44499999999999 0.0001
0.5908161148066942
55.55599999999999 0.001
0.5878223828458881
55.55599999999999 0.0001
0.5922220880250721
66.667 0.001
0.5878223828458881
66.667 0.0001
0.5934571147199821
77.77799999999999 0.001
0.5878223828458881
77.77799999999999 0.0001
0.5922248751966505
88.889 0.001
0.5878223828458881
88.889 0.0001
0.594335383452872
100.0 0.001
0.5878223828458881
100.0 0.0001
0.5920475801156986


In [66]:
for i in results:
    print(i[1].mean(), i[0])

0.5705736618479567 <bound method BaseEstimator.get_params of LinearSVC(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.01,
          verbose=0)>
0.7777233763177127 <bound method BaseEstimator.get_params of LinearSVC(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
          verbose=0)>
0.8028895227123515 <bound method BaseEstimator.get_params of LinearSVC(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)>
0.5705736618479567 <bound method BaseEstimator.get_params of LinearSVC(C=11.111999999999998, class_weight=None, dual=F

In [69]:
from sklearn.svm import LinearSVC
clf = LinearSVC(C=78, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001)
clf.fit(train_features, np.array(train_labels).reshape(-1, 1))
print(test_features.shape, test_labels.shape)
scores = clf.score(test_features, np.array(test_labels).reshape(-1, 1))
print("LinearSVC classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

(1895, 770) (1895,)
LinearSVC classifier score: 0.808 (+/- 0.00)


In [18]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3)
sgd_clf.fit(train_features, np.array(train_labels).reshape(-1, 1))

scores = sgd_clf.score(test_features, np.array(test_labels).reshape(-1, 1))
print("SGD classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

SGD classifier score: 0.420 (+/- 0.00)


In [58]:
Y_pred = clf.predict(test_f)
len(Y_pred)

3263

In [None]:
Y_pred = lr_clf.predict(test_f)
len(Y_pred)

In [62]:
import time
submission = pd.DataFrame({"id": df["id"][l:], "target": Y_pred }, dtype = "int")
fname = '' + time.ctime() + 'submission.csv'
submission.to_csv(fname, index=False, header="id,target")

So our model clearly does better than a dummy classifier. But how does it compare against the best models?

## Proper SST2 scores
For reference, the [highest accuracy score](http://nlpprogress.com/english/sentiment_analysis.html) for this dataset is currently **96.8**. DistilBERT can be trained to improve its score on this task – a process called **fine-tuning** which updates BERT’s weights to make it achieve a better performance in this sentence classification task (which we can call the downstream task). The fine-tuned DistilBERT turns out to achieve an accuracy score of **90.7**. The full size BERT model achieves **94.9**.



And that’s it! That’s a good first contact with BERT. The next step would be to head over to the documentation and try your hand at [fine-tuning](https://huggingface.co/transformers/examples.html#glue). You can also go back and switch from distilBERT to BERT and see how that works.