In [5]:
import sklearn
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
scikit_version = sklearn.__version__

scikit_version

'0.23.1'

In [7]:
sentimental_data = pd.read_csv('../datasets/sentimental_analysis_data.csv', 
                               header=None, 
                               names=['Label', 'Text'], 
                               sep='\t')

sentimental_data.sample(10)

Unnamed: 0,Label,Text
4509,0,Da Vinci Code sucked..
1057,1,"I just came back from the movies, my mom and I..."
387,1,The Da Vinci Code is awesome..
3814,1,I love Brokeback Mountain....
3233,1,I love Brokeback Mountain.
1451,1,I like Mission Impossible movies because you n...
6690,0,", she helped me bobbypin my insanely cool hat ..."
1295,1,the last stand and Mission Impossible 3 both w...
904,1,da vinci code was an awesome movie...
4265,0,Da Vinci Code sucks.


In [8]:
sentimental_data.shape

(6918, 2)

In [9]:
X = sentimental_data['Text']

Y = sentimental_data['Label']

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [11]:
x_train.shape, x_test.shape

((5534,), (1384,))

In [12]:
y_train.shape, y_test.shape

((5534,), (1384,))

In [13]:
tfidf_vect = TfidfVectorizer(max_features=15)

x_trans = tfidf_vect.fit_transform(x_train)

In [14]:
tfidf_vect

TfidfVectorizer(max_features=15)

In [15]:
print(x_trans[0:3])

  (0, 12)	0.4392923148296515
  (0, 7)	0.5541298676612787
  (0, 10)	0.4999811755942918
  (0, 2)	0.4999811755942918
  (1, 10)	0.7071067811865476
  (1, 2)	0.7071067811865476
  (2, 7)	0.6168347068270204
  (2, 10)	0.5565585972984445
  (2, 2)	0.5565585972984445


In [16]:
x_trans.shape

(5534, 15)

In [17]:
classifier = LinearSVC(C=1.0, max_iter=1000, tol=1e-3)
linear_svc_model = classifier.fit(x_trans, y_train)

linear_svc_model

LinearSVC(tol=0.001)

In [18]:
x_test_trans = tfidf_vect.fit_transform(x_test)

In [19]:
x_test_trans.shape

(1384, 15)

In [20]:
y_pred = linear_svc_model.predict(x_test_trans)

y_pred

array([0, 0, 1, ..., 0, 1, 0])

In [21]:
pred_results = pd.DataFrame({'y_test': y_test,
                             'y_pred': y_pred})

pred_results.sample(5)

Unnamed: 0,y_test,y_pred
2831,1,0
6135,0,0
4797,0,0
1032,1,1
5484,0,0


In [22]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7796242774566474

In [23]:
text_clf_param = {}

text_clf_param['preprocessing'] = tfidf_vect
text_clf_param['model'] = linear_svc_model
text_clf_param['sklearn_version'] = scikit_version
text_clf_param['accuracy'] = accuracy

In [24]:
text_clf_param

{'preprocessing': TfidfVectorizer(max_features=15),
 'model': LinearSVC(tol=0.001),
 'sklearn_version': '0.23.1',
 'accuracy': 0.7796242774566474}

In [25]:
import joblib

In [26]:
filename = '../models/text_clf_checkpoint.joblib'

In [27]:
joblib.dump(text_clf_param, filename)

['../models/text_clf_checkpoint.joblib']

In [28]:
clf_checkpoint = joblib.load(filename)

In [29]:
reloaded_vect = clf_checkpoint['preprocessing']

reloaded_vect

TfidfVectorizer(max_features=15)

In [30]:
clf_model = clf_checkpoint['model']

clf_model

LinearSVC(tol=0.001)

In [31]:
x_test_trans_new = reloaded_vect.fit_transform(x_test)

In [32]:
y_pred = clf_model.predict(x_test_trans_new)

y_pred

array([0, 0, 1, ..., 0, 1, 0])

In [33]:
accuracy_score(y_test, y_pred)

0.7796242774566474

In [34]:
clf_checkpoint['accuracy']

0.7796242774566474

In [35]:
from sklearn.pipeline import Pipeline

In [36]:
clf_pipeline = Pipeline(steps=[('tfidf_vect', tfidf_vect), ('classifier', classifier)])

pipeline_model = clf_pipeline.fit(x_train, y_train)

In [37]:
pipeline_model

Pipeline(steps=[('tfidf_vect', TfidfVectorizer(max_features=15)),
                ('classifier', LinearSVC(tol=0.001))])

In [38]:
y_pred = pipeline_model.predict(x_test)

In [39]:
accuracy = accuracy_score(y_test, y_pred)

accuracy

0.8858381502890174

In [40]:
pipe_clf_param = {}

pipe_clf_param['pipeline_clf'] = pipeline_model
pipe_clf_param['sklearn_version'] = scikit_version
pipe_clf_param['accuracy'] = accuracy

In [41]:
pipe_clf_param

{'pipeline_clf': Pipeline(steps=[('tfidf_vect', TfidfVectorizer(max_features=15)),
                 ('classifier', LinearSVC(tol=0.001))]),
 'sklearn_version': '0.23.1',
 'accuracy': 0.8858381502890174}

In [42]:
filename = '../models/pipe_clf_checkpoint.joblib'

In [43]:
joblib.dump(pipe_clf_param, filename)

['../models/pipe_clf_checkpoint.joblib']

In [44]:
pipe_clf_checkpoint = joblib.load(filename)

In [45]:
reloaded_pipeline = pipe_clf_checkpoint['pipeline_clf']

reloaded_pipeline

Pipeline(steps=[('tfidf_vect', TfidfVectorizer(max_features=15)),
                ('classifier', LinearSVC(tol=0.001))])

In [46]:
y_pred = reloaded_pipeline.predict(x_test)

In [47]:
accuracy_score(y_test, y_pred)

0.8858381502890174

In [48]:
pipe_clf_checkpoint['accuracy']

0.8858381502890174