# Sentiment Analysis on Faculty Evaluation comments

In [53]:
import pandas as pd
import numpy as np

### Load train and test sets

In [54]:
X_train = pd.read_csv("Raw_Train.csv", encoding = "ISO-8859-1")
X_test = pd.read_csv("Raw_Test.csv", encoding = "ISO-8859-1")

## Drop missing values

In [55]:
X_test = X_test.dropna()
X_train = X_train.dropna()

In [56]:
X_train = X_train.drop_duplicates(['Comment'],keep=False).reset_index(drop=True)
X_test = X_test.drop_duplicates(['Comment'],keep=False).reset_index(drop=True)

## Convert to lowercase

In [57]:
data_train = [comment.lower() for comment in list(X_train['Comment'])]
data_test = [comment.lower() for comment in list(X_test['Comment'])]

## Remove numbers

In [58]:
import re

In [59]:
data_train = [re.sub("[0-9]+", " ", text) for text in data_train]
data_test = [re.sub("[0-9]+", " ", text) for text in data_test]

## Remove stopwords & lemmatizer

In [60]:
import spacy
import en_core_web_sm

nlp = spacy.load('en_core_web_sm')

tl_stopwords_csv = pd.read_csv('Fil_Stop_Words.csv', encoding = "ISO-8859-1" )

tl_stopwords = [stop_word for stop_word in list(tl_stopwords_csv['words'])]

### Train

In [61]:
temp_text = data_train #TRAIN
data_train = []

                                          
for text, lang in zip(temp_text, list(X_train['Language'])):
    if lang == 'en':
        temp = []
        doc = nlp(str(text))
        for token in doc:
            if not token.is_stop:
                temp.append(token.lemma_) #WORD
        data_train.append(" ".join(str(word) for word in temp))
        
    elif lang == 'tl':
        temp = []
        for word in text.split():
            if not word in tl_stopwords:
                temp.append(word)
        data_train.append(" ".join(temp))

    else:
        temp = []
        doc = nlp(str(text))
        for word, token in zip(text.split(), doc):
            if not word in tl_stopwords and not token.is_stop:
                temp.append(word)
        data_train.append(" ".join(temp))

### Test

In [62]:
temp_text = data_test #TEST
data_test = []

                                           #sentiment
for text, lang in zip(temp_text, list(X_test['Language'])):
    if lang == 'en':
        temp = []
        doc = nlp(str(text))
        for token in doc:
            if not token.is_stop:
                temp.append(token.lemma_) #WORD
        data_test.append(" ".join(str(word) for word in temp))
        
    elif lang == 'tl':
        temp = []
        for word in text.split():
            if not word in tl_stopwords:
                temp.append(word)
        data_test.append(" ".join(temp))

    else:
        temp = []
        doc = nlp(str(text))
        for word, token in zip(text.split(), doc):
            if not word in tl_stopwords and not token.is_stop:
                temp.append(word)
        data_test.append(" ".join(temp))

## Convert emoticons to code

### Positive

In [63]:
data_train = [re.sub( "\:\)|\:\-\)|\:\"\)|\;\)|\:D|\:\)\)", " qwertyuiop ", text) for text in data_train]
data_test = [re.sub( "\:\)|\:\-\)|\:\"\)|\;\)|\:D|\:\)\)", " qwertyuiop ", text) for text in data_test]

### Negative

In [64]:
data_train = [re.sub("\:\(|\:\'\(", " asdfghjkl ", text) for text in data_train]
data_test = [re.sub("\:\(|\:\'\(", " asdfghjkl ", text) for text in data_test]

## Remove punctuations

In [65]:
import re

data_train = [re.sub("[^A-Za-z\w]+|_+", " ", text) for text in data_train]
data_test = [re.sub("[^A-Za-z\w]+|_+", " ", text) for text in data_test]

## Convert code to emoticon

### Positive

In [66]:
data_train = [re.sub("qwertyuiop", ":)", text) for text in data_train]
data_test = [re.sub("qwertyuiop", ":)", text) for text in data_test]

### Negative

In [67]:
data_train = [re.sub("asdfghjkl", ":(", text) for text in data_train]
data_test = [re.sub("asdfghjkl", ":(", text) for text in data_test]

## Remove one letter word

In [68]:
all_text = data_train
data_train = []
for text in all_text:
    temp = []
    for word in text.split():
        if not len(word) == 1:
            temp.append(word)
    data_train.append(" ".join(str(word) for word in temp))

In [69]:
all_text = data_test
data_test = []
for text in all_text:
    temp = []
    for word in text.split():
        if not len(word) == 1:
            temp.append(word)
    data_test.append(" ".join(str(word) for word in temp))

# Apply Algorithms

In [70]:
y_train = X_train['Sentiment']
y_test = X_test['Sentiment']

In [71]:
X_train = pd.Series(data_train)
X_test = pd.Series(data_test)

In [72]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

vectorizer = CountVectorizer()
X_train_comments = vectorizer.fit_transform(X_train)
svm = SVC()

#param_grid = {'C':[0.1,1,10,100,1000], 'gamma':[1,0.1,0.001,0.0001], 'kernel': ['rbf', 'linear', 'poly'], }
param_grid = [
  {'C': [1, 10, 100, 1000,10000], 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['linear'],'probability': [True]},
  {'C': [1, 10, 100, 1000, 10000], 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf'], 'probability': [True]}
 ]
grid1 = GridSearchCV(svm,param_grid,verbose = 10)
grid1.fit(X_train_comments,y_train)
best_svm = grid1.best_estimator_



Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] C=1, gamma=0.1, kernel=linear, probability=True .................
[CV]  C=1, gamma=0.1, kernel=linear, probability=True, score=0.5605381165919282, total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] C=1, gamma=0.1, kernel=linear, probability=True .................
[CV]  C=1, gamma=0.1, kernel=linear, probability=True, score=0.581081081081081, total=   0.0s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] C=1, gamma=0.1, kernel=linear, probability=True .................
[CV]  C=1, gamma=0.1, kernel=linear, probability=True, score=0.6227272727272727, total=   0.0s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s


[CV] C=1, gamma=0.01, kernel=linear, probability=True ................
[CV]  C=1, gamma=0.01, kernel=linear, probability=True, score=0.5605381165919282, total=   0.0s


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s


[CV] C=1, gamma=0.01, kernel=linear, probability=True ................
[CV]  C=1, gamma=0.01, kernel=linear, probability=True, score=0.581081081081081, total=   0.0s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s remaining:    0.0s


[CV] C=1, gamma=0.01, kernel=linear, probability=True ................
[CV]  C=1, gamma=0.01, kernel=linear, probability=True, score=0.6227272727272727, total=   0.0s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.5s remaining:    0.0s


[CV] C=1, gamma=0.001, kernel=linear, probability=True ...............
[CV]  C=1, gamma=0.001, kernel=linear, probability=True, score=0.5605381165919282, total=   0.0s


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.6s remaining:    0.0s


[CV] C=1, gamma=0.001, kernel=linear, probability=True ...............
[CV]  C=1, gamma=0.001, kernel=linear, probability=True, score=0.581081081081081, total=   0.0s


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.7s remaining:    0.0s


[CV] C=1, gamma=0.001, kernel=linear, probability=True ...............
[CV]  C=1, gamma=0.001, kernel=linear, probability=True, score=0.6227272727272727, total=   0.0s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.8s remaining:    0.0s


[CV] C=1, gamma=0.0001, kernel=linear, probability=True ..............
[CV]  C=1, gamma=0.0001, kernel=linear, probability=True, score=0.5605381165919282, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=linear, probability=True ..............
[CV]  C=1, gamma=0.0001, kernel=linear, probability=True, score=0.581081081081081, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=linear, probability=True ..............
[CV]  C=1, gamma=0.0001, kernel=linear, probability=True, score=0.6227272727272727, total=   0.0s
[CV] C=10, gamma=0.1, kernel=linear, probability=True ................
[CV]  C=10, gamma=0.1, kernel=linear, probability=True, score=0.5022421524663677, total=   0.0s
[CV] C=10, gamma=0.1, kernel=linear, probability=True ................
[CV]  C=10, gamma=0.1, kernel=linear, probability=True, score=0.6126126126126126, total=   0.0s
[CV] C=10, gamma=0.1, kernel=linear, probability=True ................
[CV]  C=10, gamma=0.1, kernel=linear, probability=True, score=0.6181818181818182, total= 

[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   12.0s finished


In [73]:
print(best_svm)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [74]:
from sklearn.naive_bayes import MultinomialNB

m_nb = MultinomialNB()
vectorizer = CountVectorizer()
X_train_comments = vectorizer.fit_transform(X_train)
X_test_comments = vectorizer.fit_transform(X_test)


#param_grid = {'C':[0.1,1,10,100,1000], 'gamma':[1,0.1,0.001,0.0001], 'kernel': ['rbf', 'linear', 'poly'], }
param_grid = [
  {'alpha': [1, 0.5, 0.1]}
 ]
grid2 = GridSearchCV(m_nb,param_grid,verbose = 10)
grid2.fit(X_train_comments,y_train)
best_nb = grid2.best_estimator_



Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] alpha=1 .........................................................
[CV] ................ alpha=1, score=0.5560538116591929, total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] alpha=1 .........................................................
[CV] ................ alpha=1, score=0.6081081081081081, total=   0.0s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] alpha=1 .........................................................
[CV] ................ alpha=1, score=0.5954545454545455, total=   0.0s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s


[CV] alpha=0.5 .......................................................
[CV] .............. alpha=0.5, score=0.5022421524663677, total=   0.0s


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s


[CV] alpha=0.5 .......................................................
[CV] .............. alpha=0.5, score=0.5990990990990991, total=   0.0s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s


[CV] alpha=0.5 .......................................................
[CV] .............. alpha=0.5, score=0.5954545454545455, total=   0.0s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s


[CV] alpha=0.1 .......................................................
[CV] ............. alpha=0.1, score=0.37668161434977576, total=   0.0s


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s


[CV] alpha=0.1 .......................................................
[CV] ............... alpha=0.1, score=0.527027027027027, total=   0.0s


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s


[CV] alpha=0.1 .......................................................
[CV] .............. alpha=0.1, score=0.5181818181818182, total=   0.0s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s finished


In [75]:
best_nb

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [76]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([('vectorizer', CountVectorizer()),('clf', best_nb)])
clf.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1, class_prior=None, fit_prior=True))])

In [77]:
from sklearn.svm import SVC

clf2 = Pipeline([('vectorizer', CountVectorizer()),('clf2', best_svm)])
clf2.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [78]:
pred = clf.predict(X_test) #Negative - Neutral - Positive

In [79]:
X_train_comments

<665x1384 sparse matrix of type '<class 'numpy.int64'>'
	with 3212 stored elements in Compressed Sparse Row format>

In [80]:
for vector, p in zip(X_train_comments.toarray(),y_train):
    print(vector,p)

[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 0
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 1 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] 0
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 0
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1
[0 0 0 ... 0 0 0] -1
[0 0 0 ... 0 0 0] 1


In [81]:
pred_sentiment = clf.predict(X_test)
pred = clf.predict_proba(X_test)
neg_pred=[]
neut_pred=[]
pos_pred=[]

for i in range(0,len(pred)):
    neg_pred.append(pred[i][0])
    neut_pred.append(pred[i][1])
    pos_pred.append(pred[i][2])

In [82]:
result_data = list(zip(X_test, y_test, pred_sentiment, neg_pred, neut_pred,pos_pred))
nb_data = pd.DataFrame(result_data, columns = ["Comment","Actual","Predicted","Negative Prob","Neutral Prob","Positive Prob"])

In [83]:
nb_wrong = []
for i in range(0,len(result_data)):
    if(pred_sentiment[i] != result_data[i][1]):
        nb_wrong.append(result_data[i])      

In [84]:
pd.DataFrame(nb_wrong, columns = ["Comment","Actual","Predicted","Negative Prob","Neutral Prob","Positive Prob"])

Unnamed: 0,Comment,Actual,Predicted,Negative Prob,Neutral Prob,Positive Prob
0,joke funny,0,-1,0.762621,0.028905,0.208474
1,pinapahirap lessons inaral youtube dali,-1,1,0.320436,0.01723,0.662334
2,namamahiya nang estudyante,1,-1,0.529453,0.001575,0.468971
3,core university visible way teach,1,-1,0.499727,0.004673,0.4956
4,website teacher naming kasi dun nirerefer,-1,1,0.389899,0.002269,0.607832
5,wala book diniscuss nasa exams nakuha ata moon...,1,-1,0.993014,0.002505,0.004482
6,get late,1,-1,0.69979,0.013262,0.286948
7,sobrang panget pagmumukha,-1,1,0.429437,0.056968,0.513595
8,gumagamit visuals,-1,1,0.314577,0.083462,0.601962
9,approach student feel student problem,1,-1,0.722433,0.004977,0.272589


In [85]:
list_svm_data = []
data_highproba = []

threshold = .70

for i in range(0, len(result_data)):
    
    #Get the 75% Confusion Matrix
    
    if(result_data[i][3] < threshold) and (result_data[i][4] < threshold) and (result_data[i][5] < threshold):
        list_svm_data.append(result_data[i])
    
    if(result_data[i][3] >= threshold) or (result_data[i][4] >= threshold) or (result_data[i][5] >= threshold):
        data_highproba.append(result_data[i])
        
svm_data = pd.DataFrame(list_svm_data, columns = ["Comment","Actual","Predicted","Negative Prob","Neutral Prob","Positive Prob"])

In [86]:
actual_highproba = []
predicted_highproba = []
comment_highproba = []
for row in data_highproba:
    comment_highproba.append(row[0])
    actual_highproba.append(row[1])
    predicted_highproba.append(row[2])

In [87]:
data_highproba_df = pd.DataFrame({'Comment' :comment_highproba,
                                 'Actual': actual_highproba,
                                 'Predicted': predicted_highproba
                                })

In [88]:
data_highproba_df

Unnamed: 0,Comment,Actual,Predicted
0,binabasa powerpoint,-1,-1
1,chill laid style teaching maayos chill lang,1,1
2,say cod fish be whale stay alive escape be go ...,1,1
3,joke funny,0,-1
4,manyak if not get grade mehn biased,-1,-1
5,absent,-1,-1
6,binabasa libro nagtuturo,-1,-1
7,teacher father student,1,1
8,absent,-1,-1
9,maganda gamit visuals,1,1


In [89]:
actual_highproba = pd.Series(actual_highproba)
predicted_highproba = pd.Series(predicted_highproba)

In [90]:
y_test = svm_data.Actual
X_test = list(np.array(svm_data.Comment))

In [91]:
svm_pred = clf2.predict(X_test)

In [92]:
svm_pred

array([-1,  1,  1,  1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,
       -1, -1, -1, -1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1],
      dtype=int64)

In [93]:
svm_wrong = []
for i in range(0,len(list_svm_data)):
    if(svm_pred[i] != list_svm_data[i][1]):
        svm_wrong.append(list_svm_data[i])


In [94]:
pred2 = clf2.predict_proba(svm_data.Comment)
svm_neg_pred=[]
svm_neut_pred=[]
svm_pos_pred=[]
svm_comment = []
svm_actual = []
svm_predicted = []

for i in range(0,len(pred2)):
    svm_neg_pred.append(pred2[i][0])
    svm_neut_pred.append(pred2[i][1])
    svm_pos_pred.append(pred2[i][2])
    svm_comment.append(list_svm_data[i][0])
    svm_actual.append(list_svm_data[i][1])
    svm_predicted.append(list_svm_data[i][2])

In [95]:
list_svm_pred_data = list(zip(svm_comment,svm_actual,svm_predicted, svm_neg_pred, svm_neut_pred,svm_pos_pred))

In [96]:
data_lowproba_df = pd.DataFrame({
    'Comment': svm_comment,
    'Actual': y_test,
    'Predicted': svm_pred
})

In [97]:
data_lowproba_df

Unnamed: 0,Comment,Actual,Predicted
0,punctual professor give importance reward stud...,1,-1
1,follow syllabus time,1,1
2,pinapahirap lessons inaral youtube dali,-1,1
3,treat respect,1,1
4,namamahiya nang estudyante,1,-1
5,core university visible way teach,1,1
6,marunong umintindi umunawa estudyante,1,1
7,website teacher naming kasi dun nirerefer,-1,1
8,get late,1,-1
9,sobrang panget pagmumukha,-1,1


In [98]:
frames = [data_highproba_df, data_lowproba_df]

In [99]:
final_df = pd.concat(frames, ignore_index=True)

## ACCURACY

In [100]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
print(confusion_matrix(actual_highproba,predicted_highproba))
print(classification_report(actual_highproba,predicted_highproba))
print("Accuracy: ",accuracy_score(actual_highproba,predicted_highproba))

[[13  0  2]
 [ 1  0  0]
 [ 3  0 24]]


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

          -1       0.76      0.87      0.81        15
           0       0.00      0.00      0.00         1
           1       0.92      0.89      0.91        27

   micro avg       0.86      0.86      0.86        43
   macro avg       0.56      0.59      0.57        43
weighted avg       0.85      0.86      0.85        43

Accuracy:  0.8604651162790697


In [101]:
print(confusion_matrix(y_test,svm_pred))
print(classification_report(y_test,svm_pred))
print("Accuracy Score: ",accuracy_score(y_test,svm_pred))
svm_corrects = accuracy_score(y_test,svm_pred, normalize = False)

[[8 0 4]
 [5 0 1]
 [5 0 8]]


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

          -1       0.44      0.67      0.53        12
           0       0.00      0.00      0.00         6
           1       0.62      0.62      0.62        13

   micro avg       0.52      0.52      0.52        31
   macro avg       0.35      0.43      0.38        31
weighted avg       0.43      0.52      0.46        31

Accuracy Score:  0.5161290322580645


# Export model

```python
import pandas as pd

data = pd.read_csv('all_data.csv', encoding = "ISO-8859-1", header=None, names=['Comment', 'Tag'])
```

```python
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
comments = vectorizer.fit_transform(data['Comment'])
labels = data['Tag']
svm = SVC()
from sklearn.grid_search import GridSearchCV
param_grid = [
  {'C': [1, 10, 100, 1000,10000], 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['linear'],'probability': [True]},
  {'C': [1, 10, 100, 1000, 10000], 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf'], 'probability': [True]}
 ]
grid1 = GridSearchCV(svm,param_grid,verbose = 10)
grid1.fit(comments,labels)
best_svm = grid1.best_estimator_
```

```python
from sklearn.naive_bayes import MultinomialNB

m_nb = MultinomialNB()

from sklearn.grid_search import GridSearchCV
#param_grid = {'C':[0.1,1,10,100,1000], 'gamma':[1,0.1,0.001,0.0001], 'kernel': ['rbf', 'linear', 'poly'], }
param_grid = [
  {'alpha': [1, 0.5, 0.1]}
 ]
grid2 = GridSearchCV(m_nb,param_grid,verbose = 10)
grid2.fit(comments,labels)
best_nb = grid2.best_estimator_
```

```python
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([('vectorizer', CountVectorizer()),('clf', best_nb)])
clf.fit(data['Comment'],data['Tag'])
```

```python
from sklearn.svm import SVC

clf2 = Pipeline([('vectorizer', CountVectorizer()),('clf2', best_svm)])
clf2.fit(data['Comment'],data['Tag'])
```

```python
from joblib import dump
```

```python
dump(clf, 'clf_nb.joblib')
```

```python
dump(clf2, 'clf_svm.joblib')
```