## Baseline development

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

### SVM

In [2]:
def _tokens_to_text(cell):
    """Turn a stringified token list such as "['a', 'b']" into the string "a b"."""
    tokens = cell.strip("[]").replace("'", '').split(", ")
    return ' '.join(tokens)


# Both splits share the same layout: the first CSV column is used as the index
# and the "text" column is parsed from a token-list string into plain text.
train = pd.read_csv(
    '../input/wine-preferences/train_cleaned.csv',
    converters={"text": _tokens_to_text},
    index_col=0,
)
test = pd.read_csv(
    '../input/wine-preferences/test_cleaned.csv',
    converters={"text": _tokens_to_text},
    index_col=0,
)

In [3]:
# Preview the first rows of the training frame to sanity-check the parsed columns.
train.head()

Unnamed: 0,label,text
0,1,sound track beauti paint seneri mind well woul...
1,1,im read lot review say best game soundtrack fi...
2,1,soundtrack favorit music time hand intens sad ...
3,1,truli like soundtrack enjoy video game music p...
4,1,youv play game know divin music everi singl so...


In [4]:
# Number of leading training rows used by the cells below: the TF-IDF training
# matrix and the SVM labels are both sliced with .iloc[0:train_num].
train_num = 500

In [5]:
# Fit the vectoriser (vocabulary capped at 5000 features) on the full training
# text, then vectorise only the first train_num rows that the models train on.
tfidf_vect = TfidfVectorizer(max_features=5000).fit(train['text'])
train_X_Tfidf = tfidf_vect.transform(train['text'].iloc[:train_num])


In [6]:
# Vectorise the test text with the vectoriser already fitted on train['text'];
# only transform() is called here, so nothing is refit on the test split.
test_X_Tfidf = tfidf_vect.transform(test['text'])

In [7]:
# Display the sparse-matrix summary of the training features as a shape check.
train_X_Tfidf

<500x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 14860 stored elements in Compressed Sparse Row format>

## Linear kernel

In [8]:
# Linear-kernel SVM with otherwise default hyperparameters, trained on the
# first train_num rows; the bare fit() call is left last so the cell shows
# the fitted estimator.
svm_model = SVC(kernel='linear')
svm_model.fit(X=train_X_Tfidf, y=train['label'].iloc[:train_num])

SVC(kernel='linear')

In [9]:
# Predict labels for the training subset and for the test set with the
# currently fitted model.
train_predicted, test_predicted = map(
    svm_model.predict, (train_X_Tfidf, test_X_Tfidf)
)

In [10]:
# Report test-set accuracy and F1 for the linear kernel, one item per line.
print(
    'Linear kernel',
    'Test acc: {}'.format(accuracy_score(test['label'], test_predicted )),
    'Test F1: {}'.format(f1_score(test['label'], test_predicted)),
    sep='\n',
)

Linear kernel
Test acc: 0.72119
Test F1: 0.7213376910237573


## Polynomial kernel

In [11]:
# Polynomial-kernel SVM with otherwise default hyperparameters, trained on the
# first train_num rows; the bare fit() call is left last so the cell shows
# the fitted estimator.
svm_model = SVC(kernel='poly')
svm_model.fit(X=train_X_Tfidf, y=train['label'].iloc[:train_num])

SVC(kernel='poly')

In [12]:
# Predict labels for the training subset and for the test set with the
# currently fitted model.
train_predicted, test_predicted = map(
    svm_model.predict, (train_X_Tfidf, test_X_Tfidf)
)

In [13]:
# Report test-set accuracy and F1 for the polynomial kernel, one item per line.
print(
    'Polynomial kernel',
    'Test acc: {}'.format(accuracy_score(test['label'], test_predicted )),
    'Test F1: {}'.format(f1_score(test['label'], test_predicted)),
    sep='\n',
)

Polynomial kernel
Test acc: 0.6175825
Test F1: 0.6949670672832453


## Sigmoid kernel

In [14]:
# Sigmoid-kernel SVM with otherwise default hyperparameters, trained on the
# first train_num rows; the bare fit() call is left last so the cell shows
# the fitted estimator.
svm_model = SVC(kernel='sigmoid')
svm_model.fit(X=train_X_Tfidf, y=train['label'].iloc[:train_num])

SVC(kernel='sigmoid')

In [15]:
# Predict labels for the training subset and for the test set with the
# currently fitted model.
train_predicted, test_predicted = map(
    svm_model.predict, (train_X_Tfidf, test_X_Tfidf)
)

In [16]:
# Report test-set accuracy and F1 for the sigmoid kernel, one item per line.
print(
    'Sigmoid kernel',
    'Test acc: {}'.format(accuracy_score(test['label'], test_predicted )),
    'Test F1: {}'.format(f1_score(test['label'], test_predicted)),
    sep='\n',
)

Sigmoid kernel
Test acc: 0.7221925
Test F1: 0.723910467118356
