In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import FeatureUnion, Pipeline


In [2]:
# Both inputs are tab-separated files; `train` carries a Sentiment label
# column, while `test` holds only the phrases to be scored.
train = pd.read_csv('./data/train.tsv', delimiter="\t")
test = pd.read_csv('./data/test.tsv', delimiter="\t")


In [3]:
# Peek at the first three labelled rows to check the column layout.
train.head(n=3)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2


In [4]:
# Peek at the first three unlabelled rows (same columns as train, minus Sentiment).
test.head(n=3)

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An


In [12]:
# Word-level TF-IDF over 1- to 5-grams; n-grams seen in fewer than 3 phrases
# are dropped to keep the vocabulary manageable.
words = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 5),
    min_df=3,
    use_idf=True,
)

# Character-level TF-IDF using the vectorizer's default n-gram range.
char = TfidfVectorizer(
    analyzer="char",
    use_idf=True,
    binary=False,
)


In [13]:
# SGDClassifier shuffles the training data between epochs; without a fixed
# seed every "Restart & Run All" yields a different model and submission.
RANDOM_STATE = 42

# Stack the word-level and character-level TF-IDF features side by side.
feat = FeatureUnion([('words', words), ('char', char)])

# Full text classifier: feature extraction followed by a linear model trained
# with hinge loss via stochastic gradient descent.
text_clf = Pipeline([
    ('feat', feat),
    ('clf', SGDClassifier(loss='hinge', alpha=1e-05, max_iter=1000,
                          random_state=RANDOM_STATE)),
])


In [14]:
# Fit the vectorizers and the classifier in one pass: raw phrases in,
# sentiment labels as the target.
text_clf.fit(X=train['Phrase'], y=train['Sentiment'])

Pipeline(memory=None,
     steps=[('feat', FeatureUnion(n_jobs=1,
       transformer_list=[('words', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))])

In [15]:
# Run the raw test phrases through the fitted pipeline to get one label per row.
predictions = text_clf.predict(X=test['Phrase'])


In [16]:
# Show the predicted labels for the test phrases (long arrays are elided in the repr).
predictions


array([3, 3, 2, ..., 2, 2, 1])

In [17]:
# Submission table: the test PhraseId column paired with the predicted
# Sentiment for the same row.
output = test[['PhraseId']].assign(Sentiment=predictions)

# Write without the DataFrame index so the file holds only the two columns.
output.to_csv("MySubmission.csv", index=False)

## Links:
- [Using Pipelines and FeatureUnions in scikit-learn](http://michelleful.github.io/code-blog/2015/06/20/pipelines/)
- [Using scikit-learn Pipelines and FeatureUnions](http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html)

