In [1]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from nltk.stem import PorterStemmer
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline



In [2]:
def preprocess(text):
    """Rewrite the detached contraction suffix " 't" as " not".

    Args:
        text: Any object exposing ``replace(old, new)``; in practice a ``str``.

    Returns:
        Whatever ``text.replace(" 't", " not")`` returns for the given object.
    """
    contraction, expansion = " 't", " not"
    return text.replace(contraction, expansion)

def create_pipeline(vectorizer, classifier):
    """Build a two-step scikit-learn ``Pipeline``.

    Args:
        vectorizer: Transformer placed first, under the step name "vectorizer".
        classifier: Estimator placed last, under the step name "classifier".

    Returns:
        An unfitted ``Pipeline`` chaining the two steps in that order.
    """
    steps = [
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ]
    return Pipeline(steps)

# Built once and reused by every call: ``stemmed`` is invoked once per document
# by the vectorizers, and neither object depends on the document being analysed.
_STEMMER = PorterStemmer()
_WORD_ANALYZER = TfidfVectorizer().build_analyzer()


def stemmed(text):
    """Yield the Porter-stemmed word tokens of ``text``.

    The text is first passed through ``preprocess`` and then split into tokens
    by the analyzer of a default-configured ``TfidfVectorizer``; each token is
    stemmed lazily.

    Args:
        text: A single document as a string.

    Returns:
        A generator producing one stemmed token per analyzer token.
    """
    return (_STEMMER.stem(token) for token in _WORD_ANALYZER(preprocess(text)))

In [4]:
def stem_tokens(text):
    """Return the stemmed tokens of ``text`` as a list.

    Wraps ``stemmed`` so it can be used as a vectorizer ``tokenizer``. It is a
    named module-level function rather than a lambda so that the pipeline can
    still be serialised with pickle/joblib.
    """
    return list(stemmed(text))


# A callable ``analyzer`` replaces the vectorizer's whole analysis step, so
# ``ngram_range`` is ignored for it. With ``analyzer=stemmed`` the "stem23"
# branch therefore produced the same unigram features as "stem11". Stemming is
# now plugged in as the tokenizer of a word analyzer, which lets the vectorizer
# assemble the 2- and 3-grams from the stemmed tokens.
union = FeatureUnion(
    [("word11", TfidfVectorizer(ngram_range=(1,1), analyzer='word')),
     ("stem11", TfidfVectorizer(ngram_range=(1,1), analyzer=stemmed)),
     ("word23", TfidfVectorizer(ngram_range=(2,3), analyzer='word')),
     ("stem23", TfidfVectorizer(ngram_range=(2,3), analyzer='word', tokenizer=stem_tokens)),
     ("char14", TfidfVectorizer(ngram_range=(1,4), analyzer='char'))
    ]
)

# Concatenated TF-IDF features feed a linear SVM.
pipeline = create_pipeline(union, LinearSVC())

In [5]:
# Training set: read with header=None, so the column names are supplied here.
data = pd.read_csv(
    "../../data/products_sentiment_train.tsv",
    sep='\t',
    header=None,
    names=["text", "y"],
)
# Prediction set: read with pandas' default header handling.
pred_data = pd.read_csv("../../data/products_sentiment_test.tsv", sep="\t")
data.head()

Unnamed: 0,text,y
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [6]:
# ``preprocess`` is written for single strings (substring substitution via
# ``str.replace``). Handing it a whole Series dispatches to ``Series.replace``
# instead, which only swaps cells whose entire value equals " 't" and so left
# the text unchanged. Apply it to each document individually.
data["text"] = data["text"].apply(preprocess)
pred_data["text"] = pred_data["text"].apply(preprocess)

In [7]:
# Fit the vectorizer union and the classifier on the training texts and labels;
# the trailing semicolon suppresses the estimator repr in the cell output.
pipeline.fit(data["text"], data["y"]);

In [8]:
# Persist the fitted pipeline; the call returns the list of files written.
joblib.dump(value=pipeline, filename='../pipline_model.pkl')

['../pipline_model.pkl']