In [1]:
import re

import numpy as np # linear algebra
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle

In [2]:
# Load the raw training data from the CSV file in the working directory.
TRAIN_CSV = 'train_40k.csv'
df_original = pd.read_csv(TRAIN_CSV)

In [3]:
# Only two columns are needed: 'Text' is the input used to predict the label 'Cat1'.
columns = ['Text', 'Cat1']

# Shuffle the rows with a fixed seed. Without random_state the row order changes
# on every run, which in turn changes the (seeded) train/test split below and
# makes the reported accuracy irreproducible.
df = shuffle(df_original[columns], random_state=42)

In [4]:
# Number of rows per 'Cat1' label (shows how balanced the classes are).
df['Cat1'].value_counts()

toys games              10266
health personal care     9772
beauty                   5846
baby products            5637
pet supplies             4862
grocery gourmet food     3617
Name: Cat1, dtype: int64

In [5]:
# Preview the shuffled frame. The raw 'Text' values still contain punctuation
# and mixed case (visible in the output below), so the data needs cleaning
# before it is fed to the model.
df

Unnamed: 0,Text,Cat1
11015,I was looking through the stuff on Amazon the ...,health personal care
33137,I have been using Sensodyne Toothpaste for the...,health personal care
4003,"This adorable, little, green bear is Erin, and...",toys games
35629,This is such a feminine scent; mysterious ambe...,beauty
26977,Unless you have legs like a chicken forget thi...,health personal care
...,...,...
12047,The Apex Pocket Med Pack with 7 Day Tray solve...,health personal care
25402,sizes are much smaller than what is recomended...,health personal care
7323,We bought this for our then 4-year-old - and a...,toys games
2768,I started off with the Avents. My son had the ...,baby products


In [6]:
# Remove special characters from the review text: every run of characters that
# is neither a word character (\w) nor whitespace (\s) is deleted.
# `re` is imported together with the other imports at the top of the notebook.
p = re.compile(r'[^\w\s]+')

# Vectorised .str.replace applies the compiled pattern to each row and leaves
# missing values as NaN instead of raising TypeError on a non-string entry,
# which the previous per-row p.sub(...) list comprehension would have done.
df['Text'] = df['Text'].str.replace(p, '', regex=True)

In [7]:
# Lower-case every column. DataFrame.apply returns a new frame and does not
# modify df in place, so the result must be assigned back; previously it was
# only displayed and then discarded, leaving df unchanged for the later cells.
df = df.apply(lambda col: col.astype(str).str.lower())

# Display the cleaned frame (same preview the cell produced before).
df


Unnamed: 0,Text,Cat1
11015,i was looking through the stuff on amazon the ...,health personal care
33137,i have been using sensodyne toothpaste for the...,health personal care
4003,this adorable little green bear is erin and he...,toys games
35629,this is such a feminine scent mysterious amber...,beauty
26977,unless you have legs like a chicken forget thi...,health personal care
...,...,...
12047,the apex pocket med pack with 7 day tray solve...,health personal care
25402,sizes are much smaller than what is recomended...,health personal care
7323,we bought this for our then 4yearold and at a...,toys games
2768,i started off with the avents my son had the h...,baby products


In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable for a given row order.
x = df['Text']
y = df['Cat1']
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Classification pipeline with three named steps:
#   'vect' - TF-IDF features over unigrams and bigrams, English stop words removed
#   'chi'  - keep the 10000 features with the highest chi-squared score
#   'clf'  - linear SVM with an L1 penalty

pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 2),
        sublinear_tf=True,
    )),
    ('chi', SelectKBest(score_func=chi2, k=10000)),
    ('clf', LinearSVC(penalty='l1', dual=False, C=1.0, max_iter=3000)),
])

In [10]:
# Fit the pipeline on the training split; the value returned by fit() is kept
# as `model` and used for scoring and prediction below.

model = pipeline.fit(X=train_x, y=train_y)

In [11]:
# Score the fitted model on the held-out test split and report it.

accuracy = model.score(test_x, test_y)
print(f'accuracy score: {accuracy}')

accuracy score: 0.84225


In [12]:
# Spot-check the model on a hand-written review.
pet_review = ['bone lasted forever, will buy again']
print(model.predict(pet_review))

['pet supplies']


In [13]:
# Spot-check the model on a second hand-written review.
beauty_review = ['nice lipstick']
print(model.predict(beauty_review))


['beauty']
