#### Craigslist post classification challenge : HackerRank
https://www.hackerrank.com/challenges/craigslist-post-classifier-the-category

#### Experimenting with text classification
I'm following the scikit-learn tutorial "Working With Text Data":
http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

##### Steps:
##### - convert the heading field of each post into numeric features, as shown in the tutorial linked above
##### - build the feature set from the transformed heading together with the other features
##### - train models and test their performance
##### - plot some charts

In [1]:
import pandas as pd
import numpy as np
import sys
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Read the training data.
# The first line of training.json holds the number of records; every following
# line is one record that is parsed with ast.literal_eval.
X_train = []
y_train = []
# The context manager guarantees the file is closed, even if parsing fails.
with open("training.json") as train:
    nb_train = int(train.readline())
    for _ in range(nb_train):
        # Parse each line once instead of once per extracted field.
        record = ast.literal_eval(train.readline())
        # Feature columns, in order: city, section, heading.
        X_train.append([record['city'], record['section'], record['heading']])
        y_train.append(record['category'])

X_train = np.array(X_train)

In [3]:
# Read the sample test data.
# Same layout as the training file (record count on the first line, then one
# record per line), but no 'category' field is read from these records.
X_test = []
# The context manager guarantees the file is closed, even if parsing fails.
with open("sample-test.in.json") as test:
    nb_test = int(test.readline())
    for _ in range(nb_test):
        # Parse each line once instead of once per extracted field.
        record = ast.literal_eval(test.readline())
        # Feature columns, in order: city, section, heading.
        X_test.append([record['city'], record['section'], record['heading']])

X_test = np.array(X_test)

In [4]:
# Stack train and test rows so both are vectorized with one shared vocabulary.
data = np.concatenate((X_train, X_test), axis=0)

# min_df=1 keeps every term that occurs in at least one document. That is the
# same vocabulary the previous min_df=0 selected, but an integer min_df of 0 is
# rejected by the parameter validation of recent scikit-learn releases.
countvec = CountVectorizer(min_df=1)
tfidf_transformer = TfidfTransformer()

# The plan in the notebook header was to combine the bag-of-words heading with
# the other features (city, section). That approach hit an issue and is on hold,
# so for now only the heading (column 2) is vectorized.
data_counts = countvec.fit_transform(data[:, 2])

# Re-weight the raw term counts with TF-IDF.
# NOTE(review): the vocabulary and IDF weights are fitted on train + test rows
# together, so test headings influence the features the models are trained on.
data_trans = tfidf_transformer.fit_transform(data_counts)

# Split the transformed matrix back into its train and test parts.
X_train_trans = data_trans[:nb_train]

X_test_trans = data_trans[nb_train: nb_train + nb_test]

#### Multinomial Naive Bayes Classifier 
###### useful tutorial : http://blog.datumbox.com/machine-learning-tutorial-the-naive-bayes-text-classifier/

In [5]:
# Fit a Multinomial Naive Bayes classifier on the TF-IDF heading features.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
# Left as the last expression so the notebook displays the fitted estimator.
clf.fit(X_train_trans, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [6]:
# Mean accuracy of the Multinomial NB model on the training set.
clf.score(X_train_trans, y_train)

0.76663204234060445

In [9]:
# Predicted category for every row of the sample test set.
res = clf.predict(X_test_trans)  

In [13]:
# Sanity check: one prediction per test row is expected.
res.shape

(15370,)

In [23]:
# Load the expected categories: one label per line, read in the same order as
# the predictions in `res`, with the trailing newline removed.
# The context manager guarantees the file is closed after reading.
with open("sample-test.out.json") as result:
    y_test = [result.readline().replace("\n", "") for _ in range(res.shape[0])]

In [24]:
# Convert to an array so it can be compared element-wise with the predictions.
y_test = np.array(y_test)

In [29]:
from __future__ import division
# Test accuracy computed by hand: matching predictions / number of predictions.
# The __future__ import makes `/` true division under Python 2 as well.
np.sum(y_test == res)/res.shape[0]

0.70481457384515289

In [30]:
# Mean accuracy of the Multinomial NB model on the sample test set.
clf.score(X_test_trans,y_test)

0.70481457384515289

##### Random Forest Classifier

In [37]:
# Fit a random forest with default hyperparameters on the same TF-IDF features.
# NOTE(review): no random_state is passed, so results may differ between runs.
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
RF.fit(X_train_trans, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [38]:
# Mean accuracy of the random forest on the training set.
RF.score(X_train_trans, y_train)

0.89033981302863929

In [39]:
# Mean accuracy of the random forest on the sample test set.
RF.score(X_test_trans,y_test)

0.71268705270006505

##### Support Vector Classifier

In [40]:
# Fit a support vector classifier with default hyperparameters.
from sklearn.svm import SVC
# NOTE: this rebinds the name SVC from the imported class to the classifier
# instance; the scoring cell that follows relies on that instance name.
SVC = SVC()
SVC.fit(X_train_trans, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [41]:
# Mean accuracy of the support vector classifier on the training set.
# Here SVC is the fitted instance, not the sklearn class.
SVC.score(X_train_trans, y_train)

0.11243013305633873

##### Gaussian Naive Bayes Classifier

In [43]:
# Fit a Gaussian Naive Bayes classifier on the TF-IDF heading features.
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB()
# The TF-IDF matrix is converted to a dense array before being passed in.
NB.fit(X_train_trans.toarray(), y_train)

GaussianNB(priors=None)

In [45]:
# Mean accuracy of Gaussian NB on the (densified) training set.
NB.score(X_train_trans.toarray(), y_train)

0.77212247118761435

In [46]:
# Mean accuracy of Gaussian NB on the (densified) sample test set.
NB.score(X_test_trans.toarray(), y_test)

0.60611581001951853

##### K Nearest Neighbor Classifier

In [47]:
# Fit a k-nearest-neighbours classifier with default hyperparameters.
from sklearn.neighbors import KNeighborsClassifier
KNNC = KNeighborsClassifier()
KNNC.fit(X_train_trans, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [48]:
# Mean accuracy of the k-NN classifier on the training set.
KNNC.score(X_train_trans, y_train)

0.76969876836325868

In [49]:
# Mean accuracy of the k-NN classifier on the sample test set.
KNNC.score(X_test_trans, y_test)

0.68568640208197784

In [50]:
# Number of records in the sample test set. The call form of print produces
# the same output on Python 2 and is valid syntax on Python 3.
print(nb_test)

15370
