In [140]:
import json

from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.metrics import f1_score
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

In [130]:
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1, 1), sublinear_tf=True)
hashing_vectorizer = HashingVectorizer(non_negative=True, ngram_range=(1, 3), n_features = 2**18, decode_error="ignore")
tfidf_transformer = TfidfTransformer(use_idf=True, sublinear_tf=True, smooth_idf=True)

In [15]:
raw_data = {}
with open('data.json') as data_file:    
    raw_data = json.load(data_file)

In [21]:
tags = []

for item in raw_data.items():
    tags += item[1]["features"].keys()
    
tags = set(tags)

In [23]:
def extract_target(data):
    target = []
    for _, item in data.items():
        t = -1 if item["target"] == 0 else 1
        target.append(t)
    return target

In [95]:
def extract_text_data(data, keys):
    text_data = []

    for id, item in data.items():
        bag = []

        for key in keys:
            if key in item["features"].keys():
                words = item["features"][key]["words"]
                bag += words
#                 bag += [''.join(b) for b in zip(words, words[1:])] # 2-grams
#                 bag += [''.join(b) for b in zip(words, words[1:], words[2:])] # 3-grams

        text_data.append(" ".join(bag))
        
    return text_data

In [96]:
text_data = extract_text_data(raw_data, tags)

In [38]:
tfidf_data = vectorizer.fit_transform(text_data)

In [131]:
hash_data = hashing_vectorizer.fit_transform(text_data)

In [None]:
hashing_vectorizer.transform()

In [133]:
y = extract_target(raw_data)

In [141]:
# joblib.dump(hashing_vectorizer, "models/hashing_vectorizer")

['models/hashing_vectorizer']

In [134]:
def model_1_lg(X, y):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y,
                         test_size=0.33, random_state=42)
    
    clf = LogisticRegression(penalty="l1", C=100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print f1_score(y_test, y_pred)

model_1_lg(hash_data, y)

0.953652206155


In [135]:
from sklearn.linear_model import PassiveAggressiveClassifier

In [149]:
def model_2_pa(X, y):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y,
                         test_size=0.05, random_state=42)
    
    clf = PassiveAggressiveClassifier(C=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print f1_score(y_test, y_pred)
    
    return clf

clf = model_2_pa(hash_data, y)

0.961439588689


In [150]:
joblib.dump(clf, "models/model_2_pa")

['models/model_2_pa',
 'models/model_2_pa_01.npy',
 'models/model_2_pa_02.npy',
 'models/model_2_pa_03.npy',
 'models/model_2_pa_04.npy']

In [145]:
from sklearn.naive_bayes import MultinomialNB

In [148]:
def model_3_nb(X, y):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y,
                         test_size=0.33, random_state=42)
    
    clf = MultinomialNB(alpha=0.3)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print f1_score(y_test, y_pred)

model_3_nb(hash_data, y)

0.937010808796


In [139]:
for id, item in raw.items():
    item["target"]
    

NameError: name 'raw' is not defined