In [1]:
import data
import models
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.feature_extraction.text import *
from sklearn.pipeline import make_pipeline
from scipy.sparse import hstack

In [None]:
# Load the datasets from the data directory; the call's result is unpacked into
# train_input, train_output and test_input, which the later cells consume.
DATA_DIR = '../data'
train_input, train_output, test_input = data.getDataFrom(DATA_DIR)

In [None]:
# Feature extraction from the text.
# Two pipelines turn raw text into TF-IDF-weighted n-gram counts:
#   * charPipe: character 1- and 2-grams
#   * wordPipe: word 1- and 2-grams
count_char_vect = CountVectorizer(analyzer='char', ngram_range=(1, 2))
tfidf_char_transformer = TfidfTransformer()
charPipe = make_pipeline(count_char_vect, tfidf_char_transformer)

count_word_vect = CountVectorizer(ngram_range=(1, 2))
tfidf_word_transformer = TfidfTransformer()
wordPipe = make_pipeline(count_word_vect, tfidf_word_transformer)

# Training features. The pipelines are passed in and handed back by data.getFeatures.
# NOTE(review): presumably they come back fitted and the three True flags enable the
# extra / word / char feature groups -- confirm against data.py.
extra, word, char, wordPipe, charPipe = data.getFeatures(
    train_input, True, True, True, wordPipe, charPipe)
# Stack the feature groups column-wise and ask for CSR explicitly: without format=
# the result format depends on the inputs and may be COO, which cannot be row-indexed.
# The cross-validation splits index rows, so convert once here instead of on every fit.
features_train = hstack([extra, word, char], format='csr')

# Test features, built with the pipelines returned by the training call above.
extra_test, word_test, char_test, wordPipe, charPipe = data.getTestFeatures(
    test_input, True, True, True, wordPipe, charPipe)
features_test = hstack([extra_test, word_test, char_test], format='csr')

In [None]:
# Choose the model: project the stacked sparse features onto 3 SVD components,
# normalise each row in place (copy=False), then fit a logistic regression (C=10).
# TruncatedSVD's default solver is randomized, so it is seeded here; otherwise the
# CV scores and the submission change from one run of the notebook to the next.
SEED = 42
model = make_pipeline(
    decomposition.TruncatedSVD(n_components=3, random_state=SEED),
    preprocessing.Normalizer(copy=False),
    LogisticRegression(C=10),
)

In [None]:
# Train one classifier per column of train_output.
# For each column: print the 5-fold cross-validation scores (cross_val_score's default
# scorer, since no scoring= is given), refit the pipeline on all training rows, and
# keep the second predict_proba column for the test rows.
# NOTE(review): column index 1 is assumed to be the positive class -- confirm the
# labels are binary 0/1.
prediction = {'id': test_input['id']}
for target in train_output.columns:
    y_train = train_output[target]
    fold_scores = cross_val_score(model, features_train, y_train, cv=5)
    print('CV score for column {} is {}'.format(target, fold_scores))
    model.fit(features_train, y_train)
    test_proba = model.predict_proba(features_test)
    prediction[target] = test_proba[:, 1]

In [None]:
# Format submission: turn the prediction dict (one entry per column) into a table
# and write it to CSV without the index column.
submission = pd.DataFrame(prediction)
submission.to_csv('submission.csv', index=False)

In [16]:
# Debug Cell
# Leftover inspection cell, not part of the pipeline.
# NOTE(review): the first two bare expressions are evaluated and discarded; only the
# final expression's value is rendered as the cell output. Wrap them in display(...)
# or split them into separate cells if all three are meant to be shown.
prediction
features_test.shape
test_input.shape

(159571, 8)