In [6]:
import data
import models
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.feature_extraction.text import *
from sklearn.pipeline import make_pipeline
from scipy.sparse import hstack

In [7]:
# Load the datasets from the shared data directory.
# Later cells index these as frames: all_input['comment_text'],
# test_input['id'] and train_output.columns.
# NOTE(review): all_input is presumably train + test inputs combined --
# confirm against data.getDataFrom.
(
    train_input,
    train_output,
    test_input,
    all_input,
) = data.getDataFrom('../data')

In [None]:
# Feature extraction from the comment text.
# Two bag-of-n-grams pipelines are built, each turning raw counts into
# sublinear TF-IDF weights: one over character n-grams (3..6 chars, capped
# at 15000 features) and one over word uni/bi-grams (capped at 10000).
count_char_vect = CountVectorizer(analyzer='char', ngram_range=(3, 6), max_features=15000)
tfidf_char_transformer = TfidfTransformer(sublinear_tf=True)
charPipe = make_pipeline(count_char_vect, tfidf_char_transformer)

count_word_vect = CountVectorizer(ngram_range=(1, 2), max_features=10000)
tfidf_word_transformer = TfidfTransformer(sublinear_tf=True)
wordPipe = make_pipeline(count_word_vect, tfidf_word_transformer)

# Vocabularies and IDF weights are learned from all_input.
# NOTE(review): if all_input includes the test comments, the vectorizers see
# test text at fit time -- confirm this is intended.
charPipe.fit(all_input['comment_text'])
wordPipe.fit(all_input['comment_text'])

# Apply extraction to the training data.
# NOTE(review): `extra` presumably holds hand-crafted features computed by
# data.getFeatures -- confirm in the data module.
extra, word, char = data.getFeatures(train_input, wordPipe, charPipe)
# Ask for CSR explicitly: hstack's default output format is unspecified, and
# the cross-validation and fitting steps need row-sliceable CSR input, so
# converting once here avoids repeating the conversion on every call.
features_train = hstack([extra, word, char], format='csr')

# Same transformation for the test data.
extra_test, word_test, char_test = data.getFeatures(test_input, wordPipe, charPipe)
features_test = hstack([extra_test, word_test, char_test], format='csr')
features_train.shape

In [None]:
# Choose the model.
# "sag" is a stochastic solver, so random_state is pinned to keep the
# cross-validation scores and the submitted predictions reproducible
# across kernel restarts.
model = LogisticRegression(solver="sag", random_state=42)

In [None]:
# For each target column: cross-validate, refit on all training rows, and
# store the predicted probability for the test rows.
prediction = {'id': test_input['id']}
for output_name in train_output.columns:
    target = train_output[output_name]
    # Score with ROC AUC rather than the estimator's default (accuracy):
    # the submission contains probabilities, so the CV metric should judge
    # the probability ranking, not labels thresholded at 0.5.
    cv_scores = cross_val_score(model, features_train, target, cv=5, scoring='roc_auc')
    print('CV score for column {} is {}'.format(output_name, cv_scores))
    # cross_val_score fits clones, so `model` itself is fitted only here.
    model.fit(features_train, target)
    # Column 1 of predict_proba is kept.
    # NOTE(review): assumes each target is binary with the positive class
    # second in model.classes_ -- confirm.
    prediction[output_name] = model.predict_proba(features_test)[:, 1]

In [None]:
# Build the submission table from the collected predictions (the 'id' column
# plus one column per target) and write it to disk without the index.
submission = pd.DataFrame(prediction)
submission.to_csv('submission.csv', index=False)

In [22]:
# Debug cell: sanity-check the feature matrix shapes and probe the char pipeline.
# Only the last expression of a cell is echoed, so the intermediate shapes
# are printed explicitly instead of being silently discarded.
print(features_train.shape)
print(char.shape)
# Analyzer callable of the char CountVectorizer; not used further in this
# cell, kept for interactive inspection.
analyze = count_char_vect.build_analyzer()
# Transform one out-of-sample string to see how many features it activates.
res = charPipe.transform(["Hey ! Sasfuhfhfqz65sd65fds6dsffsd5lutfdgsdggsd"])
res

<1x22 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

{'e': 3687,
 'x': 5521,
 'p': 4786,
 'l': 4362,
 'a': 3286,
 'n': 4570,
 't': 5162,
 'i': 4087,
 'o': 4683,
 '\n': 0,
 'w': 5447,
 'h': 3975,
 'y': 5588,
 ' ': 73,
 'd': 3585,
 's': 5054,
 'm': 4467,
 'u': 5278,
 'r': 4934,
 'c': 3487,
 'f': 3796,
 'v': 5368,
 '?': 2904,
 "'": 1301,
 ',': 1650,
 'j': 4195,
 'g': 3880,
 'k': 4265,
 '.': 1794,
 '8': 2516,
 '9': 2584,
 '2': 2107,
 '0': 1964,
 '5': 2315,
 '3': 2179,
 '7': 2447,
 'ex': 3751,
 'xp': 5570,
 'pl': 4833,
 'la': 4399,
 'an': 3339,
 'na': 4609,
 'at': 3345,
 'ti': 5209,
 'io': 4140,
 'on': 4735,
 'n\n': 4571,
 '\nw': 57,
 'wh': 5489,
 'hy': 4037,
 'y ': 5590,
 ' t': 131,
 'th': 5208,
 'he': 4017,
 'e ': 3689,
 ' e': 116,
 'ed': 3731,
 'di': 3632,
 'it': 4145,
 'ts': 5219,
 's ': 5056,
 ' m': 124,
 'ma': 4505,
 'ad': 3329,
 'de': 3628,
 ' u': 132,
 'un': 5328,
 'nd': 4612,
 'er': 3745,
 'r ': 4936,
 'my': 4529,
 'us': 5333,
 'se': 5099,
 'rn': 4986,
 'am': 3338,
 'me': 4509,
 ' h': 119,
 'ha': 4013,
 'ar': 3343,
 'rd': 4976,
 'dc'