In [6]:
import data
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.feature_extraction.text import *
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from scipy.sparse import hstack

In [2]:
# Obtain data
# Loads the datasets from the '../data' directory through the project-local `data` module.
# Later cells read train_input['comment_text'], test_input['id'] and train_output.columns.
# NOTE(review): all_input is presumably train and test inputs combined; it is not used
# in the cells visible here - confirm against data.getDataFrom.
train_input, train_output, test_input, all_input = data.getDataFrom('../data')

In [3]:
# Feature extraction from the text
# Two CountVectorizer -> TfidfTransformer pipelines are built: one over character
# n-grams and one over word n-grams.

# Character n-grams of length 3 to 6, vocabulary capped at 15000 entries
count_char_vect = CountVectorizer(analyzer='char',ngram_range=(3, 6), max_features=15000)
# sublinear_tf=True replaces the raw term frequency tf with 1 + log(tf)
tfidf_char_transformer = TfidfTransformer(sublinear_tf=True)
charPipe = make_pipeline(count_char_vect,tfidf_char_transformer)

# Word unigrams and bigrams, vocabulary capped at 10000 entries
count_word_vect = CountVectorizer(ngram_range=(1, 2), max_features = 10000)
tfidf_word_transformer = TfidfTransformer(sublinear_tf=True)
wordPipe = make_pipeline(count_word_vect,tfidf_word_transformer)

# Vocabularies and IDF weights are learned from the training comments only
charPipe.fit(train_input['comment_text'])
wordPipe.fit(train_input['comment_text'])

# Apply extraction to the training data.
# NOTE(review): data.getFeatures is project-local; it presumably returns extra
# hand-crafted features plus the transformed word and char matrices - confirm.
extra, word, char = data.getFeatures(train_input, wordPipe, charPipe)
# hstack returns COO by default, which the estimators and cross-validation would have to
# convert on every fit/predict call. Requesting CSR once hoists that conversion.
features_train = hstack([extra,word,char], format='csr')

# And for test data (same fitted pipelines, so the columns line up with training)
extra_test, word_test, char_test = data.getFeatures(test_input, wordPipe, charPipe)
features_test = hstack([extra_test,word_test,char_test], format='csr')

# Sanity check: (n_samples, n_features) of the stacked training matrix
features_train.shape

(159571, 25011)

In [4]:
# Choose the model (from cell)
# Each of the model cells rebinds the same `model` name, so only the one executed last
# is used by the cross-validation / prediction cell. Run exactly one of them.
# Logistic Regression
# C is the inverse of the regularization strength: larger C means weaker regularization.
model = LogisticRegression(C=10)

In [None]:
# LSA + Logistic Regression
# Alternative model: rebinds `model`, replacing whatever an earlier model cell assigned.
# TruncatedSVD reduces the sparse feature matrix to 50 components. Its default solver is
# randomized, so random_state is fixed to make the results reproducible between runs.
SVD = decomposition.TruncatedSVD(n_components=50, random_state=42)
# copy=False lets the normalizer rescale the SVD output in place instead of copying it
normalizer = Normalizer(copy=False)
logreg = LogisticRegression(C=5)
model = make_pipeline(SVD, normalizer,logreg)

In [9]:
# LSA + SVM
# Alternative model: rebinds `model`, replacing whatever an earlier model cell assigned.
# TruncatedSVD reduces the sparse feature matrix to 50 components. Its default solver is
# randomized, so random_state is fixed to make the results reproducible between runs.
SVD = decomposition.TruncatedSVD(n_components=50, random_state=42)
# copy=False lets the normalizer rescale the SVD output in place instead of copying it
normalizer = Normalizer(copy=False)
# probability=True is required for predict_proba; it runs an internal cross-validation
# whose shuffling is controlled by random_state, so that is fixed as well.
# NOTE(review): kernel SVC fit time grows at least quadratically with the number of
# samples; with ~160k training rows this cell's model may be very slow to fit.
svc = SVC(C=1.0, kernel="rbf", probability=True, random_state=42)
model = make_pipeline(SVD, normalizer,svc)

In [None]:
# Cross-validate, fit and predict, one label column at a time.
# `prediction` collects the submission columns: the test ids plus, for each label,
# the predicted probability of the positive class on the test features.
prediction = {'id': test_input['id']}
for output_name in train_output.columns:
    target = train_output[output_name]
    # cross_val_score returns one score per fold (higher is better), using the
    # estimator's default scorer since no `scoring` argument is passed.
    cv_scores = cross_val_score(model, features_train, target, cv=5)
    print('CV score for column {} is {}'.format(output_name, cv_scores))
    # Refit on the full training set before predicting on the test set
    model.fit(features_train, target)
    # Column 1 of predict_proba is the probability of the second class in model.classes_
    prediction[output_name] = model.predict_proba(features_test)[:, 1]

In [6]:
# Format submission
# Turn the per-column predictions (ids plus one probability column per label) into a
# table and write it to submission.csv without the DataFrame index.
submission = pd.DataFrame(prediction)
submission.to_csv('submission.csv', index=False)

In [22]:
# Debug Cell(s)
# Only the final expression of a cell is rendered, so the intermediate shapes are
# printed explicitly; as bare expressions they were evaluated and silently discarded.
print(features_train.shape)
print(char.shape)
# Callable that applies the char vectorizer's preprocessing and n-gram splitting
analyze = count_char_vect.build_analyzer()
# Transform one ad-hoc string through the fitted char pipeline to inspect its features
res = charPipe.transform(["Hey ! Sasfuhfhfqz65sd65fds6dsffsd5lutfdgsdggsd"])
res

<1x22 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [14]:
prediction['train'] = model.predict_proba(features_train)[:, 1]

In [23]:
prediction['train'][159557]

0.07269556065122973