In [11]:
# linear algebra
import numpy as np
# data processing and CSV IO
import pandas as pd
# the actual ML stuff
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [4]:
# Load the provided training and test sets (paths are relative to the notebook)
train_df, test_df = pd.read_csv("data/train.csv"), pd.read_csv("data/test.csv")

`CountVectorizer`: converts a collection of text documents into a matrix of token (word) counts.

In [35]:
# Raw text columns that feed the bag-of-words model
train_texts = train_df["text"]
test_texts = test_df["text"]

count_vectorizer = feature_extraction.text.CountVectorizer()

# Build the vocabulary from the training texts only and get their term-document matrix
train_vectors = count_vectorizer.fit_transform(train_texts)
# Count tokens in the test texts against that same, already-fitted vocabulary
test_vectors = count_vectorizer.transform(test_texts)

### Linear correlation: does the presence of a word indicate that a tweet is real?

Use a classifier based on Ridge regression. It first converts the target values into {-1, 1}, then treats the problem as a regression task.

In [36]:
# Linear classifier built on Ridge regression, created with its default hyperparameters
clf = linear_model.RidgeClassifier()

Evaluate the classifier by cross-validation.

In [21]:
# Cross-validate the (still unfitted) classifier on the training vectors:
#   estimator - the classifier to evaluate
#   X / y     - training vectors and the target column of the training data
#   cv        - number of folds in the splitting strategy (three here)
#   scoring   - evaluation metric (F1 in this competition)
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df["target"],
    cv=3,
    scoring="f1",
)
scores

array([0.59453669, 0.56455572, 0.64051005])

In [37]:
# Train on the full training set.
#   X: training data, shape (n_samples, n_features)
#   y: target values, shape (n_samples,)
# The trailing semicolon suppresses the estimator repr in the cell output.
clf.fit(X=train_vectors, y=train_df["target"]);

See what the model predicts based on the test data:

In [59]:
# Predict a label for every test tweet with the fitted classifier
predictions = clf.predict(test_vectors)
# Vectorised label mapping: 1 -> "Yes", anything else -> "No".
# The result is a materialised array rather than a lazy iterator, so it is not
# consumed by the DataFrame constructor and can be read again by later cells.
is_disaster = np.where(predictions == 1, "Yes", "No")
# Show each test tweet next to its predicted label
pd.DataFrame({'text': test_df['text'], 'disaster?': is_disaster})

Unnamed: 0,text,disaster?
0,Just happened a terrible car crash,No
1,"Heard about #earthquake is different cities, s...",Yes
2,"there is a forest fire at spot pond, geese are...",Yes
3,Apocalypse lighting. #Spokane #wildfires,No
4,Typhoon Soudelor kills 28 in China and Taiwan,Yes
...,...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,Yes
3259,Storm in RI worse than last hurricane. My city...,Yes
3260,Green Line derailment in Chicago http://t.co/U...,Yes
3261,MEG issues Hazardous Weather Outlook (HWO) htt...,Yes


Create the submission:

In [61]:
# Start from the sample submission file and fill its "target" column with the
# classifier's predictions for the test vectors
sample_submission = pd.read_csv("data/sample_submission.csv").assign(
    target=clf.predict(test_vectors)
)
sample_submission.head()
# Uncomment to write the submission file:
# sample_submission.to_csv("submission.csv", index=False)

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
