In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to plots drawn after this cell.
sns.set_style("whitegrid")

In [3]:
# Load the training and test splits from the local "dataset" folder.
# Forward slashes are used as the separator because they resolve on Windows,
# macOS and Linux alike; a "\\" separator is only understood on Windows.
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

In [5]:
# Preview the first five training rows (five is also head()'s default).
train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# Preview the first five test rows (five is also head()'s default).
test_df.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
# (rows, columns) of each split, training frame first.
tuple(frame.shape for frame in (train_df, test_df))

((7613, 5), (3263, 4))

In [10]:
# Show every field of the first training row (positional index 0).
train_df.iloc[0]

id                                                          1
keyword                                                   NaN
location                                                  NaN
text        Our Deeds are the Reason of this #earthquake M...
target                                                      1
Name: 0, dtype: object

In [11]:
# Show every field of the first test row (positional index 0).
test_df.iloc[0]

id                                           0
keyword                                    NaN
location                                   NaN
text        Just happened a terrible car crash
Name: 0, dtype: object

# Building Word Vectors

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score

In [17]:
# Bag-of-words encoder with default settings.
count_vectorizer = CountVectorizer()

# Demonstration only: fit on just the first five tweets so the resulting
# count matrix is small enough to inspect by eye.
example_train_vectors = count_vectorizer.fit_transform(train_df["text"].iloc[:5])

In [15]:
# Densify the first tweet's row once, then show its shape followed by the
# per-token counts themselves.
first_tweet_dense = example_train_vectors[0].todense()
print(first_tweet_dense.shape)
print(first_tweet_dense)

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


The above tells us that:
1. There are 54 unique words (or "tokens") in the first five tweets.
2. The first tweet contains only some of those unique tokens - each non-zero count above marks a token that DOES appear in the first tweet.


In [18]:
# Re-fit the vectorizer on the full training text (replacing the five-tweet
# demo vocabulary) and encode the training tweets in the same call.
train_vectors = count_vectorizer.fit_transform(train_df["text"])

# Encode the test tweets with transform() only, i.e. without re-fitting, so
# the test matrix is built from the vocabulary fitted on the training text.
test_vectors = count_vectorizer.transform(test_df["text"])

In [19]:
# Read the shape straight from the sparse matrix. Calling .todense() first
# would allocate a full documents x vocabulary dense array just to
# report the same two numbers.
train_vectors.shape

(7613, 21637)

In [20]:
# Read the shape straight from the sparse matrix. Calling .todense() first
# would allocate a full documents x vocabulary dense array just to
# report the same two numbers.
test_vectors.shape

(3263, 21637)

# Model Development

In [23]:
# The word vectors are big, so ridge regression is used to push the model's
# weights towards 0 without completely discounting different words.

clf = RidgeClassifier()

In [26]:
# 3-fold cross-validation of the classifier on the training vectors,
# scored with F1; the per-fold scores are displayed as the cell output.
scores = cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df["target"],
    cv=3,
    scoring="f1",
)
scores

array([0.59453669, 0.5642787 , 0.64082434])

In [27]:
# Fit the classifier on the entire training set before predicting on test.
clf.fit(X=train_vectors, y=train_df["target"])

In [28]:
# Load the sample submission file as the template for the output.
# A forward slash keeps the path valid on Windows, macOS and Linux; the
# previous "\\" separator only resolved on Windows.
sample_submission = pd.read_csv("dataset/sample_submission.csv")

# Fill the 'target' column with the classifier's predictions for the test set.
# NOTE(review): values are assigned by position, so this assumes the sample
# file lists ids in the same order as test_df - confirm.
sample_submission['target'] = clf.predict(test_vectors)
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [29]:
# Write the predictions to disk; index=False leaves the DataFrame's row index
# out of the file so only the frame's own columns are written.
sample_submission.to_csv(path_or_buf="submission.csv", index=False)