In [21]:
import os
import pandas as pd
import re

path = os.path.join('datasets', 'NLP_Desaster_Tweets')
train_path = os.path.join(path, 'tweets_desaster_train.csv')
test_path = os.path.join(path, 'tweets_desaster_test.csv')
sample_path = os.path.join(path, 'sample_submission.csv')

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [22]:
import numpy as np
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

train_data[train_data['target']==1]['text'].values[1]

'Forest fire near La Ronge Sask. Canada'

In [23]:
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

train_data['text']=train_data['text'].apply(lambda x: remove_url(x))


In [24]:
def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

train_data['text']=train_data['text'].apply(lambda x: remove_html(x))


In [8]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


train_data['text']=train_data['text'].apply(lambda x: remove_emoji(x))

In [25]:
import string

def remove_punctuation(text):
    table= str.maketrans('', '', string.punctuation)
    return text.translate(table)

train_data['text']=train_data['text'].apply(lambda x: remove_punctuation(x))

In [10]:
from spellchecker import SpellChecker  # depending on your System this will take some while

spell = SpellChecker()
def correct_spelling(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return ''.join(corrected_text)


train_data['text']=train_data['text'].apply(lambda x : correct_spelling(x))

In [26]:
count_vectorizer = feature_extraction.text.CountVectorizer()

example_train_vector = count_vectorizer.fit_transform(train_data['text'][0:5])

print(example_train_vector[0].todense().shape)
print(example_train_vector[0].todense())

(1, 53)
[[0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0
  0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [27]:
train_vectors = count_vectorizer.fit_transform(train_data['text'])
test_vectors = count_vectorizer.transform(test_data['text'])

In [28]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

log_clf = LogisticRegression()
ridge_clf = RidgeClassifier()
forest_clf = RandomForestClassifier()
svm_clf = SVC()

In [29]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', forest_clf), ('svc', svm_clf), ('ridge', ridge_clf)], voting='hard')

voting_clf.fit(train_vectors, train_data['target'])

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC()),
                             ('ridge', RidgeClassifier())])

In [30]:
from sklearn.model_selection import cross_val_score

for clf in (log_clf, forest_clf, svm_clf, ridge_clf, voting_clf):
    clf.fit(train_vectors, train_data['target'])
    scores = cross_val_score(clf, train_vectors, train_data['target'], cv=3, scoring='f1')
    print(clf.__class__.__name__, scores)

RandomForestClassifier [0.53035144 0.50089445 0.59010802]
SVC [0.59710495 0.55477855 0.64382817]
RidgeClassifier [0.61409043 0.55186104 0.61568627]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier [0.5731782  0.52941176 0.61452514]


In [31]:
sample_submission = pd.read_csv(sample_path)

sample_submission['target'] = voting_clf.predict(test_vectors)

sample_submission.to_csv('submission_v3.csv', index=False)