In [25]:
import numpy as np
import pandas as pd
import os
import re
import string
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
import seaborn as sns
import plotly.figure_factory as ff
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer

# Rest of your code here


In [26]:
df_train = pd.read_csv('train.csv')
df_train.drop(columns=['location', 'keyword'], inplace=True)

df_test = pd.read_csv('test.csv')
df_test.drop(columns=['location', 'keyword'], inplace=True)


In [27]:
df_train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [28]:
df_test.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [29]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def seperate_alphanumeric(text):
    words = text
    words = re.findall(r"[^\W\d_]+|\d+", words)
    return " ".join(words)

def decontraction(text):
    text = re.sub(r"won\'t", " will not", text)
    text = re.sub(r"won\'t've", " will not have", text)
    text = re.sub(r"can\'t", " can not", text)
    text = re.sub(r"don\'t", " do not", text)
    text = re.sub(r"can\'t've", " can not have", text)
    text = re.sub(r"let\'s", " let us", text)
    text = re.sub(r"ain\'t", " am not", text)
    text = re.sub(r"y\'all", " you all", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"n\'t've", " not have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'d've", " would have", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ll've", " will have", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\'re", " are", text)

    return text

def remove_html(text):
    text = re.sub(r'<.*?>',' ',text)
    return text

def remove_mentions(text):
    text = re.sub('@\S+', '', text)
    return text

df_train['text'] = df_train['text'].apply(lambda x : remove_emoji(x))
df_train['text'] = df_train['text'].apply(lambda x : remove_url(x))
df_train['text'] = df_train['text'].apply(lambda x : seperate_alphanumeric(x))
df_train['text'] = df_train['text'].apply(lambda x : decontraction(x))
df_train['text'] = df_train['text'].apply(lambda x : remove_html(x))
df_train['text'] = df_train['text'].apply(lambda x : remove_mentions(x))
df_train['text'] = df_train['text'].apply(lambda x : x.lower())

df_test['text'] = df_test['text'].apply(lambda x : remove_emoji(x))
df_test['text'] = df_test['text'].apply(lambda x : remove_url(x))
df_test['text'] = df_test['text'].apply(lambda x : seperate_alphanumeric(x))
df_test['text'] = df_test['text'].apply(lambda x : decontraction(x))
df_test['text'] = df_test['text'].apply(lambda x : remove_html(x))
df_test['text'] = df_test['text'].apply(lambda x : remove_mentions(x))
df_test['text'] = df_test['text'].apply(lambda x : x.lower())

In [30]:
text = pd.concat([df_train['text'], df_test['text']], axis=0)

tfidf_vectorizer = TfidfVectorizer(
    min_df=2,
    stop_words='english',
    ngram_range=(1, 2)
)

vectors = tfidf_vectorizer.fit_transform(
    text
)

train_target = df_train['target']
test_ids = df_test['id']

In [31]:
vectors = pd.DataFrame(vectors.toarray())

train = vectors.iloc[:7613, :]
test = vectors.iloc[7613:, :]


# Stacking classifier

In [32]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Splitting train set for two separate sets

In [33]:
train1 = train.iloc[:5000, :]
train2 = train.iloc[5000:, :]
train_target1 = train_target[:5000]
train_target2 = train_target[5000:]

### Training first layer of classifiers - all of them must be able to return probability

#### Multinomial Naive Bayes

In [34]:
mnb = MultinomialNB()

param_grid = {
    'alpha': [0.1, 0.5, 1.0],
    'fit_prior': [True, False]
}

grid_search = GridSearchCV(mnb, param_grid, cv=10, scoring='f1')
grid_search.fit(train1, train_target1)
mnb = grid_search.best_estimator_

#### Bernoulli Naive Bayes

In [35]:
bnb = BernoulliNB()

param_grid = {
    'alpha': [0.1, 0.5, 1.0],
    'binarize': [0.0, 0.5, 1.0],
    'fit_prior': [True, False]
}

grid_search = GridSearchCV(bnb, param_grid, cv=10, scoring='f1')
grid_search.fit(train1, train_target1)
bnb = grid_search.best_estimator_

#### Random Forest

In [36]:
forest = RandomForestClassifier(n_estimators=100, max_depth=15)
forest.fit(train1, train_target1)

RandomForestClassifier(max_depth=15)

#### Gradient Boosting

In [37]:
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
gbc.fit(train1, train_target1)

GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=0)

In [38]:
predictions_mnb = pd.DataFrame(mnb.predict_proba(train2), index=train2.index)
predictions_bnb = pd.DataFrame(bnb.predict_proba(train2), index=train2.index)
predictions_forest = pd.DataFrame(forest.predict_proba(train2), index=train2.index)
predictions_gbc = pd.DataFrame(gbc.predict_proba(train2), index=train2.index)

train_blender = pd.DataFrame({
    'mnb': predictions_mnb[1],
    'bnb': predictions_bnb[1],
    'forest': predictions_forest[1],
    'gbc': predictions_gbc[1],
}, index=train2.index)

train_blender

Unnamed: 0,mnb,bnb,forest,gbc
5000,0.077741,0.520457,0.382884,0.223480
5001,0.077741,0.520457,0.382884,0.223480
5002,0.973864,0.520457,0.414255,0.947863
5003,0.265428,0.003195,0.401315,0.223480
5004,0.973864,0.520457,0.414255,0.947863
...,...,...,...,...
7608,0.996196,0.520457,0.583491,0.694203
7609,0.809232,0.520457,0.519159,0.719525
7610,0.825169,0.520457,0.403233,0.223480
7611,0.891229,0.520457,0.440835,0.614671


### Training blender

In [39]:
log_clf = LogisticRegression()

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

grid_search = GridSearchCV(log_clf, param_grid, cv=20, scoring='f1')
grid_search.fit(train_blender, train_target2)

blender = grid_search.best_estimator_







### Final predictions for test set

In [40]:
final_predictions = blender.predict(
    pd.DataFrame({
        'mnb': [one_proba for zero_proba, one_proba in mnb.predict_proba(test)],
        'bnb': [one_proba for zero_proba, one_proba in bnb.predict_proba(test)],
        'forest': [one_proba for zero_proba, one_proba in forest.predict_proba(test)],
        'gbc': [one_proba for zero_proba, one_proba in gbc.predict_proba(test)],
    }, index=test_ids)
)
final_predictions

array([0, 1, 1, ..., 1, 1, 1], dtype=int64)

In [41]:
final_predictions = pd.DataFrame({'target': final_predictions}, index=test_ids)
final_predictions.to_csv('stacking-classifier.csv')
final_predictions

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,1
3,1
9,1
11,1
...,...
10861,1
10865,1
10868,1
10874,1
