# 1. Loading data

In [1]:
import pandas as pd
import numpy as np

from plotly import graph_objs as go

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier

import xgboost as xgb

from mlxtend.regressor import StackingCVRegressor
from sklearn.model_selection import cross_val_score

In [2]:
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = pd.read_csv("train.csv", encoding="latin-1", index_col="id")
# test_df = pd.read_csv("./data/test.csv", encoding="latin-1")

df.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


# 2. Simple EDA:

In [4]:
balance_counts = df.groupby('target')['target'].agg('count').values
balance_counts

array([4342, 3271])

In [5]:
fig = go.Figure()
fig.add_trace(go.Bar(
    x=['Fake'],
    y=[balance_counts[0]],
    name='Fake',
    text=[balance_counts[0]],
    textposition='auto',
))
fig.add_trace(go.Bar(
    x=['Real disaster'],
    y=[balance_counts[1]],
    name='Real disaster',
    text=[balance_counts[1]],
    textposition='auto',
))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Dataset distribution by target</span>'
)
fig.show()

# 3. Data pre-processing:

In [6]:
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

In [7]:
def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [8]:
def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)

In [9]:
# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 
        '', 
        text
    )
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    text = remove_url(text)
    text = remove_emoji(text)
    text = remove_html(text)
    
    return text

In [10]:
# Stops words

stop_words = stopwords.words('english') + ['u', 'im', 'c']

In [11]:
# Stemming

stemmer = nltk.SnowballStemmer("english")

In [12]:
def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords and Stemm all the words in the sentence
    text = ' '.join(str(word) for word in text.split(' ') if word not in stop_words)

    return text

# 3.1 Data pré-processing: applying

In [13]:
df.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [14]:
df['text_clean'] = df['text'].apply(preprocess_data)
df.head()

Unnamed: 0_level_0,keyword,location,text,target,text_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us
4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
5,,,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...
6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders ca...
7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...


# 4. Small analise:

In [15]:
def analising_freq(df, target, num=10):

    lista = []
    for text in df.text_clean[df.target == target]:
        for word in text.split():
            lista.append(word)


    dic={}
    for word in lista:
        if word in dic:
            dic[word]+=1
        else:
            dic[word]=1
            
    top=sorted(dic.items(), key=lambda x:x[1],reverse=True)[:num]
    
    return top

- Real disasters

In [16]:
analising_freq(df, 1)

[('fire', 177),
 ('news', 132),
 ('via', 121),
 ('disaster', 116),
 ('california', 111),
 ('suicide', 110),
 ('amp', 106),
 ('people', 105),
 ('police', 105),
 ('killed', 93)]

- Fake disasters

In [17]:
analising_freq(df, 0)

[('like', 252),
 ('amp', 192),
 ('new', 168),
 ('get', 162),
 ('dont', 139),
 ('one', 126),
 ('body', 112),
 ('via', 99),
 ('would', 95),
 ('people', 91)]

# 5. Modeling:

In [18]:
sample = pd.read_csv("sample_submission.csv", index_col="id")
df_to_predict = pd.read_csv("test.csv", encoding="latin-1")

In [19]:
df_to_predict["text_clean"] = df_to_predict['text'].apply(preprocess_data)
df_to_predict

Unnamed: 0,id,keyword,location,text,text_clean
0,0,,,Just happened a terrible car crash,happened terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",heard earthquake different cities stay safe ev...
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese fleeing across str...
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills china taiwan
...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÂÃÃ SAFETY FA...,earthquake safety los angeles âãã safety fa...
3259,10865,,,Storm in RI worse than last hurricane. My city...,storm ri worse last hurricane hardest hit yar...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,green line derailment chicago
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,meg issues hazardous weather outlook hwo


In [20]:
X = df['text_clean']
y = df['target']

In [21]:
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

5709 5709
1904 1904


### 5.1 Modeling: XGBoost

In [22]:
pipeXGB = Pipeline([
    ('count', CountVectorizer()), 
    ('tfid', TfidfTransformer()),  
    ('model', xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='auc',))
])

### 5.1.1 Modeling: XGBoost training

In [23]:
# Fit the pipeline with the data
pipeXGB = pipeXGB.fit(x_train, y_train)

In [24]:
y_pred_class = pipeXGB.predict(x_test)

print('Test: {}'.format(metrics.accuracy_score(y_test, y_pred_class)))
print(metrics.confusion_matrix(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class, target_names=['fake', 'disaster']))

Test: 0.7195378151260504
[[1035   56]
 [ 478  335]]
              precision    recall  f1-score   support

        fake       0.68      0.95      0.79      1091
    disaster       0.86      0.41      0.56       813

    accuracy                           0.72      1904
   macro avg       0.77      0.68      0.68      1904
weighted avg       0.76      0.72      0.69      1904



### 5.1.1.1 Modeling: XGBoost saving

In [25]:
y_submit = pipeXGB.predict(df_to_predict["text_clean"])
sample['target'] = y_submit
sample.to_csv("XGB_submission.csv")

### 5.2 Modeling: DecisionTreeClassifier

In [26]:
pipeDTC = Pipeline([
    ('count', CountVectorizer()), 
    ('tfid', TfidfTransformer()),  
    ('model', DecisionTreeClassifier(
    ))
])

### 5.2.1 Modeling: DecisionTreeClassifier training

In [27]:
# Fit the pipeline with the data
pipeDTC = pipeDTC.fit(x_train, y_train)

In [28]:
y_pred_class = pipeDTC.predict(x_test)

print('Test: {}'.format(metrics.accuracy_score(y_test, y_pred_class)))
print(metrics.confusion_matrix(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class, target_names=['fake', 'disaster']))

Test: 0.7226890756302521
[[816 275]
 [253 560]]
              precision    recall  f1-score   support

        fake       0.76      0.75      0.76      1091
    disaster       0.67      0.69      0.68       813

    accuracy                           0.72      1904
   macro avg       0.72      0.72      0.72      1904
weighted avg       0.72      0.72      0.72      1904



### 5.2.1.1 Modeling: DecisionTreeClassifier saving

In [29]:
y_submit = pipeDTC.predict(df_to_predict["text_clean"])
sample['target'] = y_submit
sample.to_csv("DCT_submission.csv")

### 5.3 Modeling: GradientBoostingClassifier

In [30]:
pipeGBC = Pipeline([
    ('count', CountVectorizer()), 
    ('tfid', TfidfTransformer()),  
    ('model', GradientBoostingClassifier(
    ))
])

### 5.3.1 Modeling: GradientBoostingClassifier training

In [31]:
# Fit the pipeline with the data
pipeGBC = pipeGBC.fit(x_train, y_train)

In [32]:
y_pred_class = pipeGBC.predict(x_test)

print('Test: {}'.format(metrics.accuracy_score(y_test, y_pred_class)))
print(metrics.confusion_matrix(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class, target_names=['fake', 'disaster']))

Test: 0.7358193277310925
[[1041   50]
 [ 453  360]]
              precision    recall  f1-score   support

        fake       0.70      0.95      0.81      1091
    disaster       0.88      0.44      0.59       813

    accuracy                           0.74      1904
   macro avg       0.79      0.70      0.70      1904
weighted avg       0.77      0.74      0.71      1904



### 5.3.1.1 Modeling: GradientBoostingClassifier saving

In [33]:
y_submit = pipeGBC.predict(df_to_predict["text_clean"])
sample['target'] = y_submit
sample.to_csv("GBC_submission.csv")

### 5.4 Modeling: RidgeClassifier

In [34]:
pipeRC = Pipeline([
    ('count', CountVectorizer()), 
    ('tfid', TfidfTransformer()),  
    ('model', RidgeClassifier(

    ))
])

### 5.4.1 Modeling: RidgeClassifier training

In [35]:
# Fit the pipeline with the data
pipeRC = pipeRC.fit(x_train, y_train)

In [36]:
y_pred_class = pipeRC.predict(x_test)

print('Test: {}'.format(metrics.accuracy_score(y_test, y_pred_class)))
print(metrics.confusion_matrix(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class, target_names=['fake', 'disaster']))

Test: 0.8014705882352942
[[938 153]
 [225 588]]
              precision    recall  f1-score   support

        fake       0.81      0.86      0.83      1091
    disaster       0.79      0.72      0.76       813

    accuracy                           0.80      1904
   macro avg       0.80      0.79      0.79      1904
weighted avg       0.80      0.80      0.80      1904



### 5.4.1.1 Modeling: RidgeClassifier saving

In [37]:
y_submit = pipeRC.predict(df_to_predict["text_clean"])
sample['target'] = y_submit
sample.to_csv("RC_submission.csv")

- Melhor resultado RidgeClassifier

# 6. Stacking:

In [38]:
def pre_process(train, test, predict):
  pipeSTACK = Pipeline([
      ('count', CountVectorizer()), 
      ('tfid', TfidfTransformer())
  ])

  train = pipeSTACK.fit_transform(train).toarray()
  test = pipeSTACK.transform(test).toarray()
  predict = pipeSTACK.transform(predict).toarray()
  return train, test, predict

x_train_stack, x_test_stack, to_predict = pre_process(x_train, x_test, df_to_predict.text_clean)

In [39]:
RC = RidgeClassifier()

XGBoost = xgb.XGBClassifier()

DC = DecisionTreeClassifier()

GBC = GradientBoostingClassifier()

#stack
stack_gen=StackingCVRegressor(
    regressors=(RC, DC, XGBoost,GBC),
    meta_regressor=RC,
    use_features_in_secondary=True
)

In [None]:
#scoring

print("cross validated scores")

for model,label in zip([RC, DC, XGBoost, GBC, stack_gen],['RidgeClassifier','DecisionTreeClassifier', 'XGBClassifier','GradientBoostingClassifier','StackingCVRegressor']):
    
    SG_scores = cross_val_score(model, x_train_stack, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    print("RMSE", np.sqrt(-SG_scores.mean()), label)

cross validated scores
RMSE 0.46773682088002616 RidgeClassifier
RMSE 0.5118985137197314 XGBClassifier
RMSE 0.5400430497727956 DecisionTreeClassifier
RMSE 0.5252435977792821 GradientBoostingClassifier


In [40]:
RC.fit(x_train_stack, y_train)
ridge_preds = RC.predict(x_test_stack)

print('Test: {}'.format(metrics.accuracy_score(y_test, ridge_preds)))
print(metrics.confusion_matrix(y_test, ridge_preds))
print(classification_report(y_test, ridge_preds, target_names=['fake', 'disaster']))

Test: 0.8014705882352942
[[938 153]
 [225 588]]
              precision    recall  f1-score   support

        fake       0.81      0.86      0.83      1091
    disaster       0.79      0.72      0.76       813

    accuracy                           0.80      1904
   macro avg       0.80      0.79      0.79      1904
weighted avg       0.80      0.80      0.80      1904



In [41]:
DC.fit(x_train_stack, y_train)
DC_preds = DC.predict(x_test_stack)

print('Test: {}'.format(metrics.accuracy_score(y_test, DC_preds)))
print(metrics.confusion_matrix(y_test, DC_preds))
print(classification_report(y_test, DC_preds, target_names=['fake', 'disaster']))

Test: 0.7300420168067226
[[834 257]
 [257 556]]
              precision    recall  f1-score   support

        fake       0.76      0.76      0.76      1091
    disaster       0.68      0.68      0.68       813

    accuracy                           0.73      1904
   macro avg       0.72      0.72      0.72      1904
weighted avg       0.73      0.73      0.73      1904



In [42]:
XGBoost.fit(x_train_stack, y_train)
XGB_preds = XGBoost.predict(x_test_stack)

print('Test: {}'.format(metrics.accuracy_score(y_test, XGB_preds)))
print(metrics.confusion_matrix(y_test, XGB_preds))
print(classification_report(y_test, XGB_preds, target_names=['fake', 'disaster']))

Test: 0.7121848739495799
[[1036   55]
 [ 493  320]]
              precision    recall  f1-score   support

        fake       0.68      0.95      0.79      1091
    disaster       0.85      0.39      0.54       813

    accuracy                           0.71      1904
   macro avg       0.77      0.67      0.66      1904
weighted avg       0.75      0.71      0.68      1904



In [44]:
GBC.fit(x_train_stack, y_train)
GBC_preds = GBC.predict(x_test_stack)

print('Test: {}'.format(metrics.accuracy_score(y_test, GBC_preds)))
print(metrics.confusion_matrix(y_test, GBC_preds))
print(classification_report(y_test, GBC_preds, target_names=['fake', 'disaster']))

Test: 0.7321428571428571
[[1028   63]
 [ 447  366]]
              precision    recall  f1-score   support

        fake       0.70      0.94      0.80      1091
    disaster       0.85      0.45      0.59       813

    accuracy                           0.73      1904
   macro avg       0.78      0.70      0.70      1904
weighted avg       0.76      0.73      0.71      1904



In [45]:
stack_gen.fit(x_train_stack, y_train)
stack_gen_preds = stack_gen.predict(x_test_stack)

print('Test: {}'.format(metrics.accuracy_score(y_test, stack_gen_preds)))
print(metrics.confusion_matrix(y_test, stack_gen_preds))
print(classification_report(y_test, stack_gen_preds, target_names=['fake', 'disaster']))

KeyError: ignored

In [None]:
stack_preds = ((0.2*DC_preds)+(0.2*GBC_preds)+(0.25*ridge_preds)+(0.1*XGB_preds)+(0.25*stack_gen_preds))

print('Test: {}'.format(metrics.accuracy_score(y_test, stack_gen_preds)))
print(metrics.confusion_matrix(y_test, stack_gen_preds))
print(classification_report(y_test, stack_gen_preds, target_names=['fake', 'disaster']))

In [None]:
stack_preds

In [None]:
y_submit = pipeRC.predict(df_to_predict["text_clean"])
sample['target'] = y_submit
sample.to_csv("RC_submission.csv")


sample_submission['SalePrice'] = np.expm1(stack_preds)
sample_submission.to_csv("last_one.csv")