## **Import Libraries / Load Raw Data**

In [0]:
%tensorflow_version 2.x
from __future__ import absolute_import, division, print_function, unicode_literals

#Import TensorFlow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

#import helper
import spacy
from keras.preprocessing.text import Tokenizer
from autocorrect import Speller

import numpy as np
import statistics 
import pandas as pd
import re
import string
import random

nlp = spacy.load('en',disable=['parser','tagger','ner'])

Using TensorFlow backend.


In [0]:
#Training data
train = pd.read_csv('train.csv',encoding='utf-8')
print('Training data shape: ', train.shape)
train.head()

Training data shape:  (7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [0]:
# Testing data 
test = pd.read_csv('test.csv')
print('Testing data shape: ', test.shape)
test.head(10)

Testing data shape:  (3263, 4)


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


In [0]:
test[test['keyword'].isna()].count

In [0]:
df_leak = pd.read_csv('total.csv', encoding ='ISO-8859-1')[['choose_one', 'text']]

# Creating target and id
df_leak['target'] = (df_leak['choose_one'] == 'Relevant').astype(np.int8)
df_leak['id'] = df_leak.index.astype(np.int16)
df_leak.drop(columns=['choose_one', 'text'], inplace=True)

# Merging target to test set
test = test.merge(df_leak, on=['id'], how='left')

## **Combine Keyword and Data**

In [0]:
def combine_attributes(text, keyword):
    var_list = [text, keyword]
    combined = ' '.join(x for x in var_list if x)
    return combined

train.fillna('', inplace=True)
train['combine'] = train.apply(lambda x: combine_attributes(x['text'], x['keyword']), axis=1)
test.fillna('', inplace=True)
test['combine'] = test.apply(lambda x: combine_attributes(x['text'], x['keyword']), axis=1)

In [0]:
train['combine'][:100]
test['combine'][:100]

0                    Just happened a terrible car crash
1     Heard about #earthquake is different cities, s...
2     there is a forest fire at spot pond, geese are...
3              Apocalypse lighting. #Spokane #wildfires
4         Typhoon Soudelor kills 28 in China and Taiwan
                            ...                        
95    'If your nature appropriates it love will burn...
96    @NinaHoag - 'if you shred my Psych work our fr...
97    @thehill this is 1 example of y the Conservati...
98    Aug 3 1915ÛÓKAISERJAEGERS WIPED OUT.; Francis...
99    They should all die! All of them! Everything a...
Name: combine, Length: 100, dtype: object

## **Clean training set**

In [0]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[#@!?|,-_()+{}.~*]', '', text)
    text = re.sub('%', ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'[^\x00-\x7f]','', text) 
    text = check(text)
    return text

In [0]:
def separate_punc(doc_text):
  return [(token.text.lower(), token.lemma_) for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"\'-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [0]:
basic_stopwords = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','v','w','x','y','z',
               'he','she','they','us','we','it','   ','u','w.','one','two','three','four', 'five',
               'six', 'seven', 'eight', 'nine','ten','from', 'every','just','s','a','f','be','go','this','that','\ufeff1',
               'whatever"ed','a.','the','his','her','our','them','set','ing','of','text','at']

In [0]:
tmp = train[train['keyword'].notna()]['keyword'].unique().tolist()
stopword_from_keyword = clean_text(' '.join(tmp)).split(" ")
len(stopword_from_keyword)
stopwords = basic_stopwords #+ stopword_from_keyword

In [0]:
def create_bag_of_words(x):
  token = separate_punc(x)

  bag_of_word = set()
  for t in token:
    if(not(t[0].isdigit()) and t[1] != '-PRON-' and not(t[1].lower() in stopwords) and len(t[1])> 2 and len(t[1]) < 20):
      bag_of_word.add(t[1].lower())
  #2506
  return bag_of_word

In [0]:
pd.set_option('display.max_colwidth', -1)

First Clean Text

In [0]:
# Applying the cleaning function to both test and training datasets
train['combine'] = train['combine'].apply(lambda x: clean_text(x))
test['combine'] = test['combine'].apply(lambda x: clean_text(x))

In [0]:
#shuffle
random.seed(1) 
train_size = train['combine'].shape[0]
shuffle = np.random.choice(train_size,train_size)
shuffle_x = train['combine'][shuffle]
shuffle_y = train['target'][shuffle]

# Divided train and validation set
train_shuffle_x = shuffle_x[:7000]
valid_shuffle_x = shuffle_x[7000:]

train_y = shuffle_y[:7000]
valid_y = shuffle_y [7000:]

print(len(train_shuffle_x))
print(len(valid_shuffle_x))

7000
613


In [0]:
test_x = test['combine']
test_y = test['target']
print(len(test_x))

3263


In [0]:
f_union = ' '.join(train_shuffle_x.tolist())
bag_of_words = create_bag_of_words(f_union)

In [0]:
len(bag_of_words)

8731

## **Disaster / Non Disaster Tweet Analysis (1)**

In [0]:
def create_freq_dict(x):
  freq_dict = {}
  token = separate_punc(x)

  for t in token:
    lemma = t[1]
    if t[1] in freq_dict:
      freq_dict[lemma] += 1
    else:
      freq_dict[lemma] = 1

  return freq_dict

In [0]:
disaster_tweets = ' '.join(train[train['target']==1]['combine'])
non_disaster_tweets = ' '.join(train[train['target']==0]['combine'])

In [0]:
disaster_dict = create_freq_dict(disaster_tweets)
non_disaster_dict = create_freq_dict(non_disaster_tweets)

In [0]:
print(len(disaster_dict))
print(len(non_disaster_dict))

7459
10214


In [0]:
one_disaster_dict = [k for k in disaster_dict if disaster_dict[k]>1]
len(one_disaster_dict)

2941

In [0]:
shared_items = {k: disaster_dict[k] for k in disaster_dict if k in non_disaster_dict and disaster_dict[k] == non_disaster_dict[k]}
len(shared_items)

734

In [0]:
selective_bow = [ k for k in bag_of_words if not(k in shared_items)]
print(len(selective_bow))
bag_of_words = selective_bow

8482


In [0]:
'reason' in bag_of_words

True

In [0]:
disasterDF = pd.DataFrame(disaster_dict.items(), columns=['word', 'count'])
nonDisasterDF = pd.DataFrame(non_disaster_dict.items(), columns=['word', 'count'])
df = pd.merge(left=disasterDF,right=nonDisasterDF, how='outer', left_on='word', right_on='word')
df[(df.count_x == 1) & (df.count_y == 1)]
df[(df.count_x == df.count_y)]
df1 = df.copy()
df1['std'] = df.std(numeric_only=True, axis=1)
df1['avg'] = df.mean(numeric_only=True, axis=1)
df1['covar'] = df1['std'] / df1['avg']
print(df1.count)

<bound method DataFrame.count of                    word  count_x  count_y         std     avg     covar
0                   our     39.0     62.0   16.263456    50.5  0.322049
1                  deed      1.0      1.0    0.000000     1.0  0.000000
2                    be   1221.0   1986.0  540.936688  1603.5  0.337347
3                   the   1371.0   1918.0  386.787409  1644.5  0.235201
4                reason      8.0     23.0   10.606602    15.5  0.684297
...                 ...      ...      ...         ...     ...       ...
12781             truff      NaN      1.0         NaN     1.0       NaN
12782     cameronhacker      NaN      1.0         NaN     1.0       NaN
12783           shutout      NaN      1.0         NaN     1.0       NaN
12784     gameofkittens      NaN      1.0         NaN     1.0       NaN
12785  explodingkittens      NaN      1.0         NaN     1.0       NaN

[12786 rows x 6 columns]>


In [0]:
filtered_word_covar = df1[(df1.covar > 0.1)|(df1['covar'].isna() & df['count_y'].isna())]['word'].tolist()
len(filtered_word_covar)

6123

In [0]:
df1[(df1['covar'].isna() & df['count_y'].isna())]

In [0]:
df1[df1['word']=='terrible']

Unnamed: 0,word,count_x,count_y,std,avg,covar
1113,terrible,3.0,4.0,0.707107,3.5,0.202031


In [0]:
selective_bow2 = [ k for k in bag_of_words if (k in filtered_word_covar)]

In [0]:
len(selective_bow2)

4650

In [0]:
bag_of_words = selective_bow2

## **Tokenize Train and Test data**

In [0]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(bag_of_words)

In [0]:
print(len(bag_of_words))
print(len(tokenizer.word_index))
print(len(tokenizer.index_word))
print('a' in tokenizer.word_index)
token_len = len(tokenizer.word_index)

8731
8715
8715
False


In [0]:
def create_data_to_token(shuffle_x):
  token_x = []
  for single_train in shuffle_x.tolist():
    token_x.append([token.lemma_.lower() for token in nlp(single_train) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '])
  return token_x

In [0]:
train_token_x = create_data_to_token(train_shuffle_x)
valid_token_x = create_data_to_token(valid_shuffle_x)
test_token_x = create_data_to_token(test_x)

In [0]:
train_idx_x = np.array(tokenizer.texts_to_sequences(train_token_x))
valid_idx_x = np.array(tokenizer.texts_to_sequences(valid_token_x))
test_idx_x = np.array(tokenizer.texts_to_sequences(test_token_x))

In [0]:
train_x=tf.keras.preprocessing.sequence.pad_sequences(train_idx_x)
print(train_x.shape)
train_y = np.asarray(train_y)

(7000, 23)


In [0]:
valid_x=tf.keras.preprocessing.sequence.pad_sequences(valid_idx_x, maxlen=train_x.shape[1])
print(valid_x.shape)
valid_y = np.asarray(valid_y)

(613, 23)


In [0]:
test_x=tf.keras.preprocessing.sequence.pad_sequences(test_idx_x, maxlen = train_x.shape[1])
print(test_x.shape)
test_y = np.asarray(test_y)

(3263, 23)


## **Embedding**

In [0]:
EMBED_SIZE = 100
VOCAB_LEN = len(tokenizer.word_index)+1

In [0]:
def create_embedding_matrix(VOCAB_LEN, EMBED_SIZE, token_word_idx):
  embedding_dict={}
  with open('glove.6B.100d.txt','r') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
  f.close()

  embedding_matrix=np.zeros((VOCAB_LEN,EMBED_SIZE))
  for word, idx in token_word_idx.items():
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[idx]=emb_vec
  
  return embedding_matrix

In [0]:
embedding_matrix= create_embedding_matrix(VOCAB_LEN, EMBED_SIZE, tokenizer.word_index)

## **Train Deep learning Model**

In [0]:
from keras import regularizers

In [0]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_LEN,EMBED_SIZE,embeddings_initializer=tf.constant_initializer(embedding_matrix),trainable=False),                
    tf.keras.layers.Conv1D(256, 3, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=1, padding='valid'),
    tf.keras.layers.Conv1D(256, 3, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=1, padding='valid'),
    tf.keras.layers.Dropout(0.5),    
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
    
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         871600    
_________________________________________________________________
conv1d (Conv1D)              (None, None, 256)         77056     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 256)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 256)         196864    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 256)         0         
_________________________________________________________________
dropout (Dropout)            (None, None, 256)         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               3

In [0]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [0]:
# batchsize (32,13) (4,16)
history = model.fit(train_x, train_y ,epochs=13,batch_size=32,
                    validation_data=(valid_x, valid_y))

Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


In [0]:
model.save('mymodel', save_format='tf')

INFO:tensorflow:Assets written to: mymodel1/assets


In [0]:
model.predict(test_x[0:1])

array([[-1.3628203]], dtype=float32)

In [0]:
result = model.evaluate(test_x, test_y)
dict(zip(model.metrics_names, result))



{'accuracy': 0.794361, 'loss': 0.7784673411934573}

In [0]:
# Evaluate the model on the test data using `evaluate`
predictions = model.predict(test_x)
print('predictions shape:', predictions.shape)
classification = [ 1 if x > 0.5 else 0 for x in predictions]

predictions shape: (3263, 1)


In [0]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [0]:
f1_score(test_y, classification, average='micro')

0.7857799570946982

## **SVM**

In [0]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
def create_vector_for_svm(token_x):
  vector_x = []
  for single in token_x:
    vector_x.append(' '.join(single))
   # vector_x.append(' '.join(x for in token_x))
  return vector_x

In [0]:
vectorizer = TfidfVectorizer()
tmp_x = create_vector_for_svm(train_token_x)
X_train_vect = vectorizer.fit_transform(tmp_x)

In [0]:
tmp_test_x = create_vector_for_svm(test_token_x)
X_test_vect = vectorizer.transform(tmp_test_x)

In [0]:
tmp_x[0]

'long streak of tripledigit heat since forecast in dallas a unrelenting and dangerous heat wave will heat wave'

In [0]:
first_vector_tfidfvectorizer = X_train_vect[0]
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

In [0]:
parameters = { 
    'gamma': [0.7, 1, 'auto', 'scale']
}
model = GridSearchCV(SVC(kernel='rbf'), parameters, cv=4, n_jobs=-1).fit(X_train_vect, train_y)

In [0]:
y_test_pred = model.predict(X_test_vect)
y_test_pred[:100]

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [0]:
f1_score(test_y, y_test_pred , average='micro')

0.7891510879558689

## **Logistic regression**

In [0]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=150,penalty='l2',solver='lbfgs',random_state=0)
lr_clf.fit(X_train_vect, train_y)
lr_pred = lr_clf.predict(X_test_vect)

#print('accuracy score: ',accuracy_score(lr_pred,y_val))
#print(classification_report(y_val, lr_pred))

In [0]:
lr_pred

array([1, 1, 1, ..., 1, 1, 1])

In [0]:
f1_score(test_y, lr_pred , average='micro')

0.7836346920012258

In [0]:
f1_score(test_y, classification, average='micro')

0.7943610174685872

# **Consolidate All trained models and average**

In [0]:
total_classification = []
for i in range(len(test_y)):
  if((classification1[i]) == (y_test_pred[i]) and (classification1[i] == lr_pred[i])):
    total_classification.append(classification1[i])
  elif((classification1[i] == y_test_pred[i]) and (classification1[i] != lr_pred[i])):
    total_classification.append(classification1[i])
  elif((classification1[i] == lr_pred[i]) and (classification1[i] != y_test_pred[i])):
    total_classification.append(classification1[i])
  else:
    total_classification.append(y_test_pred[i])


In [0]:
f1_score(test_y, total_classification, average='micro')

0.7906834201654919

## **Submission**

In [0]:
len(classification)
sample_sub=pd.read_csv('sample_submission.csv')

In [0]:
sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':total_classification_tmp})

In [0]:
sub

In [0]:
sub.to_csv('submission.csv',index=False)