In [1]:
import os
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from pandas_profiling import ProfileReport
import warnings
from geopy.geocoders import ArcGIS, Bing, Nominatim, OpenCage, GoogleV3, OpenMapQuest
import csv, sys
from geopy.geocoders import Nominatim 
import multiprocessing
import random as rd
from geopy.extra.rate_limiter import RateLimiter
from time import sleep
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import logging
import requests
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, wordpunct_tokenize, pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.corpus import stopwords

warnings.filterwarnings('ignore')


In [2]:
file_path=os.path.join("..","data","nlp-disaster","train.csv")
data=pd.read_csv(file_path)
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [27]:
data[data["target"]==1].head()

Unnamed: 0,id,keyword,location,text,target,new_location
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Italia
1,4,,,Forest fire near La Ronge Sask. Canada,1,Italia
2,5,,,All residents asked to 'shelter in place' are ...,1,Italia
3,6,,,"13,000 people receive #wildfires evacuation or...",1,Italia
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Italia


In [28]:
data[data["target"]==0].head()

Unnamed: 0,id,keyword,location,text,target,new_location
15,23,,,What's up man?,0,Italia
16,24,,,I love fruits,0,Italia
17,25,,,Summer is lovely,0,Italia
18,26,,,My car is so fast,0,Italia
19,28,,,What a goooooooaaaaaal!!!!!!,0,Italia


In [None]:
profile = ProfileReport(data, title='Profiling Report', explorative=True)
profile.to_widgets()

In [45]:
arcgis = ArcGIS(timeout=50)
nominatim = Nominatim(timeout=50,user_agent="bogchalaca")
googlev3 = GoogleV3(timeout=50)

# choose and order your preference for geocoders here
geocoders = [nominatim]
def geocode(address):
    i = 0
    try:
        while i < len(geocoders):
            # try to geocode using a service
            location = geocoders[i].geocode(address)
            # if it returns a location
            if location != None:
                
                country=location[0].split(",")[-1]
                return country
            else:
                i += 1
    except:
        return np.nan

pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
#addresses = pool.map(geocode_worker, data["location"])
result=[]
for _ in tqdm(pool.imap(geocode, data["location"],chunksize=1), total=len(data["location"])):
    result.append(_)
    pass

100%|██████████| 7613/7613 [1:14:21<00:00,  1.71it/s]


In [58]:
import pickle
with open('location.pkl', 'wb') as f:
    pickle.dump(result, f)

In [3]:
import pickle

with open("location.pkl","rb") as f:
    result=pickle.load(f)

In [4]:
new_result=pd.DataFrame(result)
new_result.replace("Italia","None",inplace=True)

In [5]:
data["new_location"]=new_result
data_sample=data[data["keyword"].notna()].reset_index(drop=True)
final_data=pd.concat([data_sample,pd.get_dummies(data_sample["new_location"])],axis=1)
final_data=pd.concat([data_sample,pd.get_dummies(data_sample["keyword"],prefix="keyword")],axis=1)
y=final_data["target"]
text=final_data["text"]
final_data.drop(["keyword","new_location","id","location","target","text"],axis=1,inplace=True)

In [6]:
stop_words = stopwords.words('english')

def clean_data(quote):
    quote = quote.lower()
    tokens = word_tokenize(quote)
    lemmatizer = WordNetLemmatizer()
    lem_words = [lemmatizer.lemmatize(w) for w in tokens]
    token_punc = [t for t in lem_words if t.isalpha()]
    token_stop = [t for t in token_punc if t not in stop_words]
    return " ".join(token_stop)


text = pd.DataFrame(text.apply(lambda x: clean_data(x)))

In [7]:
vectorizer = TfidfVectorizer(stop_words='english',min_df=0.01)
tf_idf = vectorizer.fit_transform(text["text"]).toarray()
tf_idf=pd.DataFrame(tf_idf,columns=vectorizer.get_feature_names())
final_data=pd.concat([final_data,tf_idf],axis=1)

In [38]:
final_data.head()

Unnamed: 0,keyword_ablaze,keyword_accident,keyword_aftershock,keyword_airplane%20accident,keyword_ambulance,keyword_annihilated,keyword_annihilation,keyword_apocalypse,keyword_armageddon,keyword_army,...,war,watch,way,weapon,wildfire,woman,work,world,year,youtube
0,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
final_data.shape

(7552, 285)

In [8]:
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

final_data["polarity"] = data_sample["text"].apply(pol)
final_data["subjectivity"] = data_sample["text"].apply(sub)

In [9]:
import pickle
with open('final_data.pkl', 'wb') as f:
    pickle.dump(final_data, f)

In [10]:
import pickle

with open("final_data.pkl","rb") as f:
    final_data=pickle.load(f)

In [11]:
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X_train,X_test,y_train,y_test=tts(final_data,y,train_size=0.8)
rfc=RandomForestClassifier(n_estimators=10,max_depth=30)
rfc.fit(X_train,y_train)
ypred=rfc.predict(X_test)
accuracy_score(y_test,ypred)


0.7399073461283918

In [12]:
from lightgbm import LGBMClassifier
lgb=LGBMClassifier(learning_rate=0.5)
lgb.fit(X_train,y_train)
ypred=lgb.predict(X_test)
accuracy_score(y_test,ypred)


0.7557908669755129

In [48]:
import tensorflow as tf

# Define a function
def model(input_dim):
    # Create the Sequential model
    model = tf.keras.models.Sequential()
  
    
    # Use a relu activation function
    model.add(tf.keras.layers.Dense(50, input_dim=input_dim, activation='relu'))
    model.add(tf.keras.layers.Dense(100, activation='relu'))
    model.add(tf.keras.layers.Dense(100, activation='relu'))

    # Final layer is sigmoid for binary classification
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    # return the model
    return model  

my_model=model(X_train.shape[1])

In [49]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import TensorBoard

my_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
callbacks = [EarlyStopping(monitor='val_loss', patience=20),
            TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)]
my_model.fit(X_train,y_train,validation_data=(X_test, y_test), epochs=20, batch_size=16,callbacks=callbacks)
ypred = (my_model.predict(X_test)>=0.5)*1
accuracy_score(y_test,ypred)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


0.7438782263401721

In [19]:
from transformers import DistilBertTokenizer
from transformers import BertModel
import torch

tokenizer = DistilBertTokenizer.from_pretrained("bert-base-uncased")


def process_comments(tokenizer, comments, max_length):
    input_ids, attention_mask = [], []
    for comment in tqdm(comments):
        proccessed_comment = tokenizer.encode_plus(comment, max_length=max_length, pad_to_max_length=True)
        input_ids.append(proccessed_comment["input_ids"])
        attention_mask.append(proccessed_comment["attention_mask"])
    return input_ids, attention_mask

max_length=50
input_ids, attention_mask = process_comments(tokenizer, data_sample["text"], max_length)
input_ids=torch.tensor(input_ids)
attention_mask=torch.tensor(attention_mask)
y = torch.tensor(y,dtype=torch.float32)
y=torch.reshape(y,(-1,))

  0%|          | 0/7552 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 7552/7552 [00:06<00:00, 1094.71it/s]


In [20]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, TensorDataset,DataLoader

(train_inputs,test_inputs, 
 train_mask, test_mask, 
 train_targets, test_targets) = train_test_split(input_ids, attention_mask, y, test_size=0.5)
train_data = TensorDataset(train_inputs, train_mask, train_targets)
test_data = TensorDataset(test_inputs, test_mask, test_targets)

In [15]:
y.shape

torch.Size([7552])

In [1]:
from transformers import DistilBertForSequenceClassification
from torch.utils.data import Dataset, TensorDataset,DataLoader
from tqdm import tqdm

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=1)
train_loader = DataLoader(train_data,batch_size=8, shuffle=True)

NUM_EPOCHS = 1
LEARNING_RATE = 0.01
optimizer =torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
loss_fn = torch.nn.BCEWithLogitsLoss()
for i in range(NUM_EPOCHS):
    model.train()
    count=0
    for X_batch,X_attention_batch,y_batch in train_loader:
        print(count)
        output =   model(X_batch,attention_mask=X_attention_batch,labels=None)
        y_pred = output[0]
        y_pred = torch.reshape(y_pred,(-1,))
        loss = loss_fn(y_pred,y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        count+=8

  return torch._C._cuda_getDeviceCount() > 0
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_class

NameError: name 'train_data' is not defined

In [None]:
test_dataset = TensorDataset(X_test, X_test_attention)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))preds = np.zeros([len(test_dataset), 1])
model.eval()
for i, (x_batch, x_mask) in enumerate(test_loader):
    outputs = model(x_batch.to(device),attention_mask=x_mask.to(device))
    y_pred = sigmoid(outputs[0].detach().cpu().numpy())
    preds[i*16:(i+1)*16, :] = y_predprint(metrics.roc_auc_score(y_test, preds))

print(metrics.roc_auc_score(y_test, preds))