<a href="https://colab.research.google.com/github/Hidayathamir/Natural-Language-Processing-with-Disaster-Tweets/blob/main/Natural_Language_Processing_with_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import re
import string
import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Load file

In [3]:
file_train = pd.read_csv('train.csv')
file_test = pd.read_csv('test.csv')

df_train = file_train[['text', 'target']]
df_train = df_train.drop_duplicates(keep='first')
df_test = file_test[['text']]

In [4]:
df_train

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7604,#WorldNews Fallen powerlines on G:link tram: U...,1
7605,on the flip side I'm at Walmart and there is a...,1
7606,Suicide bomber kills 15 in Saudi security site...,1
7608,Two giant cranes holding a bridge collapse int...,1


In [5]:
df_train.target.value_counts()

0    4315
1    3206
Name: target, dtype: int64

# Preprocess

In [6]:
pstem = PorterStemmer()
def clean_text(text):
  text = text.lower()
  text = re.sub('[0-9]', '', text)
  text = ''.join([char for char in text if char not in string.punctuation])
  tokens = word_tokenize(text)
  tokens = [pstem.stem(word) for word in tokens]
  text = ' '.join(tokens)
  return text

In [7]:
df_train['clean'] = df_train['text'].apply(clean_text)
df_train[['text', 'clean', 'target']]

Unnamed: 0,text,clean,target
0,Our Deeds are the Reason of this #earthquake M...,our deed are the reason of thi earthquak may a...,1
1,Forest fire near La Ronge Sask. Canada,forest fire near la rong sask canada,1
2,All residents asked to 'shelter in place' are ...,all resid ask to shelter in place are be notif...,1
3,"13,000 people receive #wildfires evacuation or...",peopl receiv wildfir evacu order in california,1
4,Just got sent this photo from Ruby #Alaska as ...,just got sent thi photo from rubi alaska as sm...,1
...,...,...,...
7604,#WorldNews Fallen powerlines on G:link tram: U...,worldnew fallen powerlin on glink tram updat f...,1
7605,on the flip side I'm at Walmart and there is a...,on the flip side im at walmart and there is a ...,1
7606,Suicide bomber kills 15 in Saudi security site...,suicid bomber kill in saudi secur site mosqu r...,1
7608,Two giant cranes holding a bridge collapse int...,two giant crane hold a bridg collaps into near...,1


# Feature Extraction

In [8]:
tfidf = TfidfVectorizer(sublinear_tf=True, max_features=60000, min_df=1,
                        norm='l2', ngram_range=(1,2))
features = tfidf.fit_transform(df_train.clean).toarray()
features.shape

(7521, 60000)

# Train & Evaluation

In [9]:
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=23022021)
y = df_train['target']
accuracy = []
best_acc = 0
for trn_idx, test_idx in skf.split(features, y):
  x_tr, x_val = features[trn_idx], features[test_idx]
  y_tr, y_val = y.iloc[trn_idx], y.iloc[test_idx]
  model = LogisticRegression(max_iter=1000, C=3)
  model.fit(x_tr, y_tr)
  pred = model.predict(x_val)
  acc = accuracy_score(y_val, pred)
  accuracy.append(acc)
  if acc > best_acc:
    best_acc = acc
    best_model = model
    print(acc * 100)

79.53216374269006
80.85106382978722


In [10]:
np.mean(accuracy) * 100

80.22878561652358

# Prediction

In [11]:
df_test['clean'] = df_test['text'].apply(clean_text)
features_test = tfidf.transform(df_test.clean).toarray()
pred_test = best_model.predict(features_test)
df_test['prediction'] = pred_test
df_test

Unnamed: 0,text,clean,prediction
0,Just happened a terrible car crash,just happen a terribl car crash,1
1,"Heard about #earthquake is different cities, s...",heard about earthquak is differ citi stay safe...,1
2,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond gees are f...,1
3,Apocalypse lighting. #Spokane #wildfires,apocalyps light spokan wildfir,1
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill in china and taiwan,1
...,...,...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,earthquak safeti lo angel ûò safeti fasten xrwn,1
3259,Storm in RI worse than last hurricane. My city...,storm in ri wors than last hurrican my cityamp...,1
3260,Green Line derailment in Chicago http://t.co/U...,green line derail in chicago httptcoutbxlcbiuy,1
3261,MEG issues Hazardous Weather Outlook (HWO) htt...,meg issu hazard weather outlook hwo httptcoxrb...,1


In [12]:
df_test.prediction.value_counts()

0    2097
1    1166
Name: prediction, dtype: int64