## Natural Language Processing with Disaster Tweets. 

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import re

In [2]:
# read train data
df = pd.read_csv('d:\\ml\\nlp\\train.csv')

In [3]:
# clean text
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove @user mentions
    text = re.sub(r'@\w+', '', text)
    # Remove non-alphanumeric characters
    #text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    return text

In [4]:
df['cleaned_text'] = df['text'].apply(preprocess_text)
df

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this #earthquake m...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask. canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby #alaska as ...
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,two giant cranes holding a bridge collapse int...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,the out of control wild fires in california ...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,m1.94 [01:04 utc]?5km s of volcano hawaii.
7611,10872,,,Police investigating after an e-bike collided ...,1,police investigating after an e-bike collided ...


In [5]:
# split data
x_train, x_test, y_train, y_test = train_test_split(df.cleaned_text, df.target, random_state=42)

In [6]:
print(df.shape)
print(x_train.shape)
print(x_test.shape)

(7613, 6)
(5709,)
(1904,)


In [7]:
df['cleaned_text'].head().tolist()

['our deeds are the reason of this #earthquake may allah forgive us all',
 'forest fire near la ronge sask. canada',
 "all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected",
 '13,000 people receive #wildfires evacuation orders in california ',
 'just got sent this photo from ruby #alaska as smoke from #wildfires pours into a school ']

In [8]:
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
vec = CountVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(x_train) # bow -- bag of words 

In [10]:
list(vec.vocabulary_.items())[:20]

[('this', 11159),
 ('makes', 6875),
 ('sense', 9865),
 ('paper', 8178),
 ('beats', 1402),
 ('rock', 9455),
 ('comes', 2565),
 ('from', 4647),
 ('wood', 12307),
 ('so', 10277),
 ('should', 10028),
 ('be', 1385),
 ('able', 514),
 ('to', 11274),
 ('support', 10802),
 ('and', 853),
 ('obliterate', 7833),
 ('the', 11103),
 ('causes', 2172),
 ('of', 7865)]

In [11]:
clf = LogisticRegression(random_state=42, solver='liblinear')
clf.fit(bow, y_train)

In [12]:
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85      1171
           1       0.74      0.82      0.77       733

    accuracy                           0.82      1904
   macro avg       0.81      0.82      0.81      1904
weighted avg       0.82      0.82      0.82      1904



In [13]:
# загружааем тестовые данные 
dt = pd.read_csv('d:\\ml\\nlp\\test.csv')

In [14]:
dt['cleaned_text'] = dt['text'].apply(preprocess_text)
dt

Unnamed: 0,id,keyword,location,text,cleaned_text
0,0,,,Just happened a terrible car crash,just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...","heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting. #spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills 28 in china and taiwan
...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,earthquake safety los angeles ûò safety faste...
3259,10865,,,Storm in RI worse than last hurricane. My city...,storm in ri worse than last hurricane. my city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,green line derailment in chicago
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,meg issues hazardous weather outlook (hwo)


In [15]:
X_test = dt.cleaned_text

In [16]:
# Make predictions on the test set
y_test_pred = clf.predict(vec.transform(X_test))

In [17]:
len (y_test_pred)

3263

In [18]:
submission = pd.DataFrame({'id': dt['id'], 'target': y_test_pred})

# Save to CSV
submission.to_csv('tweet_nlp.csv', index=False)