In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train.csv')

In [3]:
test = pd.read_csv('test.csv')

In [4]:
import re

In [5]:
NON_ALPHANUM = re.compile(r'[\W]')
NON_ASCII = re.compile(r'[^a-z0-1\s]')
def normalize_texts(texts):
    normalized_texts = []
    for text in texts:
        
        text = re.sub(r"http\S+", "",text)
        text = re.sub(r'<.*?>','', text)
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts

In [7]:
train.text = normalize_texts(train.text)

In [9]:
train.text

0       our deeds are the reason of this  earthquake m...
1                  forest fire near la ronge sask  canada
2       all residents asked to  shelter in place  are ...
3       1 000 people receive  wildfires evacuation ord...
4       just got sent this photo from ruby  alaska as ...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609     ariaahrary  thetawniest the out of control wi...
7610              m1   01 0 utc  km s of volcano hawaii  
7611    police investigating after an e bike collided ...
7612    the latest  more homes razed by northern calif...
Name: text, Length: 7613, dtype: object

In [10]:
test.text = normalize_texts(test.text)

### Train test split

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(train.text, train.target, test_size=0.2, random_state=42, stratify=train.target)

In [26]:
X_train

6234    sassy city girl country hunk stranded in smoky...
326     god s kingdom  heavenly gov t  will rule over ...
997     mopheme and bigstar johnson are a problem in t...
7269             vixmeldrew sounds like a whirlwind life 
2189    malaysia confirms plane debris washed up on re...
                              ...                        
3386     a voluntary evacuation is being recommended a...
3280    rt  calestous  tanzania elephant population de...
305      pbban  temporary 00  russaky   armageddon   d...
1648    petition   heartless owner that whipped horse ...
7569     marynmck that s beyond adorable  i hope it wo...
Name: text, Length: 6090, dtype: object

### TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
tfidf = TfidfVectorizer()

In [15]:
tfidf_train_vector = tfidf.fit_transform(X_train)

In [16]:
tfidf_test_vector = tfidf.transform(X_test)

In [34]:
tfidf_train_vector

<6090x14341 sparse matrix of type '<class 'numpy.float64'>'
	with 77782 stored elements in Compressed Sparse Row format>

## Naive Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB

In [18]:
NB = MultinomialNB()

In [20]:
model = NB.fit(tfidf_train_vector,y_train)

In [21]:
y_pred = model.predict(tfidf_test_vector)

In [22]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.93      0.86       869
           1       0.88      0.68      0.77       654

    accuracy                           0.82      1523
   macro avg       0.84      0.80      0.81      1523
weighted avg       0.83      0.82      0.82      1523



In [24]:
cm = confusion_matrix(y_test, y_pred)

In [25]:
cm

array([[807,  62],
       [209, 445]], dtype=int64)

In [30]:
test.text

0                      just happened a terrible car crash
1       heard about  earthquake is different cities  s...
2       there is a forest fire at spot pond  geese are...
3                apocalypse lighting   spokane  wildfires
4             typhoon soudelor kills  in china and taiwan
                              ...                        
3258    earthquake safety los angeles   safety fastene...
3259    storm in ri worse than last hurricane  my city...
3260                    green line derailment in chicago 
3261          meg issues hazardous weather outlook  hwo  
3262     cityofcalgary has activated its municipal eme...
Name: text, Length: 3263, dtype: object

In [31]:
tfidf_test = tfidf.transform(test.text)

In [33]:
tfidf_test

<3263x14341 sparse matrix of type '<class 'numpy.float64'>'
	with 37004 stored elements in Compressed Sparse Row format>

In [35]:
submission = pd.read_csv('sample_submission.csv')

In [36]:
test_pred = model.predict(tfidf_test)

In [37]:

submission['target'] = test_pred.round().astype(int)
submission.to_csv('submission.csv', index=False)