# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the dataset

In [2]:
dataset = pd.read_csv('train.csv')

**Let's Explore Our dataset**

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


We Have 4 columns other than **id**:

- keyword : *Keyword from text (Can be Useful for us)*
- location : *Tells where the tweet is from*
- text: *The tweet*
- target: *whether the tweet is disaster **1** or not **0*** 

In [5]:
dataset.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

- **Keyword** has 61 missing values which is around **0.8%**
- **location** has 2533 missing values which is around **33.27**

*Let's explore the keywords*

In [6]:
dataset['keyword'].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

we have **221** unique keywords

In [7]:
print('Top 10 keywords')
print(dataset['keyword'].value_counts().head(20))
print('Least 10 keywords')
print(dataset['keyword'].value_counts().tail(20))

Top 10 keywords
keyword
fatalities     45
deluge         42
armageddon     42
sinking        41
damage         41
harm           41
body%20bags    41
outbreak       40
evacuate       40
fear           40
collided       40
siren          40
twister        40
windstorm      40
sinkhole       39
sunk           39
hellfire       39
weapon         39
weapons        39
famine         39
Name: count, dtype: int64
Least 10 keywords
keyword
bombing                  29
obliteration             29
sirens                   29
snowstorm                29
desolate                 29
seismic                  29
first%20responders       29
rubble                   28
demolished               28
deluged                  27
volcano                  27
battle                   26
bush%20fires             25
war%20zone               24
rescue                   22
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name

**Observations**

1. **Fatalities** was the most frequent keyword while **radiation emergency** was the least
2. Keywords contain **%20** -> needs to be handled
3. Some keywords are similar like **sunk/sinking** and **weapon/weapons** -> needs to be handled

*Let's explore the locations*

In [8]:
print('Top 20 locations')
print(dataset['location'].value_counts().head(20))

Top 20 locations
location
USA                104
New York            71
United States       50
London              45
Canada              29
Nigeria             28
UK                  27
Los Angeles, CA     26
India               24
Mumbai              22
Washington, DC      21
Kenya               20
Worldwide           19
Australia           18
Chicago, IL         18
California          17
Everywhere          15
New York, NY        15
California, USA     15
Florida             14
Name: count, dtype: int64


we can see locations like **New York and NY** | **USA and United States** are redundant -> needs to be handled

# Cleaning the data

In [9]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [10]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.remove('not')
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deshm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


we will write a function to clean the text

In [11]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    url = re.compile(r'https?://\S+|www\.\S+')
    text = url.sub(r'',text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = ' '.join(ps.stem(word) for word in text.split() if word not in stop_words)
    return text

In [12]:
dataset['cleaned_text'] = dataset['text'].apply(clean_text)

Handling missing values

In [13]:
dataset['keyword'].fillna('None', inplace=True)
dataset['location'].fillna('None', inplace=True)

Replace %20 in keyword Column

In [14]:
dataset['Keyword'] = dataset['keyword'].apply(lambda x: x.replace('%20', ' '))

# we will create a bag of words model

Our training data will be text, keyword, location combined and we will extract features from it

In [15]:
dataset['combined_text'] = dataset['cleaned_text'] + ' ' + dataset['keyword'] + ' ' + dataset['location']

**Creating bag of words model**

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(dataset['combined_text'])
y = dataset['target']

## Splitting the dataset into the Training set and Test set

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Accuracy on Various Models

1. Random Forest Classifier - 78%
2. Logistic Regression - 79%
3. Linear SVM - 79%
4. Naive Bayes - 79%
5. K-Nearest Neighbors (KNN) - 74%
6. Gradient Boosting Classifier - 75%
7. XGBoost Classifier - 77%
8. Kernel SVM - 80%

## Training the kernel svm model on the Training set

In [18]:
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [19]:
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy_score(y_test, y_pred))


[[773 101]
 [198 451]]
accuracy:  0.8036769533814839


# Making Submission

In [20]:
test_data = pd.read_csv('test.csv')
test_data['cleaned_text'] = test_data['text'].apply(clean_text)
test_data['keyword'].fillna('', inplace=True)
test_data['Keyword'] = test_data['keyword'].apply(lambda x: x.replace('%20', ' '))
test_data['location'].fillna('', inplace=True)
test_data['combined_text'] = test_data['cleaned_text'] + ' ' + test_data['keyword'] + ' ' + test_data['location']
X_test = vectorizer.transform(test_data['combined_text'])
y_pred = classifier.predict(X_test)

submission = pd.DataFrame({'id': test_data['id'], 'target': y_pred})
submission.to_csv('submission.csv', index=False)

**Kaggle score** : 0.80140