In [2]:
import pandas as pd 

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
df['target'].value_counts()


0    4342
1    3271
Name: target, dtype: int64

In [6]:
df['keyword'].value_counts()

fatalities               45
deluge                   42
armageddon               42
damage                   41
harm                     41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [7]:
df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [8]:
df.fillna('',inplace=True)

## Removing punctuation

In [9]:
#library that contains punctuation
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree


In [11]:
df['text']= df['text'].apply(lambda x:remove_punctuation(x))

In [12]:
df['keyword']= df['keyword'].apply(lambda x:remove_punctuation(x))

## Lowering the case

In [13]:
df['text']= df['text'].apply(lambda x: x.lower())

In [14]:
df['keyword']= df['keyword'].apply(lambda x: x.lower())

## Tokenizing 

In [15]:
#defining function for tokenization

def tokenize(string):
    tokens = string.split()
    return tokens
#applying function to the column


In [16]:
df['text']= df['text'].apply(lambda x: tokenize(x))

In [17]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[our, deeds, are, the, reason, of, this, earth...",1
1,4,,,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,,,"[all, residents, asked, to, shelter, in, place...",1
3,6,,,"[13000, people, receive, wildfires, evacuation...",1
4,7,,,"[just, got, sent, this, photo, from, ruby, ala...",1


In [18]:
df['keyword']= df['keyword'].apply(lambda x: tokenize(x))

In [19]:
df.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,[],,"[two, giant, cranes, holding, a, bridge, colla...",1
7609,10870,[],,"[ariaahrary, thetawniest, the, out, of, contro...",1
7610,10871,[],,"[m194, 0104, utc5km, s, of, volcano, hawaii, h...",1
7611,10872,[],,"[police, investigating, after, an, ebike, coll...",1
7612,10873,[],,"[the, latest, more, homes, razed, by, northern...",1


In [20]:
df.drop(columns=['id'],inplace=True)

In [21]:
df.head()

Unnamed: 0,keyword,location,text,target
0,[],,"[our, deeds, are, the, reason, of, this, earth...",1
1,[],,"[forest, fire, near, la, ronge, sask, canada]",1
2,[],,"[all, residents, asked, to, shelter, in, place...",1
3,[],,"[13000, people, receive, wildfires, evacuation...",1
4,[],,"[just, got, sent, this, photo, from, ruby, ala...",1


## Removing stop words

In [66]:
count = 0
for i in df['keyword']:
    if i != []:
        count+=1
count

7552

In [22]:
#importing nlp library
import nltk
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')

In [23]:
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

In [24]:
df['text']= df['text'].apply(lambda x:remove_stopwords(x))

## Stemming

In [25]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

In [26]:
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text
df['text']=df['text'].apply(lambda x: stemming(x))

In [27]:
df['keyword']=df['keyword'].apply(lambda x: stemming(x))

## Lemmatization

In [28]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

In [29]:
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

In [31]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Manas
[nltk_data]     Vardhan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [32]:
# df['keyword']=df['keyword'].apply(lambda x: lemmatizer(x))
# df['text']=df['text'].apply(lambda x: stemming(x))

## Vectorizing the data

### Building the vocabulary

In [43]:
vocab = []

# for i in df['text'].values()
for i in df['text'].values:
    vocab = vocab + i

print(len(vocab))


76378


In [45]:
set_vocab = set(vocab)
# len(set_vocab)
vocab = list(set_vocab)
print(len(vocab),type(vocab))

19502 <class 'list'>


In [72]:
keyword_list = []
for i in df['keyword'].values:
    keyword_list = keyword_list + i
print(len(keyword_list))

set_key_list = set(keyword_list)

len(set_key_list)

7552


166

## Vectorizing the text data

In [77]:
df['text_strings'] = df['text'].apply(lambda x: ' '.join([str(elem) for elem in x]))

In [78]:
df['text_strings'].head()

0            deed reason earthquak may allah forgiv us
1                 forest fire near la rong sask canada
2    resid ask shelter place notifi offic evacu she...
3    13000 peopl receiv wildfir evacu order california
4    got sent photo rubi alaska smoke wildfir pour ...
Name: text_strings, dtype: object

In [79]:
from sklearn.feature_extraction.text import CountVectorizer

In [80]:
vectorizer = CountVectorizer()

In [81]:
X = vectorizer.fit_transform(df['text_strings'])

In [91]:
x_train = X.toarray()

## The text is now vectorized

In [92]:
import numpy as np

In [93]:
import numpy as nper
x_train = np.array(x_train)

In [94]:
y_train = df['target']

In [95]:
x_train.shape

(7613, 19282)

In [96]:
y_train.shape

(7613,)

## Performing Binary Classification now

In [97]:
from sklearn.linear_model import LogisticRegression

In [98]:
clf = LogisticRegression(random_state=42)

In [99]:
clf.fit(x_train,y_train)

LogisticRegression(random_state=42)

In [100]:
pred = clf.predict(x_train)

In [102]:
from sklearn.metrics import accuracy_score

In [103]:
accuracy_score(y_train, pred)

0.9626953894653881

## Testing the model on test set

## Preprocessing the test set

In [104]:
df_test =pd.read_csv('test.csv')

In [105]:
df_test.fillna('',inplace=True)

In [107]:
df_test.drop(columns=['id','keyword','location'],inplace=True)

In [108]:
df_test['text']= df_test['text'].apply(lambda x:remove_punctuation(x))
df_test['text']= df_test['text'].apply(lambda x: tokenize(x))
df_test['text']= df_test['text'].apply(lambda x:remove_stopwords(x))
df_test['text']= df_test['text'].apply(lambda x: stemming(x))

In [109]:
df_test['text_strings'] = df_test['text'].apply(lambda x: ' '.join([str(elem) for elem in x]))

In [110]:
x_test = vectorizer.transform(df_test['text_strings'])

In [111]:
x_test = x_test.toarray()

In [112]:
x_test = np.array(x_test)

In [114]:
y_test_pred = clf.predict(x_test)

In [116]:
y_test_pred

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

## Putting the predictions to test csv

In [117]:
import pandas as pd

In [118]:
submission = pd.read_csv('test.csv')

In [121]:
submission['target'] = y_test_pred

In [122]:
submission.head()

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",1
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1


In [124]:
final_submission = submission[['id','target']]

In [125]:
final_submission.to_csv('final_submission.csv')