In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.pipeline import Pipeline
warnings.filterwarnings("ignore")


In [2]:
df_train = pd.read_csv("../input/nlp-getting-started/train.csv")
df_test = pd.read_csv("../input/nlp-getting-started/test.csv")

In [3]:
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
df_train.isnull().sum(), df_test.isnull().sum() 

(id             0
 keyword       61
 location    2533
 text           0
 target         0
 dtype: int64,
 id             0
 keyword       26
 location    1105
 text           0
 dtype: int64)

In [7]:
df_train.drop(['location','keyword','id'],axis=1,inplace=True)
df_test.drop(['location','keyword','id'],axis=1,inplace=True)

In [8]:
df_train

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [9]:
df_test

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan
...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,Storm in RI worse than last hurricane. My city...
3260,Green Line derailment in Chicago http://t.co/U...
3261,MEG issues Hazardous Weather Outlook (HWO) htt...


#### First level of text cleanup removing special characters and unwanted symbols and lowering every word using Regex.

In [10]:
def remove_symbols_low(dfcolumns):
    dfcolumns = dfcolumns.str.replace(r'\W',' ')
    dfcolumns = dfcolumns.str.lower()
    return dfcolumns

In [11]:
df_test.text = remove_symbols_low(df_test.text)
df_test

Unnamed: 0,text
0,just happened a terrible car crash
1,heard about earthquake is different cities s...
2,there is a forest fire at spot pond geese are...
3,apocalypse lighting spokane wildfires
4,typhoon soudelor kills 28 in china and taiwan
...,...
3258,earthquake safety los angeles ûò safety faste...
3259,storm in ri worse than last hurricane my city...
3260,green line derailment in chicago http t co u...
3261,meg issues hazardous weather outlook hwo htt...


In [12]:
df_train.text = remove_symbols_low(df_train.text)
df_train

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake m...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are ...,1
3,13 000 people receive wildfires evacuation or...,1
4,just got sent this photo from ruby alaska as ...,1
...,...,...
7608,two giant cranes holding a bridge collapse int...,1
7609,aria_ahrary thetawniest the out of control w...,1
7610,m1 94 01 04 utc 5km s of volcano hawaii htt...,1
7611,police investigating after an e bike collided ...,1


In [13]:
df_train.loc[5 ,"text"]

' rockyfire update    california hwy  20 closed in both directions due to lake county fire    cafire  wildfires'

In [14]:
def rem_non(data):
    data = re.sub('\n',' ',data)
    data = re.sub('\t',' ',data)
    data = re.sub('https?://\S+|www\.\S+',' ',data)
    return data
remnon = lambda x: rem_non(x)

In [15]:
df_test.text = df_test.text.apply(remnon)
df_train.text =df_train.text.apply(remnon)

In [16]:
df_train.loc[5 ,"text"]

' rockyfire update    california hwy  20 closed in both directions due to lake county fire    cafire  wildfires'

#### Second level of cleaning

###### Cleaning digits 

In [17]:
def num_remove(data):
    data = re.sub('[^a-zA-Z]',' ',data)
    data = data.split()
    return data
df_test.text = df_test.text.apply(num_remove)
df_test

Unnamed: 0,text
0,"[just, happened, a, terrible, car, crash]"
1,"[heard, about, earthquake, is, different, citi..."
2,"[there, is, a, forest, fire, at, spot, pond, g..."
3,"[apocalypse, lighting, spokane, wildfires]"
4,"[typhoon, soudelor, kills, in, china, and, tai..."
...,...
3258,"[earthquake, safety, los, angeles, safety, fas..."
3259,"[storm, in, ri, worse, than, last, hurricane, ..."
3260,"[green, line, derailment, in, chicago, http, t..."
3261,"[meg, issues, hazardous, weather, outlook, hwo..."


In [18]:
#apply it on train test aswell
df_train['text'] = df_train['text'].apply(num_remove)
df_train

Unnamed: 0,text,target
0,"[our, deeds, are, the, reason, of, this, earth...",1
1,"[forest, fire, near, la, ronge, sask, canada]",1
2,"[all, residents, asked, to, shelter, in, place...",1
3,"[people, receive, wildfires, evacuation, order...",1
4,"[just, got, sent, this, photo, from, ruby, ala...",1
...,...,...
7608,"[two, giant, cranes, holding, a, bridge, colla...",1
7609,"[aria, ahrary, thetawniest, the, out, of, cont...",1
7610,"[m, utc, km, s, of, volcano, hawaii, http, t, ...",1
7611,"[police, investigating, after, an, e, bike, co...",1


In [19]:
df_train.loc[5 ,"text"]

['rockyfire',
 'update',
 'california',
 'hwy',
 'closed',
 'in',
 'both',
 'directions',
 'due',
 'to',
 'lake',
 'county',
 'fire',
 'cafire',
 'wildfires']

###### through the above method we have tokenize the sentences too.

### importing important specific natural language libraries

In [20]:
import nltk
import gensim
# import normalise
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer ,CountVectorizer

In [21]:
#Calling instances of each APIs.
stem = PorterStemmer()
Wordnetlem = WordNetLemmatizer()
countVect = CountVectorizer()
tfidfVect = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))

In [22]:
# Removing stop words
sw = stopwords.words('english')
wordsetexc = ['asap','gonna','wanna','bro','lit','more','most','far','cheeky',
              'affair','uff','ceeya','on','in','and','wow','whoah',
              'wooww','astounding','no','yes','km','mile','god','allah','jesus','jesuschrist',
             'oh','oh god','etc','damn','needless','fine','http','www','http','website','dollar','co',
             'ltd','unit','union','got','heard','nope','little','us','we','our','almighty',
             'might','ought','buy','sell','sent','invite']
for i in wordsetexc:
    sw.append(i)
def remove_sw(data):
    data = [word for word in data if word not in sw]
    return data

In [23]:
df_test.text = df_test.text.apply(lambda x: remove_sw(x))
df_train.text = df_train.text.apply(lambda x: remove_sw(x))

In [24]:
df_test

Unnamed: 0,text
0,"[happened, terrible, car, crash]"
1,"[earthquake, different, cities, stay, safe, ev..."
2,"[forest, fire, spot, pond, geese, fleeing, acr..."
3,"[apocalypse, lighting, spokane, wildfires]"
4,"[typhoon, soudelor, kills, china, taiwan]"
...,...
3258,"[earthquake, safety, los, angeles, safety, fas..."
3259,"[storm, ri, worse, last, hurricane, city, amp,..."
3260,"[green, line, derailment, chicago, utbxlcbiuy]"
3261,"[meg, issues, hazardous, weather, outlook, hwo..."


In [25]:
df_train

Unnamed: 0,text,target
0,"[deeds, reason, earthquake, may, forgive]",1
1,"[forest, fire, near, la, ronge, sask, canada]",1
2,"[residents, asked, shelter, place, notified, o...",1
3,"[people, receive, wildfires, evacuation, order...",1
4,"[photo, ruby, alaska, smoke, wildfires, pours,...",1
...,...,...
7608,"[two, giant, cranes, holding, bridge, collapse...",1
7609,"[aria, ahrary, thetawniest, control, wild, fir...",1
7610,"[utc, volcano, hawaii, zdtoyd, ebj]",1
7611,"[police, investigating, e, bike, collided, car...",1


#### Now moving into some intermediate level of feature engineering part in NLP.

#### Lemmatization of words to convert it into its root words.

In [26]:
def lemmatize(data):
#     data = [stem.stem(word) for word in data]
    data = [Wordnetlem.lemmatize(word) for word in data]
    return data

In [27]:
df_train["text"] = df_train["text"].apply(lambda x: lemmatize(x))

In [28]:
df_train

Unnamed: 0,text,target
0,"[deed, reason, earthquake, may, forgive]",1
1,"[forest, fire, near, la, ronge, sask, canada]",1
2,"[resident, asked, shelter, place, notified, of...",1
3,"[people, receive, wildfire, evacuation, order,...",1
4,"[photo, ruby, alaska, smoke, wildfire, pours, ...",1
...,...,...
7608,"[two, giant, crane, holding, bridge, collapse,...",1
7609,"[aria, ahrary, thetawniest, control, wild, fir...",1
7610,"[utc, volcano, hawaii, zdtoyd, ebj]",1
7611,"[police, investigating, e, bike, collided, car...",1


In [29]:
df_test["text"] = df_test["text"].apply(lambda x: lemmatize(x))
df_test

Unnamed: 0,text
0,"[happened, terrible, car, crash]"
1,"[earthquake, different, city, stay, safe, ever..."
2,"[forest, fire, spot, pond, goose, fleeing, acr..."
3,"[apocalypse, lighting, spokane, wildfire]"
4,"[typhoon, soudelor, kill, china, taiwan]"
...,...
3258,"[earthquake, safety, los, angeles, safety, fas..."
3259,"[storm, ri, worse, last, hurricane, city, amp,..."
3260,"[green, line, derailment, chicago, utbxlcbiuy]"
3261,"[meg, issue, hazardous, weather, outlook, hwo,..."


#### Tokenizing words in the data set

### Creating a data processing check point

In [30]:
traindata = df_train.copy()
print(traindata.head(7))
testdata = df_test.copy()
print(testdata.head(5))

                                                text  target
0           [deed, reason, earthquake, may, forgive]       1
1      [forest, fire, near, la, ronge, sask, canada]       1
2  [resident, asked, shelter, place, notified, of...       1
3  [people, receive, wildfire, evacuation, order,...       1
4  [photo, ruby, alaska, smoke, wildfire, pours, ...       1
5  [rockyfire, update, california, hwy, closed, d...       1
6  [flood, disaster, heavy, rain, cause, flash, f...       1
                                                text
0                   [happened, terrible, car, crash]
1  [earthquake, different, city, stay, safe, ever...
2  [forest, fire, spot, pond, goose, fleeing, acr...
3          [apocalypse, lighting, spokane, wildfire]
4           [typhoon, soudelor, kill, china, taiwan]


In [31]:
def joinwords(data):
    combined_text = ' '.join(data)
    return combined_text
combined = lambda x: joinwords(x)

In [32]:
traindata["text"] = traindata["text"].apply(combined)

In [33]:
testdata["text"] = testdata["text"].apply(combined)

In [34]:
traindata

Unnamed: 0,text,target
0,deed reason earthquake may forgive,1
1,forest fire near la ronge sask canada,1
2,resident asked shelter place notified officer ...,1
3,people receive wildfire evacuation order calif...,1
4,photo ruby alaska smoke wildfire pours school,1
...,...,...
7608,two giant crane holding bridge collapse nearby...,1
7609,aria ahrary thetawniest control wild fire cali...,1
7610,utc volcano hawaii zdtoyd ebj,1
7611,police investigating e bike collided car portu...,1


In [35]:
testdata

Unnamed: 0,text
0,happened terrible car crash
1,earthquake different city stay safe everyone
2,forest fire spot pond goose fleeing across str...
3,apocalypse lighting spokane wildfire
4,typhoon soudelor kill china taiwan
...,...
3258,earthquake safety los angeles safety fastener ...
3259,storm ri worse last hurricane city amp others ...
3260,green line derailment chicago utbxlcbiuy
3261,meg issue hazardous weather outlook hwo x rbqjhn


### Converting text to Vectors /// Embedding.

##### seperating features and Target label.

In [36]:
X = traindata["text"]
X

0                      deed reason earthquake may forgive
1                   forest fire near la ronge sask canada
2       resident asked shelter place notified officer ...
3       people receive wildfire evacuation order calif...
4           photo ruby alaska smoke wildfire pours school
                              ...                        
7608    two giant crane holding bridge collapse nearby...
7609    aria ahrary thetawniest control wild fire cali...
7610                        utc volcano hawaii zdtoyd ebj
7611    police investigating e bike collided car portu...
7612    latest home razed northern california wildfire...
Name: text, Length: 7613, dtype: object

In [37]:
y = df_train.target
y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

##### Introducing testing set as X-test.

In [38]:
X_test = testdata.text
X_test

0                             happened terrible car crash
1            earthquake different city stay safe everyone
2       forest fire spot pond goose fleeing across str...
3                    apocalypse lighting spokane wildfire
4                      typhoon soudelor kill china taiwan
                              ...                        
3258    earthquake safety los angeles safety fastener ...
3259    storm ri worse last hurricane city amp others ...
3260             green line derailment chicago utbxlcbiuy
3261     meg issue hazardous weather outlook hwo x rbqjhn
3262    cityofcalgary activated municipal emergency pl...
Name: text, Length: 3263, dtype: object

#### importing Sklearn Model_selection API

In [39]:
from sklearn.model_selection import train_test_split ,cross_val_score

In [40]:
# We can intially use train_test_split method.
X_train, X_val, y_train, y_val = train_test_split(X,y ,test_size=0.3)

#### Model with Naive-Bayes MultiNomial NB its the most basic ML algo approach for this sort of task.


In [41]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [42]:
multNB = MultinomialNB()

#### Initially we can deploy a Pipeline and if further model improvemnt is needed we can move onto some specific stepwise precision approach.

In [43]:
model = Pipeline([
     ('vectorizer',TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))),
     ('classifier',MultinomialNB())
])

In [44]:
model.fit(X_train,y_train)
score = model.score(X_val,y_val)
score

0.7850262697022767

In [45]:
pred = model.predict(X_val)

In [46]:
pred

array([0, 0, 0, ..., 0, 0, 0])

In [47]:
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score

In [48]:
confusion_matrix(pred,y_val)

array([[1197,  389],
       [ 102,  596]])

In [49]:
f1_score(pred,y_val)

0.7082590612002376

### As the Accuracy is bit low we can move on to threshold adjustment or AUC_ROC correwction method

In [50]:
predproba = model.predict_proba(X_val)     #[:,1]
predproba

array([[0.68375805, 0.31624195],
       [0.83032429, 0.16967571],
       [0.72153606, 0.27846394],
       ...,
       [0.68324399, 0.31675601],
       [0.92938266, 0.07061734],
       [0.79938182, 0.20061818]])

In [51]:
threshold = model.predict_proba(X_val)[:,1]
threshold

array([0.31624195, 0.16967571, 0.27846394, ..., 0.31675601, 0.07061734,
       0.20061818])

###### reviewing accuracy

In [52]:
accuracy_ls = []
for thres in threshold:
    y_pred_review = np.where(threshold > thres ,1,0)
    score = accuracy_score(y_pred_review,y_val ,normalize=True)
    accuracy_ls.append(score)
thres_series = pd.Series(threshold)
accuracy_series = pd.Series(accuracy_ls)
fin_accuracy = pd.concat((thres_series,accuracy_series) ,axis=1)
fin_accuracy.columns = ['threshold','accuracy']
fin_accuracy.sort_values(by='accuracy',ascending=False,inplace=True)

In [53]:
fin_accuracy

Unnamed: 0,threshold,accuracy
507,0.451268,0.794221
421,0.454141,0.793783
1798,0.455756,0.793783
138,0.448980,0.793783
1736,0.450838,0.793783
...,...,...
2030,0.009020,0.431699
258,0.008440,0.431261
545,0.008440,0.431261
1941,0.008376,0.431261


#### Final Adjusted prediction

In [54]:
final_adj_pred = np.where(threshold > 0.492724 ,1,0)
final_adj_pred

array([0, 0, 0, ..., 0, 0, 0])

In [55]:
confusion_matrix(final_adj_pred,y_val)

array([[1191,  385],
       [ 108,  600]])

In [56]:
f1_score(final_adj_pred,y_val)

0.7088009450679268

In [57]:
scoreadj = accuracy_score(final_adj_pred,y_val)
scoreadj

0.7841506129597198

##### Eventhough its only a slight improvement in the score its satisfialbe for prediction.

In [58]:
final_test_data_pred = model.predict_proba(X_test)[:,1]
fin_adj_pred = (final_test_data_pred > 0.4927274 ).astype(int)
fin_adj_pred

array([1, 0, 1, ..., 1, 1, 1])

### Creating a submission CSV file

In [59]:
dfsubmtest = pd.read_csv("../input/nlp-getting-started/test.csv")

In [60]:
submission =pd.DataFrame({'id':dfsubmtest.id,
                         'target':fin_adj_pred})
submission.set_index('id',inplace=True)

In [61]:
submission

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,0
3,1
9,1
11,1
...,...
10861,1
10865,0
10868,1
10874,1


In [62]:
# submission.to_csv("submission.csv")

### word2vec model and Neural network model for prediction.



In [63]:
token_train = traindata["text"].apply(lambda x: x.split())
token_train


0                [deed, reason, earthquake, may, forgive]
1           [forest, fire, near, la, ronge, sask, canada]
2       [resident, asked, shelter, place, notified, of...
3       [people, receive, wildfire, evacuation, order,...
4       [photo, ruby, alaska, smoke, wildfire, pours, ...
                              ...                        
7608    [two, giant, crane, holding, bridge, collapse,...
7609    [aria, ahrary, thetawniest, control, wild, fir...
7610                  [utc, volcano, hawaii, zdtoyd, ebj]
7611    [police, investigating, e, bike, collided, car...
7612    [latest, home, razed, northern, california, wi...
Name: text, Length: 7613, dtype: object

In [64]:
token_test = testdata['text'].apply(lambda x: x.split())
token_test.head(5)

0                     [happened, terrible, car, crash]
1    [earthquake, different, city, stay, safe, ever...
2    [forest, fire, spot, pond, goose, fleeing, acr...
3            [apocalypse, lighting, spokane, wildfire]
4             [typhoon, soudelor, kill, china, taiwan]
Name: text, dtype: object

In [65]:
token_tr = pd.DataFrame(token_train)
token_tr

Unnamed: 0,text
0,"[deed, reason, earthquake, may, forgive]"
1,"[forest, fire, near, la, ronge, sask, canada]"
2,"[resident, asked, shelter, place, notified, of..."
3,"[people, receive, wildfire, evacuation, order,..."
4,"[photo, ruby, alaska, smoke, wildfire, pours, ..."
...,...
7608,"[two, giant, crane, holding, bridge, collapse,..."
7609,"[aria, ahrary, thetawniest, control, wild, fir..."
7610,"[utc, volcano, hawaii, zdtoyd, ebj]"
7611,"[police, investigating, e, bike, collided, car..."


In [66]:
token_ts = pd.DataFrame(token_test)
token_ts

Unnamed: 0,text
0,"[happened, terrible, car, crash]"
1,"[earthquake, different, city, stay, safe, ever..."
2,"[forest, fire, spot, pond, goose, fleeing, acr..."
3,"[apocalypse, lighting, spokane, wildfire]"
4,"[typhoon, soudelor, kill, china, taiwan]"
...,...
3258,"[earthquake, safety, los, angeles, safety, fas..."
3259,"[storm, ri, worse, last, hurricane, city, amp,..."
3260,"[green, line, derailment, chicago, utbxlcbiuy]"
3261,"[meg, issue, hazardous, weather, outlook, hwo,..."


In [67]:
combined_set = pd.concat((token_tr,token_ts) ,axis = 0,ignore_index=True)

In [68]:
len(token_ts) + len(token_tr) == len(combined_set)

True

In [69]:
len(combined_set)


10876

In [70]:
combined_set.iloc[:7619 ,:]


Unnamed: 0,text
0,"[deed, reason, earthquake, may, forgive]"
1,"[forest, fire, near, la, ronge, sask, canada]"
2,"[resident, asked, shelter, place, notified, of..."
3,"[people, receive, wildfire, evacuation, order,..."
4,"[photo, ruby, alaska, smoke, wildfire, pours, ..."
...,...
7614,"[earthquake, different, city, stay, safe, ever..."
7615,"[forest, fire, spot, pond, goose, fleeing, acr..."
7616,"[apocalypse, lighting, spokane, wildfire]"
7617,"[typhoon, soudelor, kill, china, taiwan]"


##### Next applying word2vec model

In [71]:
combined = combined_set.text

In [72]:
combined

0                 [deed, reason, earthquake, may, forgive]
1            [forest, fire, near, la, ronge, sask, canada]
2        [resident, asked, shelter, place, notified, of...
3        [people, receive, wildfire, evacuation, order,...
4        [photo, ruby, alaska, smoke, wildfire, pours, ...
                               ...                        
10871    [earthquake, safety, los, angeles, safety, fas...
10872    [storm, ri, worse, last, hurricane, city, amp,...
10873       [green, line, derailment, chicago, utbxlcbiuy]
10874    [meg, issue, hazardous, weather, outlook, hwo,...
10875    [cityofcalgary, activated, municipal, emergenc...
Name: text, Length: 10876, dtype: object

In [73]:
model_tr = Word2Vec(combined ,size = 200 ,window =20)
model_tr

<gensim.models.word2vec.Word2Vec at 0x7f83dfd5d910>

In [74]:
model_tr.train(combined,total_examples=len(combined_set),epochs=50)

(4024528, 5608850)

In [75]:
model_tr.wv.most_similar(positive='fire')

[('apartment', 0.4911807179450989),
 ('acre', 0.48330608010292053),
 ('grove', 0.4707435369491577),
 ('rockyfire', 0.46481937170028687),
 ('truck', 0.4634692966938019),
 ('nh', 0.4622674584388733),
 ('contained', 0.4506085216999054),
 ('firefighter', 0.4473021328449249),
 ('y', 0.42899906635284424),
 ('reno', 0.42178019881248474)]

In [76]:
def word_vec(modelvec,token, size):
    vec = np.zeros(size).reshape((1,size))
    count = 0
    for word in token:
        try:
            vec += modelvec[word].reshape((1,size))
            count += 1
        except KeyError:#handling the cases where word is not in vocabulary
            
            continue
            
    if count != 0:
        vec /= count
    return vec

In [77]:
def crt_array(vecmodel,token):
    vec_array = np.zeros((len(token),200))
    for i in range(len(token)):
        vec_array[i,:] = word_vec(model_tr,token[i],200)
    return pd.DataFrame(vec_array)
combi_vec_df = crt_array(model_tr,combined)
combi_vec_df.head(6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.551448,-0.003809,-0.532317,1.049268,-0.209272,-0.046728,0.003296,-0.681873,0.402447,-0.248552,...,-0.077578,-0.6603,-0.697649,-0.414005,-0.011635,-0.133113,0.528074,0.203763,0.058233,0.152511
1,0.380703,0.146932,-0.67699,0.69718,-0.191326,-0.795291,-0.073828,-0.022798,0.146404,-0.351722,...,-0.24873,0.936092,-0.049848,0.22363,0.486739,-0.323947,0.403675,0.370061,0.001408,0.433913
2,0.056824,-0.039017,-0.009687,0.513748,0.085952,-0.055658,-0.575753,-0.191573,0.074893,-0.363949,...,0.061505,-0.082326,-0.020939,-0.121216,0.395027,0.183337,-0.06661,-0.073608,0.199782,0.07007
3,0.018016,-0.691554,-0.266602,0.242875,0.437788,-0.002795,-0.888622,-0.040678,-0.397648,-0.377628,...,0.24646,0.733048,0.069134,0.179803,0.584988,-0.214652,0.151292,-0.26506,-0.621677,0.544981
4,0.226495,0.072964,-0.369295,0.07758,-0.301654,-0.357406,-0.219053,-0.949747,0.114092,-0.135188,...,0.518394,0.620391,0.052041,-0.260785,0.412637,-0.066304,0.150814,0.908549,0.085849,0.84893
5,0.186471,-0.102612,-0.62424,0.277194,-0.080588,-0.50531,-0.565652,-0.31429,0.112812,-0.223657,...,0.08991,0.853435,-0.043592,-0.19654,0.388729,-0.17596,0.385507,0.259448,-0.125765,0.534994


In [78]:
len(combi_vec_df) == len(combined_set)

True

In [79]:
X_set = combi_vec_df.iloc[:7613,:]
X_set

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.551448,-0.003809,-0.532317,1.049268,-0.209272,-0.046728,0.003296,-0.681873,0.402447,-0.248552,...,-0.077578,-0.660300,-0.697649,-0.414005,-0.011635,-0.133113,0.528074,0.203763,0.058233,0.152511
1,0.380703,0.146932,-0.676990,0.697180,-0.191326,-0.795291,-0.073828,-0.022798,0.146404,-0.351722,...,-0.248730,0.936092,-0.049848,0.223630,0.486739,-0.323947,0.403675,0.370061,0.001408,0.433913
2,0.056824,-0.039017,-0.009687,0.513748,0.085952,-0.055658,-0.575753,-0.191573,0.074893,-0.363949,...,0.061505,-0.082326,-0.020939,-0.121216,0.395027,0.183337,-0.066610,-0.073608,0.199782,0.070070
3,0.018016,-0.691554,-0.266602,0.242875,0.437788,-0.002795,-0.888622,-0.040678,-0.397648,-0.377628,...,0.246460,0.733048,0.069134,0.179803,0.584988,-0.214652,0.151292,-0.265060,-0.621677,0.544981
4,0.226495,0.072964,-0.369295,0.077580,-0.301654,-0.357406,-0.219053,-0.949747,0.114092,-0.135188,...,0.518394,0.620391,0.052041,-0.260785,0.412637,-0.066304,0.150814,0.908549,0.085849,0.848930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.756646,0.336823,-1.375950,1.407096,-1.459896,-1.300628,0.597647,-0.688954,-1.042763,-0.630698,...,0.289807,0.193673,0.761843,0.537015,-0.657426,-0.375814,0.862334,0.091638,-0.341366,0.836314
7609,0.209851,0.030489,-0.897099,0.414619,-0.018108,-0.360418,0.351974,-0.267052,-0.129436,0.144177,...,0.126554,0.726742,0.309633,-0.270716,0.561797,0.267779,0.381187,0.310755,-0.600332,0.285834
7610,-0.785950,0.066653,-0.168872,0.304885,0.105290,-0.260047,-0.292418,-0.867505,0.079486,-0.617568,...,-0.260996,0.074091,-0.613147,0.103605,-0.201183,0.095131,0.824741,-0.352929,-0.165412,0.403051
7611,-0.340479,-0.111773,-0.322075,0.357046,-0.980287,-0.770177,-0.359815,-0.456515,-0.030048,-0.304642,...,-0.590140,-0.040681,-0.056649,-0.005476,0.290753,0.903904,0.122785,-0.879250,-0.091253,0.241349


In [80]:
X_test_new = combi_vec_df.iloc[7613:,:]
X_test_new = X_test_new.reset_index()
X_test_new = X_test_new.drop('index',axis=1)
X_test_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.374241,-0.186295,0.059741,0.541295,-0.734209,-0.551134,-0.494311,-0.091216,-0.006631,0.044688,...,0.101493,-0.726872,-0.267258,0.922473,0.011315,0.630171,-0.764561,-0.815824,-0.342422,0.203298
1,-0.306095,-0.319479,-0.346668,0.653790,0.025278,-0.064576,-0.436462,-0.077877,0.218220,0.405344,...,0.287805,0.130226,0.008259,-0.064075,0.305192,0.082913,-0.303587,-0.727569,-0.138129,-0.217420
2,0.153551,0.154221,-0.838283,0.414655,-0.019505,-0.565909,-0.087815,0.028767,0.058894,-0.031414,...,-0.148354,0.901535,0.376627,0.554987,0.195869,-0.016263,0.619623,0.092698,-0.500181,0.305460
3,-0.365271,0.158651,-0.425171,-0.279986,0.485152,-0.135406,-0.507950,-0.572697,0.235770,-0.213738,...,0.335569,0.619444,0.256091,0.372662,0.242699,-0.381036,-0.098040,-0.072523,-0.538159,0.577369
4,0.662656,-0.420061,-0.379874,0.585516,-0.580210,-0.737138,-0.287645,-1.427698,-0.313027,-0.463885,...,-0.514204,-1.390828,0.287076,0.096575,-0.752741,0.222136,0.021286,-0.238772,-0.848385,-0.116089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,0.084091,0.152443,0.166929,0.490567,-0.193986,-0.370804,-0.253940,-0.093995,0.109593,-0.383947,...,0.042527,-0.219164,-0.711273,-0.067130,0.454375,0.671480,0.274053,-0.249760,-0.696814,0.629982
3259,-0.343738,-0.323137,-0.903390,-0.071720,0.050620,-0.067601,-0.808749,-0.362818,-0.299109,0.527978,...,0.113138,0.527201,0.128281,0.196745,0.039041,0.857115,-0.279694,-0.164667,-0.375729,-0.237152
3260,0.951464,-0.282415,-0.688223,0.990391,-1.485964,-1.075376,-1.147953,0.125107,-0.053832,-0.652368,...,0.334302,-0.119791,0.087239,0.323978,0.904037,-0.257896,-0.575878,0.889553,0.278754,0.728326
3261,-0.216163,0.281501,-0.436767,0.572762,-0.705686,0.201328,-0.575105,-0.906678,0.378271,-0.117378,...,-0.124983,-0.133476,-0.163490,0.354506,0.331231,-0.788983,-0.145032,0.637534,0.717948,-0.011222


In [81]:
X_test_new.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
count,3263.0,3263.0,3263.0,3263.0,3263.0,3263.0,3263.0,3263.0,3263.0,3263.0,...,3263.0,3263.0,3263.0,3263.0,3263.0,3263.0,3263.0,3263.0,3263.0,3263.0
mean,-0.061086,0.009537,-0.367141,0.390569,-0.392672,-0.300767,-0.09769,-0.234311,0.107271,0.077724,...,0.070837,-0.045458,0.117192,0.152226,0.02899,0.081914,0.070398,0.05788,-0.122999,0.058421
std,0.355326,0.352846,0.421551,0.403083,0.486744,0.363081,0.440924,0.369211,0.375147,0.337502,...,0.34318,0.476528,0.390135,0.388059,0.471877,0.414874,0.436735,0.461809,0.406672,0.397146
min,-1.975947,-1.451937,-2.786132,-2.45514,-2.81534,-2.435936,-1.967262,-1.889689,-1.442623,-1.693726,...,-1.51707,-2.106555,-2.444367,-1.727897,-1.896526,-1.835967,-2.050282,-2.195122,-2.595948,-2.393183
25%,-0.28101,-0.224427,-0.622765,0.130634,-0.639549,-0.505732,-0.387593,-0.453563,-0.119577,-0.126814,...,-0.130413,-0.331437,-0.129043,-0.075865,-0.260023,-0.171231,-0.204794,-0.233863,-0.362773,-0.193833
50%,-0.061767,-0.009121,-0.344045,0.375538,-0.32451,-0.274589,-0.098214,-0.219782,0.102486,0.075433,...,0.077756,-0.015499,0.105661,0.160719,0.027381,0.087135,0.072688,0.037747,-0.122519,0.043685
75%,0.15769,0.218443,-0.089551,0.629804,-0.064249,-0.067065,0.177795,-0.008056,0.339626,0.297413,...,0.289326,0.257317,0.358975,0.371689,0.317584,0.340458,0.346606,0.344495,0.12415,0.300401
max,1.425941,2.798487,3.065528,2.459918,1.135716,0.998258,1.892991,1.346805,2.221314,1.628942,...,1.560343,1.654397,1.918975,1.999441,2.41855,2.123333,1.688905,2.504392,1.890186,1.568393


In [82]:
Xvec_train ,Xvec_val,yvec_train,yvec_val = train_test_split(X_set,y ,test_size=0.2 ,random_state=42)

In [83]:
train_vec = np.array(Xvec_train)
val_vec =  np.array(Xvec_val)
test_vec = np.array(X_test_new)
ytrain_vec = np.array(yvec_train)
yval_vec = np.array(yvec_val)

#### Creating Neural Network 

##### ModelNN

In [84]:
import tensorflow
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout,Flatten,Embedding,LSTM,Bidirectional,SpatialDropout1D
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.preprocessing.text import one_hot
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences

In [85]:
call_backs = [EarlyStopping(monitor='val_loss',patience=8),
              ModelCheckpoint(filepath='best_model.h5',monitor='val_loss',
                             save_best_only=True)]

In [86]:
voc_size = 3

In [87]:
modelNN = Sequential()
modelNN.add(Dense(200,kernel_initializer='he_uniform',activation='relu',input_shape=(200,)))
modelNN.add(Dense(512,kernel_initializer='he_uniform',activation='relu'))
modelNN.add(Dropout(0.2))
modelNN.add(Dense(128,kernel_initializer='he_uniform',activation='relu'))
# modelNN.add(Dropout(0.2))
modelNN.add(Dense(64,kernel_initializer='he_uniform',activation='relu'))
modelNN.add(Dropout(0.2))                                            
modelNN.add(Dense(1,activation='sigmoid'))
modelNN.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [88]:
# modelNN.fit(train_vec , ytrain_vec ,#callbacks=call_backs,
#             batch_size = 200,validation_split=0.33,epochs=2000)

In [89]:
# preds = modelNN.predict(val_vec)
# adj = np.where(preds > 0.5 ,1,0)
# adj

In [90]:
# accuracy_score(adj,yval_vec)

In [91]:
# accuracy_NNls = []
# for pred in preds:
#     y_pred_NNreview = np.where(preds > pred ,1,0)
#     score = f1_score(y_pred_NNreview.reshape(-1,1),yval_vec.reshape(-1,1) )
#     accuracy_NNls.append(score)
# predaccu_series = pd.DataFrame(preds)
# accuracy_series = pd.Series(accuracy_NNls)
# fin_accuracyNN = pd.concat((predaccu_series,accuracy_series) ,axis=1)
# fin_accuracyNN.columns = ['prediction','accuracy']
# fin_accuracyNN.sort_values(by='accuracy',ascending=False,inplace=True)
    

In [92]:
# fin_accuracyNN

In [93]:

# adj = np.where(preds > 0.00010318 ,1,0)
# adj

In [94]:
# flaccuracy = f1_score(adj,yval_vec)
# flaccuracy

In [95]:
# NNtest_pred = modelNN.predict(test_vec_df)
# adj_test = np.where(preds > 0.410577 ,1,0)
# adj_test = adj_test.reshape(-1,1) 
# adj_test
# pd.DataFrame(adj_test)

In [96]:
# dfsubmtest = pd.read_csv("../input/nlp-getting-started/test.csv")
# submission =pd.DataFrame({'id':dfsubmtest.id,
#                          'target':})


In [97]:
# submission.to_csv('submissionNN.csv')

In [98]:
# Trainspar = tfidfVect.fit_transform(X_train)
# val_spar = tfidfVect.transform(X_val)

In [99]:
# Testspar = tfidfVect.transform(X_test)

In [100]:
# arrtrain = Trainspar.toarray()
# valarr = val_spar.toarray()
# arrtest = Testspar.toarray()

In [101]:
# print(arrtrain.shape)
# print(valarr.shape)
# print(arrtest.shape)

In [102]:
# modelNN = Sequential()
# modelNN.add(Dense(1024,kernel_initializer='he_uniform',activation='relu',input_shape=(8386,)))
# modelNN.add(Dense(1024,kernel_initializer='he_uniform',activation='relu'))
# # modelNN.add(Dropout(0.1))
# modelNN.add(Dense(512,kernel_initializer='he_uniform',activation='relu'))
# # modelNN.add(Dropout(0.2))
# modelNN.add(Dense(128,kernel_initializer='he_uniform',activation='relu'))
# # modelNN.add(Dropout(0.2))
# modelNN.add(Dense(64,kernel_initializer='he_uniform',activation='relu'))
# modelNN.add(Dense(1,activation='sigmoid'))
# modelNN.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [103]:
# modelNN.fit(arrtrain , np.array(y_train) ,
#             batch_size = 2048,validation_split=0.33,epochs= 1000)

In [104]:
# prediction_val = modelNN.predict(valarr)
# prediction_ = np.where(prediction_val > 0.72354312232 ,1,0)
# prediction_

In [105]:
# accuracy_score(prediction_,np.array(y_val))

In [106]:
# f1_score(prediction_,np.array(y_val))

In [107]:
# accuracy_NNls = []
# for pred in prediction_val:
#     y_pred_NNreview = np.where(prediction_val > pred ,1,0)
#     score = f1_score(y_pred_NNreview.reshape(-1,1),np.array(y_val).reshape(-1,1) )
#     accuracy_NNls.append(score)
# predaccu_series = pd.DataFrame(prediction_val)
# accuracy_series = pd.Series(accuracy_NNls)
# fin_accuracyNN = pd.concat((predaccu_series,accuracy_series) ,axis=1)
# fin_accuracyNN.columns = ['prediction','accuracy']
# fin_accuracyNN.sort_values(by='accuracy',ascending=False,inplace=True)
    

In [108]:
# fin_accuracyNN

#### Keras Embedding

In [109]:
vocab_size = 512

In [110]:
def onehot(data):
    data = [one_hot(i,vocab_size)for i in data]
    return data
onehotter = lambda x: onehot(x)

In [111]:
X_train_embed = token_train.apply(onehotter)
X_train_embed


0                     [[172], [367], [289], [359], [191]]
1        [[307], [479], [91], [272], [158], [107], [395]]
2       [[215], [409], [95], [500], [389], [166], [10]...
3                [[203], [509], [362], [10], [96], [421]]
4        [[511], [211], [310], [502], [362], [43], [115]]
                              ...                        
7608    [[451], [481], [490], [335], [245], [172], [17...
7609    [[258], [33], [494], [56], [32], [479], [421],...
7610                  [[121], [381], [175], [208], [416]]
7611    [[374], [32], [342], [443], [6], [374], [418],...
7612    [[56], [170], [296], [31], [421], [362], [279]...
Name: text, Length: 7613, dtype: object

In [112]:
X_test_embed = token_test.apply(onehotter)
X_test_embed

0                            [[359], [175], [374], [136]]
1               [[289], [206], [72], [395], [308], [435]]
2       [[307], [479], [388], [201], [143], [332], [35...
3                             [[232], [381], [70], [362]]
4                     [[197], [450], [238], [509], [342]]
                              ...                        
3258     [[289], [369], [352], [129], [369], [52], [184]]
3259    [[67], [194], [461], [71], [225], [72], [468],...
3260                  [[176], [270], [437], [481], [447]]
3261    [[50], [417], [91], [329], [325], [305], [224]...
3262           [[164], [465], [312], [334], [161], [211]]
Name: text, Length: 3263, dtype: object

In [113]:
sent_length = 20
X_train_pad = pad_sequences(X_train_embed ,padding='pre',maxlen=sent_length)
X_train_pad

array([[[  0],
        [  0],
        [  0],
        ...,
        [289],
        [359],
        [191]],

       [[  0],
        [  0],
        [  0],
        ...,
        [158],
        [107],
        [395]],

       [[  0],
        [  0],
        [  0],
        ...,
        [500],
        [ 96],
        [504]],

       ...,

       [[  0],
        [  0],
        [  0],
        ...,
        [175],
        [208],
        [416]],

       [[  0],
        [  0],
        [  0],
        ...,
        [ 59],
        [285],
        [472]],

       [[  0],
        [  0],
        [  0],
        ...,
        [397],
        [162],
        [192]]], dtype=int32)

In [114]:
train_set = X_train_pad[0:6000]
train_set.shape

(6000, 20, 1)

In [115]:
val_set = X_train_pad[6000: ]
val_set.shape

(1613, 20, 1)

In [116]:
y_train_set = y[ :6000]
y_val_set = y[6000: ]
y_train_set

0       1
1       1
2       1
3       1
4       1
       ..
5995    0
5996    0
5997    0
5998    0
5999    0
Name: target, Length: 6000, dtype: int64

In [117]:
y_val_set 

6000    0
6001    0
6002    0
6003    0
6004    0
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 1613, dtype: int64

In [118]:
X_test_pad = pad_sequences(X_test_embed ,padding='pre',maxlen=sent_length)
X_test_pad.shape

(3263, 20, 1)

In [119]:
embedding_layer = Embedding(vocab_size,1024,input_length = sent_length,trainable = True)

In [120]:
call_back = [EarlyStopping(monitor='val_loss',patience=3),
              ModelCheckpoint(filepath='best_model.h5',monitor='val_loss',
                             save_best_only=True)]

In [121]:
modelEM = Sequential()
modelEM.add(embedding_layer)
modelEM.add(Dense(512,activation='relu'))
modelEM.add(Dropout(0.5))
modelEM.add(Dense(512,activation='relu'))
modelEM.add(Dropout(0.3))
modelEM.add(LSTM(512,activation="tanh",
    recurrent_activation="sigmoid",                 
    use_bias=True,
    kernel_initializer="glorot_uniform",
    recurrent_initializer="orthogonal",
    bias_initializer="zeros",
    unit_forget_bias=True,
    kernel_regularizer=None,
    recurrent_regularizer=None,
    bias_regularizer=None,
    activity_regularizer=None,
    kernel_constraint=None,
    recurrent_constraint=None,
    bias_constraint=None,
    dropout=0.2,
    recurrent_dropout=0.1))
modelEM.add(Dense(1,activation="sigmoid"))
modelEM.compile(optimizer ='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])

In [122]:
modelEM.fit(train_set,y_train_set,callbacks=call_back,
             batch_size = 200,validation_split=0.33,epochs= 500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500


<tensorflow.python.keras.callbacks.History at 0x7f83b612a750>

In [123]:
prediction_em = modelEM.predict(val_set)
prediction_ = np.where(prediction_em > 0.5,1,0)


In [124]:
accuracy_score(prediction_,y_val_set)

0.6466212027278363

In [125]:
f1_score(prediction_,y_val_set)

0.5746268656716419

In [126]:
accuracy_EMls = []
for pred in prediction_em:
    y_pred_EMreview = np.where(prediction_em > pred ,1,0)
    score = f1_score(y_pred_EMreview.reshape(-1,1),np.array(y_val_set).reshape(-1,1) )
    accuracy_EMls.append(score)
predaccu_series = pd.DataFrame(prediction_em)
accuracy_series = pd.Series(accuracy_EMls)
fin_accuracyEM = pd.concat((predaccu_series,accuracy_series) ,axis=1)
fin_accuracyEM.columns = ['prediction','accuracy']
fin_accuracyEM.sort_values(by='accuracy',ascending=False,inplace=True)
    

In [127]:
fin_accuracyEM

Unnamed: 0,prediction,accuracy
1463,0.126013,0.641328
1002,0.125821,0.641048
832,0.110882,0.640970
1367,0.116914,0.640903
185,0.120541,0.640836
...,...,...
1506,0.920390,0.010554
1052,0.922039,0.007926
28,0.932086,0.002649
35,0.932086,0.002649


#### Bidirectional LSTM

In [128]:

bidirect_model = Sequential()
bidirect_model.add(embedding_layer)
bidirect_model.add(SpatialDropout1D(0.2))
bidirect_model.add(Dense(512,activation='relu'))
bidirect_model.add(Dropout(0.3))
bidirect_model.add(Bidirectional(LSTM(128, return_sequences=False,dropout=0.2, recurrent_dropout=0.2)))
bidirect_model.add(Dense(1, activation='sigmoid'))

In [129]:
optimizer = Adam(learning_rate = 3e-4)
bidirect_model.compile(optimizer=optimizer,loss='binary_crossentropy',metrics = ['accuracy'])

In [130]:
bidirect_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 1024)          524288    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 20, 1024)          0         
_________________________________________________________________
dense_8 (Dense)              (None, 20, 512)           524800    
_________________________________________________________________
dropout_4 (Dropout)          (None, 20, 512)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               656384    
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 257       
Total params: 1,705,729
Trainable params: 1,705,729
Non-trainable params: 0
____________________________________________

In [131]:
bidirect_model.fit(train_set,y_train_set,callbacks=call_back,
             batch_size = 200,validation_split=0.33,epochs= 30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30


<tensorflow.python.keras.callbacks.History at 0x7f83737f7810>

In [132]:
bi_pred = bidirect_model.predict(val_set)
bi_pred = np.where(bi_pred > 0.5 ,1,0)
bi_pred

array([[0],
       [0],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [133]:

accuracy_score(bi_pred,y_val_set)

0.6615003099814011

In [134]:
test_pred = np.where(bidirect_model.predict(X_test_pad) > 0.5 , 1,0)
test_pred

array([[1],
       [0],
       [1],
       ...,
       [0],
       [1],
       [0]])

In [135]:
def submit(modelarch):
    test_pred = np.where(modelarch.predict(X_test_pad) > 0.5 , 1,0)
    target_lst = []
    for i in test_pred:
        if i < 0.5:
            i = 0
        else:
            i = 1
        target_lst.append(i)
    submitset = pd.DataFrame({'id':pd.read_csv('../input/nlp-getting-started/test.csv').id,
                      'target':target_lst})
    submitset.set_index('id',inplace=True)
                            
    return submitset

In [136]:
submissionEMBED = submit(modelEM)
submissionEMBED

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,0
3,1
9,1
11,1
...,...
10861,0
10865,0
10868,0
10874,1


In [137]:
submissionBID = submit(bidirect_model)
submissionBID

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,0
3,1
9,1
11,1
...,...
10861,0
10865,1
10868,0
10874,1


In [138]:
submissionEMBED.to_csv('submissionEMBED.csv')


In [139]:
submissionBID.to_csv('submissionBID.csv')