## Import libraries

In [54]:
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec

import pandas as pd

## Read the data

In [41]:
# Load the training CSV into `df` and preview its first rows.
train_csv = 'train_processed.csv'
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,id,keyword,location,text,target,processed_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquak may allah forgiv
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near rong sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resid ask place notifi offic evacu shelter pla...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi alaska smoke wildfir pour ...


## Data Cleaning

In [42]:
# Discard the columns that no later cell reads. Reassigning (instead of
# `inplace=True`) together with `errors='ignore'` keeps this cell idempotent:
# re-running it on an already-trimmed frame no longer raises KeyError.
df = df.drop(columns=['id', 'keyword', 'location'], errors='ignore')

In [43]:
# Count the missing values in each remaining column.
df.isna().sum()

text              0
target            0
processed_text    4
dtype: int64

In [44]:
# Keep only the rows that have no missing value in any column.
df = df.dropna()

## Save the data to a corpus

In [45]:
text_file_path = 'corpus.txt'

# The models below read this file through gensim's `corpus_file` argument,
# which expects plain text with one whitespace-separated sentence per line.
# `Series.to_csv` is not suitable for that: it wraps any tweet containing a
# comma, quote or newline in double quotes (doubling inner quotes), so CSV
# punctuation leaks into the tokens and multi-line tweets span several lines.
# Write the lines ourselves instead, collapsing all whitespace (including
# embedded newlines) to single spaces so that each tweet is exactly one line.
# NOTE(review): this keeps the raw `text` column as in the original cell; the
# cleaned `processed_text` column may have been the intended corpus - confirm.
with open(text_file_path, 'w', encoding='utf-8') as corpus_out:
    for tweet in df['text']:
        sentence = ' '.join(str(tweet).split())
        if sentence:  # skip tweets that are empty after normalisation
            corpus_out.write(sentence + '\n')

In [None]:
# Path of the training corpus handed to the embedding models below
# (the same file name the corpus-saving cell writes to).
corpus_file = "corpus.txt"

## FastText Model

In [52]:
# Hyperparameters for the FastText model (sg=1 selects skip-gram in gensim).
fasttext_params = dict(vector_size=100, window=5, min_count=5, workers=4, sg=1)
model = FastText(**fasttext_params)

# First pass over the corpus: collect the vocabulary and corpus statistics.
model.build_vocab(corpus_file=corpus_file)

# Second pass: train, reusing the statistics gathered by build_vocab.
model.train(
    corpus_file=corpus_file,
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words,
    epochs=model.epochs,
)

# Sanity checks: nearest neighbours of 'fire', then the 'fire'/'flood' similarity.
fasttext_fire_neighbours = model.wv.most_similar('fire')
print(fasttext_fire_neighbours)
fasttext_fire_flood = model.wv.similarity('fire', 'flood')
print(fasttext_fire_flood)

[('fires.', 0.9738218784332275),
 ('fires', 0.970072865486145),
 ('rare', 0.9613994359970093),
 ('fire.', 0.9588819146156311),
 ('Here', 0.9545063376426697),
 ('fatalities', 0.954184353351593),
 ('desires', 0.9512333273887634),
 ('fat', 0.9481849670410156),
 ('entire', 0.9475743770599365),
 ('causes', 0.9471319317817688)]
0.88607633


## Word2Vec Model

In [53]:
# Hyperparameters for the Word2Vec model, identical to the FastText settings
# (sg=1 selects skip-gram in gensim). Note that `model` is rebound here, so
# the FastText model trained earlier is no longer reachable under that name.
word2vec_params = dict(vector_size=100, window=5, min_count=5, workers=4, sg=1)
model = Word2Vec(**word2vec_params)

# First pass over the corpus: collect the vocabulary and corpus statistics.
model.build_vocab(corpus_file=corpus_file)

# Second pass: train, reusing the statistics gathered by build_vocab.
model.train(
    corpus_file=corpus_file,
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words,
    epochs=model.epochs,
)

# Sanity checks: nearest neighbours of 'fire', then the 'fire'/'flood' similarity.
word2vec_fire_neighbours = model.wv.most_similar('fire')
print(word2vec_fire_neighbours)
word2vec_fire_flood = model.wv.similarity('fire', 'flood')
print(word2vec_fire_flood)

[('truck', 0.9356878399848938),
 ('were', 0.931496798992157),
 ('two', 0.9256762266159058),
 ('on', 0.9133358597755432),
 ('people', 0.9114479422569275),
 ('buildings', 0.9094985723495483),
 ('collided', 0.9072275757789612),
 ('death', 0.9035748839378357),
 ('first', 0.9017435312271118),
 ('least', 0.899533748626709)]
0.85636294


## GloVe Model

In [57]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

On remarque que le modèle FastText est plus performant que Word2Vec et GloVe.

## Conclusion

FastText est plus performant que Word2Vec et GloVe car il prend en compte les sous-mots. C'est-à-dire que si un mot n'est pas dans le vocabulaire d'entraînement, FastText peut tout de même lui construire un vecteur à partir des sous-mots (n-grammes de caractères) qui le composent.