In [100]:
import pandas as pd
import re

In [123]:
train = pd.read_csv("data/Corona_NLP_train.csv", encoding='ISO-8859-1')
test = pd.read_csv("data/Corona_NLP_test.csv", encoding='ISO-8859-1')

In [124]:
df = pd.concat([test, train], ignore_index=True)

In [125]:
df['TweetAt'] = pd.to_datetime(df['TweetAt'], format="%d-%m-%Y")


In [126]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44955 entries, 0 to 44954
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   UserName       44955 non-null  int64         
 1   ScreenName     44955 non-null  int64         
 2   Location       35531 non-null  object        
 3   TweetAt        44955 non-null  datetime64[ns]
 4   OriginalTweet  44955 non-null  object        
 5   Sentiment      44955 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 2.1+ MB


In [127]:
df = df.sort_values(by='TweetAt').reset_index(drop=True)

In [128]:
df = df[["TweetAt", "OriginalTweet", "Sentiment"]]

In [129]:
print(str(df["TweetAt"].min()) + "  <-- min date")
print(str(df["TweetAt"].max()) + "  <-- max date")

2020-03-02 00:00:00  <-- min date
2020-04-14 00:00:00  <-- max date


In [130]:
text = df[["TweetAt", "OriginalTweet"]]

In [131]:
text["length"] = text["OriginalTweet"].apply(lambda x: len(x))

In [132]:
mentions = text['OriginalTweet'].str.findall(r'(@\w+)')

In [133]:
text.insert(3,'nbr_mentions', mentions.apply(lambda x : len(x)))

In [134]:
mentions = mentions.explode().value_counts(True)
mentions

@realDonaldTrump    0.014899
@Tesco              0.010376
@sainsburys         0.009418
@BorisJohnson       0.008141
@amazon             0.006385
                      ...   
@Billtony91         0.000053
@BBCEngland         0.000053
@parademag          0.000053
@JohnFMauldin       0.000053
@TartiiCat          0.000053
Name: OriginalTweet, Length: 11163, dtype: float64

In [135]:
def remove_usernames_links(df):
    df = re.sub('#[^\s]+', '', df)
    df = re.sub('@[^\s]+','',df)
    df = re.sub('http[^\s]+','',df) 
    df = re.sub(' +', ' ', df)
    return df

In [136]:
df["text_without_tags"] = df["OriginalTweet"].apply(remove_usernames_links)

In [137]:
df

Unnamed: 0,TweetAt,OriginalTweet,Sentiment,text_without_tags
0,2020-03-02,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,TRENDING: New Yorkers encounter empty supermar...
1,2020-03-02,When I couldn't find hand sanitizer at Fred Me...,Positive,When I couldn't find hand sanitizer at Fred Me...
2,2020-03-02,Find out how you can protect yourself and love...,Extremely Positive,Find out how you can protect yourself and love...
3,2020-03-02,#Panic buying hits #NewYork City as anxious sh...,Negative,buying hits City as anxious shoppers stock up...
4,2020-03-03,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,One week everyone buying baby milk powder the...
...,...,...,...,...
44950,2020-04-14,Breweries are making hand sanitizer over booze...,Positive,Breweries are making hand sanitizer over booze...
44951,2020-04-14,Just scolded my dad who wanted to go supermark...,Extremely Negative,Just scolded my dad who wanted to go supermark...
44952,2020-04-14,COVID-19 Special LIVE Phone-In Program with Sh...,Positive,COVID-19 Special LIVE Phone-In Program with Sh...
44953,2020-04-14,"We may be saying goodbye to paper flyers soon,...",Positive,"We may be saying goodbye to paper flyers soon,..."


In [138]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [139]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rogermauvois/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rogermauvois/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [140]:
def remove_stopwords(text):
    words = word_tokenize(text)
    words_filtered = [word for word in words if word.lower() not in stop_words]
    return " ".join(words_filtered)

In [141]:
df['text_without_tags'] = df['text_without_tags'].apply(lambda x: remove_stopwords(x.lower()))

In [142]:
df

Unnamed: 0,TweetAt,OriginalTweet,Sentiment,text_without_tags
0,2020-03-02,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,trending : new yorkers encounter empty superma...
1,2020-03-02,When I couldn't find hand sanitizer at Fred Me...,Positive,"could n't find hand sanitizer fred meyer , tur..."
2,2020-03-02,Find out how you can protect yourself and love...,Extremely Positive,find protect loved ones ?
3,2020-03-02,#Panic buying hits #NewYork City as anxious sh...,Negative,buying hits city anxious shoppers stock food &...
4,2020-03-03,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,one week everyone buying baby milk powder next...
...,...,...,...,...
44950,2020-04-14,Breweries are making hand sanitizer over booze...,Positive,breweries making hand sanitizer booze help fig...
44951,2020-04-14,Just scolded my dad who wanted to go supermark...,Extremely Negative,scolded dad wanted go supermarket walk walk to...
44952,2020-04-14,COVID-19 Special LIVE Phone-In Program with Sh...,Positive,covid-19 special live phone-in program shri k....
44953,2020-04-14,"We may be saying goodbye to paper flyers soon,...",Positive,"may saying goodbye paper flyers soon , thanks"


### Modèle

In [144]:
sentiment_mapping = {
    'Extremely Negative': -1,
    'Negative': -0.5,
    'Neutral': 0,
    'Positive': 0.5,
    'Extremely Positive': 1
}

In [145]:
df['Sentiment'] = df['Sentiment'].map(sentiment_mapping)

In [157]:
from tensorflow.keras.layers import Dense, Embedding, LSTM, TextVectorization
from tensorflow.keras import Sequential


In [147]:
from sklearn.model_selection import train_test_split

In [148]:
X = df["text_without_tags"].values
y = df["Sentiment"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [149]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.30, random_state=42)

In [150]:
X_train.shape, X_val.shape, X_test.shape

((26747,), (11464,), (6744,))

In [155]:
import numpy as np

In [159]:
embedding_dim = 100
max_tokens = 20000
embedding_matrix = np.zeros((max_tokens, embedding_dim))

In [160]:
model = Sequential()

model.add(TextVectorization(max_tokens = max_tokens, output_sequence_length = 50, name = "text_vectorizer"))
model.add(Embedding(
    max_tokens,
    embedding_dim
    # embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    # trainable=False, 
    # mask_zero=True,
    )
)
model.add(LSTM(100))
model.add(Dense(1, activation="tanh"))

model.compile(optimizer="adam", loss= "")

model.summary()



ValueError: This model has not yet been built. Build the model first by calling `build()` or by calling the model on a batch of data.

In [None]:
text_only_train_ds = train_ds.map(lambda x, y: x)

max_length = 600 # Longueur max des entrées (seul 5% des reviews font plus de 600 mots)
max_tokens = 20000 # Limite le vocabulaire aux 20_000 mots les plus utilisés
text_vectorization = layers.TextVectorization(
 max_tokens=max_tokens,
 output_mode="int", # Les sorties sont des séquences de mots encodées comme des entiers
 output_sequence_length=max_length,
)
text_vectorization.adapt(text_only_train_ds)
int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)