# Install Dependency

In [1]:
!pip install tqdm   





# Import lib

In [2]:
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [3]:
from tqdm import tqdm
tqdm.pandas()

In [4]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to C:\Users\Imad Eddine
[nltk_data]     Hajjane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Imad Eddine
[nltk_data]     Hajjane\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Imad Eddine
[nltk_data]     Hajjane\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Imad Eddine
[nltk_data]     Hajjane\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
stop_words = set(stopwords.words('english')).difference(set(("never", "not","no")))

In [6]:
pd.set_option('display.max_colwidth', None)

In [7]:
import tensorflow as tf

# Import data and clean it

In [8]:
def removeSpeCara(s:str):
    return re.sub(r"[^a-zA-Z]", "", s) 

#retirer les url
def remove_url(s:str)->str:
    url_pattern = re.compile(r"http?://\S+|https?://\S+|www\.\S+|//S+")
    return url_pattern.sub("r", s)

#retirer les html 
def remove_html(s:str)->str:
    html_pattern = re.compile(r"<.*?>")
    return html_pattern.sub("r", s)

# retirer les emojies
def remove_emoji(s:str)->str:
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF" 
        u"\U0001F680-\U0001F6FF"  
        u"\U0001F1E0-\U0001F1FF"  
        u"\U00002500-\U00002BEF"  
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
                      "]+", re.UNICODE)
    return emoji_pattern.sub("r", s)

def clean_and_lemmatize_string(s:str):
    l = []
    s_ = " ".join([remove_html(remove_url(word)) for word in s.split()])
    for word in word_tokenize(s_):
        word_ = removeSpeCara((remove_emoji(word)))
        if not word_ in stop_words and len(word_) > 1:
                l.append(word_.lower())
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    lemma_function = WordNetLemmatizer()
    return " ".join([lemma_function.lemmatize(token, tag_map[tag[0]]) for token, tag in pos_tag(l)])

In [9]:
dataframe = pd.read_csv(os.path.join("..", "tweets_01-08-2021.csv"))[["id", "text"]]
dataframe_res = dataframe.copy()
dataframe_res["text_clean"] = dataframe_res["text"]

In [None]:
dataframe_res["text_clean"] = dataframe_res["text_clean"].progress_apply(lambda s : clean_and_lemmatize_string(s))
dataframe_res = dataframe_res[dataframe_res["text_clean"].str.len() >= 3]

  7%|â–‹         | 3971/56571 [00:03<00:39, 1332.29it/s]

In [None]:
dataframe_res

# Prediction with CNN and LSTM

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
MAX_SEQUENCE_LENGTH = 300
EMBEDDING_DIM = 300

In [None]:
vectorizer = TfidfVectorizer(lowercase=False, sublinear_tf=True, dtype=np.float32)
vectors = vectorizer.fit_transform(dataframe_res["text_clean"] )
terms = vectorizer.get_feature_names_out()

In [None]:
vocab_size = len(terms)
oov_tok = ''
embedding_dim = EMBEDDING_DIM
max_length = MAX_SEQUENCE_LENGTH
padding_type='post'
trunc_type='post'
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(dataframe_res["text_clean"])

text_sequences = tokenizer.texts_to_sequences(dataframe_res["text_clean"])
text_padded = pad_sequences(text_sequences, padding='post', maxlen=max_length)

In [None]:
text_padded.shape

In [None]:
from tensorflow.python.client import device_lib
tf.debugging.set_log_device_placement(True)
gpus = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(gpus[0], 'GPU')

- **LSTM**

In [None]:
model_lstm = tf.keras.models.load_model(os.path.join("Bi-LSTM","Model", "bidirectional_lstm_NN.h5"))

In [None]:
model_lstm.summary()

In [None]:
dataframe_res["Opinion_lstm"] = np.asarray([("Positive" if prediction>=0.5 else "Negative") for prediction in model_lstm.predict(text_padded)])

- **CNN**

In [None]:
model_cnn = tf.keras.models.load_model(os.path.join("CNN","Model", "CNN_2.h5"))

In [None]:
model_cnn.summary()

In [None]:
dataframe_res["Opinion_cnn"] = np.asarray([("Positive" if prediction>=0.5 else "Negative") for prediction in model_cnn.predict(text_padded)])

In [None]:
dataframe_res = dataframe_res.drop("text_clean", axis=1)

In [None]:
dataframe_res

# Save the dataframe

In [None]:
dataframe_res.to_csv(os.path.join("result","trump_tweet_opinion_cnn_lstm.csv")) 