In [1]:
import csv
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
data_train = pd.read_csv("data/dataset/train.csv")
data_test = pd.read_csv("data/dataset/test.csv")

In [3]:
data_train.shape

(3235, 6)

In [4]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3235 entries, 0 to 3234
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               3235 non-null   float64
 1   original_text    3235 non-null   object 
 2   lang             3231 non-null   object 
 3   retweet_count    3231 non-null   object 
 4   original_author  3235 non-null   object 
 5   sentiment_class  3235 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 151.8+ KB


In [5]:
data_train['lang'].fillna(data_train['lang'].mode()[0],inplace=True)
data_train['retweet_count'].fillna(data_train['retweet_count'].mode()[0],inplace=True)

In [6]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3235 entries, 0 to 3234
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               3235 non-null   float64
 1   original_text    3235 non-null   object 
 2   lang             3235 non-null   object 
 3   retweet_count    3235 non-null   object 
 4   original_author  3235 non-null   object 
 5   sentiment_class  3235 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 151.8+ KB


In [7]:
data_train['lang'].describe()

count     3235
unique     232
top         en
freq      2998
Name: lang, dtype: object

In [8]:
len(data_train[(data_train['lang']=='en')])

2998

In [9]:
data_train.drop(['id','lang'],axis=1,inplace=True)

In [10]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3235 entries, 0 to 3234
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   original_text    3235 non-null   object
 1   retweet_count    3235 non-null   object
 2   original_author  3235 non-null   object
 3   sentiment_class  3235 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 101.2+ KB


In [28]:
data_train['original_text'].tail()

3230    To all my sisters ,my sisters -in -law and als...
3231    Happy Mother’s Day to all the Mums, Step Mums,...
3232    Happy Mothers Day to the craziest woman I know...
3233    Happy Mother's Day to my amazing wife! We both...
3234    Wishing you all a safe & happy Mothers Day #mo...
Name: original_text, dtype: object

In [14]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from string import punctuation

In [15]:
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [57]:
x_train = data_train['original_text']
y_train = data_train['sentiment_class']

In [143]:
#EXPRESIONES REGULARES
import re

isURL = re.compile(r'http[s]?:// (?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', re.VERBOSE | re.IGNORECASE)
isURLTw = re.compile(r'pic.(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', re.VERBOSE | re.IGNORECASE)
isRTusername = re.compile(r'^RT+[\s]+(@[\w_]+:)',re.VERBOSE | re.IGNORECASE)

def clean_tweet(text):
    if isURL.search(text):
        index = isURL.search(text).span()
        text = text[0:index[0]]
    text = isURL.sub("",text)
    if isURLTw.search(text):
        index = isURLTw.search(text).span()
        text = text[0:index[0]]
    text = isURLTw.sub("",text)
    
    RT_text = isRTusername.search(text)
    text = isRTusername.sub("",text)
    
    En_text = []
    for item in [i for i in text.split() if i.startswith("@") ]:
        text = re.sub(item,"",text)
        item = item[1:]
        item = ''.join([c for c in item if c not in punctuation])
        En_text.append(item)
    
    Hash_text = []
    for item in [i[1:]  for i in text.split() if i.startswith("#") ]:
        text = re.sub(item,"",text)
        item = item[1:]
        item = ''.join([c for c in item if c not in punctuation])
        Hash_text.append(item)
    
    text = text.lower()
    text = ''.join([c for c in text if c not in punctuation])
    
    dict_final = {"text":text,
                  "entity":RT_text,
                  "persons":En_text,
                  "hashtag":Hash_text
                }
    return dict_final

#NORMALIZE DATA
def normalization(tweet_list):
    lem = WordNetLemmatizer()
    normalized_tweet = []
    for word in tweet_list:
        normalized_text = lem.lemmatize(word,'v')
        normalized_tweet.append(normalized_text)
    return normalized_tweet
#AUN NO USO LA NORMALIZACION CON TODAAAAS LAS PALABRAS, PERO QUIZAS SEA MEJOR

In [148]:
data_final=[]
for x in x_train:
    data_final.append(clean_tweet(x))
x_train_final = pd.DataFrame(data_final)

In [150]:
x_train_final.tail()

Unnamed: 0,text,entity,persons,hashtag
3230,to all my sisters my sisters in law and also m...,,[],[]
3231,happy mother’s day to all the mums step mums a...,,[],"[airbyemmabroadbent, airstylist, airdresser, a..."
3232,happy mothers day to the craziest woman i know...,,[],[]
3233,happy mothers day to my amazing wife we both l...,,[],"[othersDay, otheringSunday]"
3234,wishing you all a safe happy mothers day ...,,[Reedham],"[othersday, eedham, elax]"


In [168]:
MAX_WORDS = 100000 #NUMERO DE PALABRAS TOTALES
MAX_LEN = 80 #NUMERO DE PALABRAS EN UN TEXT
#TOKENIZACION
token = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
token.fit_on_texts(list(x_train_final['text']))
#lista de secuencias
x_train_text = token.texts_to_sequences(x_train_final['text'])
x_train_text = pad_sequences(x_train_text,maxlen=MAX_LEN)

word_in = token.word_index

In [169]:
x = pd.DataFrame(x_train_text)
y = pd.get_dummies(y_train).values

In [170]:
from sklearn.model_selection import train_test_split

X1_train, X1_test, Y1_train, Y1_test = train_test_split(x,y, random_state = 42)
print(X1_train.shape,Y1_train.shape)
print(X1_test.shape,Y1_test.shape)

(2426, 80) (2426, 3)
(809, 80) (809, 3)


In [177]:
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM,Dropout
from keras.utils.np_utils import to_categorical
max_fatures = MAX_WORDS
#https://blog.usejournal.com/sentiment-classification-with-natural-language-processing-on-lstm-4dc0497c1f19
embed_dim = 150
lstm_out = 200
model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = x.shape[1]))
model.add(Dropout(0.2))
model.add(LSTM(lstm_out))
model.add(Dropout(0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 80, 150)           15000000  
_________________________________________________________________
dropout_2 (Dropout)          (None, 80, 150)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 200)               280800    
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 3)                 603       
Total params: 15,281,403
Trainable params: 15,281,403
Non-trainable params: 0
_________________________________________________________________
None


In [178]:
batch_size = 32
model.fit(X1_train, Y1_train, nb_epoch = 3, batch_size=batch_size, verbose = 2)
score,acc = model.evaluate(X1_test, Y1_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

Train on 2426 samples
Epoch 1/3
2426/2426 - 49s - loss: 1.0373 - accuracy: 0.5293
Epoch 2/3
2426/2426 - 28s - loss: 0.9758 - accuracy: 0.5379
Epoch 3/3
2426/2426 - 30s - loss: 0.7452 - accuracy: 0.6851
809/809 - 12s - loss: 1.1461 - accuracy: 0.4116
score: 1.15
acc: 0.41
