In [23]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, warnings
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from models import CleanData

warnings.filterwarnings("ignore")

In [2]:
positive = pd.read_csv("data/tweets_pos_clean.csv")
negative = pd.read_csv("data/tweets_neg_clean.csv")

In [3]:
positive["Target"] = [0 for i in positive["Tweets"]]
negative["Target"] = [1 for i in negative["Tweets"]]

In [4]:
print("Positives Tweets:", len(positive))
print("Negative Tweets:", len(negative))

Positives Tweets: 55056
Negative Tweets: 120948


In [5]:
df = positive.merge(negative, how="outer")
df

Unnamed: 0,Tweets,Target
0,Se imaginan a los chicos agradeciendo por el p...,0
1,"Eclesiastes4:9-12 ♡ Siempre, promesa :) https...",0
2,"@pedroj_ramirez Qué saborío, PJ. ya no compart...",0
3,Buenos dias para todos. Feliz inicio de semana...,0
4,"@pepedom @bquintero Gracias! No es así, deja c...",0
...,...,...
175999,Pero... Dime que no te perderé del todo :( ❤💛💚,1
176000,Yo creo que a Colocolo le hacía falta un parti...,1
176001,@seru15 son para niño :( quisiera quedarmelos.,1
176002,Diganle al sonidero que ya le baje a su desmad...,1


In [6]:
# df["Tweets"] = df["Tweets"].apply(CleanData().remove_links)
# df["Tweets"] = df["Tweets"].apply(CleanData().clean_emojis)
# df["Tweets"] = df["Tweets"].apply(CleanData().remove_stopwords)
# df["Tweets"] = df["Tweets"].apply(CleanData().signs_tweets)
# df["Tweets"] = df["Tweets"].apply(CleanData().remove_doubles)
# df["Tweets"] = df["Tweets"].apply(CleanData().clean_laughs)
# df["Tweets"] = df["Tweets"].apply(CleanData().remove_mentions_hashtags)

# df.to_csv("data/data_cleaned.csv", index=False)

In [7]:
df = pd.read_csv("data/data_cleaned.csv").dropna()

In [8]:
df[["Tweets"]]

Unnamed: 0,Tweets
0,se imaginan chicos agradeciendo premio cara or...
1,eclesiastes siempre promesa {link}
2,pedroj_ramirez qué saborío pj compartes ginton...
3,buenos dias todos feliz inicio semana {link}
4,pepedom bquintero gracias no así deja claro a...
...,...
175999,pero dime perderé
176000,yo creo colocolo hacía falta partido así mas p...
176001,seru niño quisiera quedarmelos
176002,diganle sonidero baje desmadre


In [9]:
df.Tweets[170001]

'jbartolomero cómo ves economía asiática hasta punto crees q afectará a da miedoo  buff'

In [10]:
vectorizer = CountVectorizer(ngram_range=(1,2))

In [24]:
#####################################################################################################################################

logistic_pipe = Pipeline([("vect", vectorizer), ("cls", LogisticRegression())]) # Logistic Regression

logistic_params = {"vect__max_df": (0.5, 1), "vect__min_df": (10, 20, 50), "cls__penalty": ["l1","l2"], 
"cls__C": [0.1, 1.0], "cls__solver" : ["newton-cg"]}

log_reg = GridSearchCV(logistic_pipe, logistic_params, cv=3, scoring="accuracy")

#####################################################################################################################################

svc_pipe = Pipeline([("vect", vectorizer), ("cls", LinearSVC())]) # Linear Support Vector Machine

svc_params = {"cls__C": [0.001, 0.1, 1, 10, 100], "cls__loss": ["hinge", "squared_hinge"], "cls__penalty" : ["l1", "l2"]}

svc = GridSearchCV(svc_pipe, svc_params, cv=3, scoring="accuracy")

#####################################################################################################################################

xgb = Pipeline([("vect", vectorizer), ("cls", XGBClassifier())]) # XGB Classifier

#####################################################################################################################################

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df["Tweets"], df["Target"], test_size=0.20, random_state=24)

In [13]:
log_reg.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(ngram_range=(1, 2))),
                                       ('cls', LogisticRegression())]),
             param_grid={'cls__C': [0.1, 1.0], 'cls__penalty': ['l1', 'l2'],
                         'cls__solver': ['newton-cg'], 'vect__max_df': (0.5, 1),
                         'vect__min_df': (10, 20, 50)},
             scoring='accuracy')

In [14]:
log_reg_predictions = log_reg.predict(X_test)

log_reg_accuraccy = accuracy_score(log_reg_predictions, y_test)

print(log_reg_accuraccy)

0.8112231851115215


In [15]:
svc.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(ngram_range=(1, 2))),
                                       ('cls', LinearSVC())]),
             param_grid={'cls__C': [0.001, 0.1, 1, 10, 100],
                         'cls__loss': ['hinge', 'squared_hinge'],
                         'cls__penalty': ['l1', 'l2']},
             scoring='accuracy')

In [16]:
svc_predictions = svc.predict(X_test)

svc_accuraccy = accuracy_score(svc_predictions, y_test)

print(svc_accuraccy)

0.8177013780366529


In [25]:
xgb.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 2))),
                ('cls',
                 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               gamma=0, gpu_id=-1, grow_policy='depthwise',
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_bin=256,
                               max_cat_to_onehot=4, max_delta_step=0,
                               max_depth=6, max_leaves=0, min_child_weight=1,
                               missing=nan, monotone_constraints='()',
                               n_estimators=100, n_jobs=0, num_parallel_tree=1,
                               predictor='auto', random_state=0, reg_alpha=0,
 

In [26]:
xgb_predictions = xgb.predict(X_test)

xgb_accuraccy = accuracy_score(xgb_predictions, y_test)

print(xgb_accuraccy)

0.791589714448075


In [37]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras

In [30]:
# Converting the strings into integers using Tokenizer

max_vocab = 20000000
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(X_train)

In [34]:
# Checking the word index and find out the vocabulary of the dataset

wordidx = tokenizer.word_index

print(f"The size of dataset vocab is: {len(wordidx)}")

The size of dataset vocab is: 128416


In [33]:
# Converting train and test sentences into sequences

train_seq = tokenizer.texts_to_sequences(X_train)
test_seq = tokenizer.texts_to_sequences(X_test)
print(f"Train sequence: {train_seq[0]}")
print(f"Test sequence: {test_seq[0]}")

Train sequence: [50, 67, 72, 36, 16, 1]
Test sequence: [675, 21]


In [36]:
# Padding the sentences to get equal length sequence because it's conventional to use same size sequences

# Padding Train
pad_train = pad_sequences(train_seq)

print(f"The len of train sequence is: {pad_train.shape[1]}")


# Padding test
pad_test = pad_sequences(test_seq, maxlen=pad_train.shape[1])

print(f"The len of test sequence is: {pad_test.shape[1]}")

The len of train sequence is: 2095
The len of test sequence is: 2095


In [45]:
# Building the neural network

input_len = keras.layers.Input(shape=(pad_train.shape[1], ))

x = keras.layers.Embedding(len(wordidx) + 1, 20)(input_len) # len(wordidx) + 1 because the indexing starts from 1, not from 0

x = keras.layers.LSTM(25, return_sequences=True)(x)

x = keras.layers.GlobalMaxPool1D()(x)

x = keras.layers.Dense(32, activation="relu")(x)

x = keras.layers.Dense(1, activation="sigmoid")(x)

neural_network_model = keras.Model(input_len, x)

In [46]:
# Compiling the model

neural_network_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

earlystop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
mcheckpoint = keras.callbacks.ModelCheckpoint("data/models/neural_network.h5")

In [47]:
# Training the model

history = neural_network_model.fit(pad_train, y_train, validation_data=(pad_test, y_test), epochs=10, callbacks=[earlystop, mcheckpoint])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [48]:
neural_network_model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 2095)]            0         
                                                                 
 embedding_3 (Embedding)     (None, 2095, 20)          2568340   
                                                                 
 lstm_3 (LSTM)               (None, 2095, 25)          4600      
                                                                 
 global_max_pooling1d_3 (Glo  (None, 25)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_6 (Dense)             (None, 32)                832       
                                                                 
 dense_7 (Dense)             (None, 1)                 33        
                                                           

In [50]:
neural_network_model.evaluate(pad_test, y_test)



[0.41043326258659363, 0.8165932893753052]