In [20]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, warnings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
from models import CleanData

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" filter also hides any warning raised while
# the models further down are being fitted; consider narrowing it to the
# specific warning categories that are known to be noise.
warnings.filterwarnings("ignore")

In [2]:
# Load the three tweet files in one go: the general set, the positive-only set
# and the negative-only set (bound to `test`, `positive`, `negative` in that order).
test, positive, negative = map(
    pd.read_csv,
    (
        "data/tweets_clean.csv",
        "data/tweets_pos_clean.csv",
        "data/tweets_neg_clean.csv",
    ),
)

In [3]:
# Label every tweet with its class: 0 for the positive set, 1 for the negative set.
# Assigning a scalar broadcasts it to every row, so there is no need to build a
# Python list with one element per tweet.
positive["Target"] = 0
negative["Target"] = 1

In [28]:
# Report how many rows each class contributes before the frames are combined.
print("Positives Tweets:", positive.shape[0])
print("Negative Tweets:", negative.shape[0])

Positives Tweets: 55056
Negative Tweets: 120948


In [4]:
# Stack the positive and negative frames into one labelled dataset.
# pd.concat appends the rows as-is (positive first, then negative) and
# ignore_index=True renumbers them 0..n-1. An outer merge on the shared columns
# only gives the same rows when no key matches across the two frames, and its
# row order is decided by the join's sorting policy rather than by input order,
# which matters because later cells look rows up by position.
df = pd.concat([positive, negative], ignore_index=True)
df

Unnamed: 0,Tweets,Target
0,Se imaginan a los chicos agradeciendo por el p...,0
1,"Eclesiastes4:9-12 ♡ Siempre, promesa :) https...",0
2,"@pedroj_ramirez Qué saborío, PJ. ya no compart...",0
3,Buenos dias para todos. Feliz inicio de semana...,0
4,"@pepedom @bquintero Gracias! No es así, deja c...",0
...,...,...
175999,Pero... Dime que no te perderé del todo :( ❤💛💚,1
176000,Yo creo que a Colocolo le hacía falta un parti...,1
176001,@seru15 son para niño :( quisiera quedarmelos.,1
176002,Diganle al sonidero que ya le baje a su desmad...,1


In [6]:
# Run the CleanData text-cleaning steps over the "Tweets" column, one full pass
# per step, in exactly this order. Each pass uses a fresh CleanData instance.
# NOTE(review): the recorded preview of the cleaned column still shows user
# handles (e.g. "pedroj_ramirez"), so remove_mentions_hashtags may be running
# too late in the sequence to recognise them -- confirm against CleanData.
cleaning_steps = (
    "remove_links",
    "clean_emojis",
    "remove_stopwords",
    "signs_tweets",
    "remove_doubles",
    "clean_laughs",
    "remove_mentions_hashtags",
)
for step_name in cleaning_steps:
    df["Tweets"] = df["Tweets"].apply(getattr(CleanData(), step_name))

In [7]:
# Persist the cleaned, labelled tweets; index=False keeps the row index out of the file.
df.to_csv("data/data_cleaned.csv", index=False)

In [8]:
# Display only the cleaned text column (as a one-column DataFrame).
df.loc[:, ["Tweets"]]

Unnamed: 0,Tweets
0,se imaginan chicos agradeciendo premio cara or...
1,eclesiastes siempre promesa {link}
2,pedroj_ramirez qué saborío pj compartes ginton...
3,buenos dias todos feliz inicio semana {link}
4,pepedom bquintero gracias no así deja claro a...
...,...
175999,pero dime perderé
176000,yo creo colocolo hacía falta partido así mas p...
176001,seru niño quisiera quedarmelos
176002,diganle sonidero baje desmadre


In [9]:
# Spot-check one cleaned tweet by its index label.
df.loc[170001, "Tweets"]

'jbartolomero cómo ves economía asiática hasta punto crees q afectará a da miedoo  buff'

In [10]:
# Bag-of-words token counts over unigrams and bigrams (ngram_range=(1, 2)).
# This single instance is passed as the "vect" step of every pipeline defined below.
vectorizer = CountVectorizer(ngram_range=(1,2))

In [14]:
#####################################################################################################################################

# Logistic Regression.
# The grid searches both "l1" and "l2" penalties. The default lbfgs solver only
# supports "l2", so the liblinear solver (which handles both) is selected here.
logistic_pipe = Pipeline([("vect", vectorizer), ("cls", LogisticRegression(solver="liblinear"))])

# max_df is expressed as a fraction of the documents, so every value must be a
# float in [0.0, 1.0]. The earlier grid contained 1.9 (out of range) and the
# integer 1 (an absolute count of one document, which is below every min_df
# value in this grid and therefore cannot be fitted).
logistic_params = {
    "vect__max_df": (0.5, 0.9, 1.0),
    "vect__min_df": (5, 10, 20, 50),
    "vect__max_features": (500, 1000),
    "cls__penalty": ["l1", "l2"],
    "cls__C": [0.1, 0.5, 1.0, 5.0],
}

# n_jobs=-1 spreads the candidate fits over all available cores; it does not change the scores.
log_reg = GridSearchCV(logistic_pipe, logistic_params, cv=5, scoring="accuracy", n_jobs=-1)

#####################################################################################################################################

# Decision Tree Classifier.
tree_pipe = Pipeline([("vect", vectorizer), ("cls", DecisionTreeClassifier())])

tree_params = {
    "cls__criterion": ["gini", "entropy"],
    "cls__max_depth": list(range(3, 12)),
    "cls__max_features": [2, 3],
}

tree = GridSearchCV(tree_pipe, tree_params, cv=5, scoring="accuracy", n_jobs=-1)

#####################################################################################################################################

# Linear Support Vector Machine.
# dual=False solves the primal problem, which is the formulation that supports
# the "l1" penalty (it supports "l2" as well), so both grid values can be fitted.
svc_pipe = Pipeline([("vect", vectorizer), ("cls", LinearSVC(dual=False))])

# LinearSVC has no "gamma" parameter (gamma belongs to kernel SVMs); keeping it
# in the grid makes the search fail when the parameter is set, so it is dropped.
svc_params = {
    "cls__C": [0.001, 0.1, 1, 10, 100],
    "cls__penalty": ["l1", "l2"],
}

svc = GridSearchCV(svc_pipe, svc_params, cv=5, scoring="accuracy", n_jobs=-1)

#####################################################################################################################################

# Random Forest Classifier: a plain pipeline with default hyper-parameters, no grid search.
forest = Pipeline([("vect", vectorizer), ("cls", RandomForestClassifier())])

#####################################################################################################################################

In [21]:
# Shuffle the tweets and their labels together (same permutation for both),
# with a fixed seed so the resulting order is reproducible between runs.
X, y = shuffle(df["Tweets"], df["Target"], random_state=24)

In [22]:
# Run the logistic-regression grid search on the shuffled tweets and labels.
log_reg.fit(X, y)

KeyboardInterrupt: 

In [None]:
# Run the decision-tree grid search on the shuffled tweets and labels.
tree.fit(X, y)

In [None]:
# Run the linear-SVM grid search on the shuffled tweets and labels.
svc.fit(X, y)

In [None]:
# Fit the random-forest pipeline (vectorizer + classifier, no grid search) on the shuffled data.
forest.fit(X, y)