## [75.06 / 95.58] Organización de Datos
## Trabajo Práctico 2: Competencia de Machine Learning
### Grupo 18: DATAVID-20

* 102732 - Bilbao, Manuel
* 101933 - Karagoz, Filyan
* 98684 - Markarian, Darío
* 100901 - Stroia, Lautaro

## 1. Importación general de librerias y set-up de datos.

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re as re
import os

#Para instalar gensim
#! pip3 install gensim
import gensim
from gensim.parsing.preprocessing import remove_stopwords

#Instalar tensorflow
#!pip3 install tensorflow
import tensorflow
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer

#Para XGBoost
#! pip3 install xgboost
from xgboost import XGBClassifier

import sklearn
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score



import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_rows = None #mostrar todas las filas del df
%matplotlib inline
plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
plt.rcParams['figure.figsize'] = (20, 10)
sns.set(style="whitegrid") # seteando tipo de grid en seaborn
pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs


### Limpieza de datos
**Definicion de funciones auxiliares**

In [2]:
#Eliminar numeros de un texto
def eliminar_numeros(text):
    return re.sub("\d+", "",text)

#Eliminar puntuacion
def eliminar_puntuacion(text):
    return re.sub(r'[^\w\s]','',text)

#Pasar letras a minusculas
def minusculas(text):
    return text.lower()

#Eliminar caracteres especiales
def eliminar_caracteres(text):
    return re.sub('[^a-zA-Z0-9 \n\.]', '',text)

#Eliminar urls
def eliminar_url(text):
    url_reg = re.compile(r'https?://\S+|www\.\S+')
    return url_reg.sub(r'',text)

### Carga de datos

In [3]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [4]:
for data in [test,train]:
    data['text'] = data['text'].apply(lambda x: eliminar_puntuacion(x))
    data['text'] = data['text'].apply(lambda x: minusculas(x))
    data['text'] = data['text'].apply(lambda x: eliminar_numeros(x))
    data['text'] = data['text'].apply(lambda x: eliminar_caracteres(x))
    data['text'] = data['text'].apply(lambda x: remove_stopwords(x))
    data['text'] = data['text'].apply(lambda x: eliminar_url(x))


In [5]:
#Vectorizacion
X_train,X_valid,y_train,y_valid = train_test_split(train.text, train.target, test_size=0.33, 
                                                  random_state=42)
print(X_train.shape)
print(X_valid.shape)

(5100,)
(2513,)


In [6]:
#Concateno los tweets
data_text = pd.concat([train.text,test.text])

#Tokenizacion, ya que para entrenar xgboost, necesita vectors con valores numericos
vector = CountVectorizer()
vector.fit(data_text)
X_train_vec = vector.transform(X_train)
X_valid_vec = vector.transform(X_valid)

#Entrenamiento del XGB Classifier
model = XGBClassifier()
model.fit(X_train_vec, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [7]:
#Predicciones sobre set de training
pred1 = model.predict(X_valid_vec)
print("Accuracy: ", np.round(accuracy_score(y_valid,pred1),5))
print("F1 Score :", np.round(f1_score(y_valid, pred1),5))

pred2 = model.predict(X_train_vec)
print("Accuracy: ", np.round(accuracy_score(y_train,pred2),5))
print("F1 Score :", np.round(f1_score(y_train, pred2),5))

Accuracy:  0.78034
F1 Score : 0.70291
Accuracy:  0.83118
F1 Score : 0.77781


In [8]:
#Predicciones con el set de TEST y submit -> score 0.777 en kaggle
test_vec = vector.transform(test.text)
y_pred = model.predict(test_vec)

submit = pd.DataFrame(test['id'])
submit['target'] = y_pred
#submit.to_csv('submission-xgboost.csv',index=False)

### Ahora probaremos si cambia el score utilizando la columna de Keyword

In [9]:
#Voy a limpiar la columna de keyword, sabiendo del tp anterior que el unico caracter
#especial a borrar es %20, y voy a eliminar aquellos valores nulos
train2 = train.copy()
test2 = test.copy()
test2['keyword'] = test2['keyword'].fillna('unknown').apply(lambda x: re.sub(r'%20',' ', str(x)))
train2['keyword'] = train2['keyword'].fillna('unknown').apply(lambda x: re.sub(r'%20',' ', str(x)))

train2['combined_text'] = train2['text'] +' '+ train2['keyword']
test2['combined_text'] = test2['text'] +' '+ test2['keyword']

In [10]:
#Vectorizacion
X_train,X_test,y_train,y_test = train_test_split(train2['combined_text'], train2.target, test_size=0.33, 
                                                  random_state=42)
print(X_train.shape)
print(X_test.shape)

(5100,)
(2513,)


In [11]:
#Concateno los tweets y keywords
data_text = pd.concat([train2.combined_text,test2.combined_text])

#Tokenizacion, ya que para entrenar xgboost, necesita vectors con valores numericos
vector = CountVectorizer()
vector.fit(data_text)
X_train_vec = vector.transform(X_train)
X_test_vec = vector.transform(X_test)

#Entrenamiento del XGB Classifier
model1 = XGBClassifier()
model1.fit(X_train_vec, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [12]:
#Predicciones sobre set de training
pred1 = model1.predict(X_test_vec)
print("Accuracy: ", np.round(accuracy_score(y_valid,pred1),5))
print("F1 Score :", np.round(f1_score(y_test, pred1),5))

pred2 = model1.predict(X_train_vec)
print("Accuracy: ", np.round(accuracy_score(y_train,pred2),5))
print("F1 Score :", np.round(f1_score(y_train, pred2),5))

Accuracy:  0.77477
F1 Score : 0.69862
Accuracy:  0.83294
F1 Score : 0.77996


In [13]:
#Predicciones con el set de TEST y submit -> score 0.76 en kaggle
test_vec = vector.transform(test2.text)
#y_pred = model.predict(test_vec)

#submit = pd.DataFrame(test['id'])
#submit['target'] = y_pred
#submit.to_csv('submission-xgboost-keywords.csv',index=False)

## Algo de feature engineering

Vamos a agregar columnas numericas como por ejemplo: longitud en caracteres de cada tweet, cantidad de palabras en cada tweet, valor binario que indique si la longitud de ese tweet supera o no el promedio de longitud de tweets

In [14]:
test_features = test.copy()
train_features = train.copy()

for data in [test_features, train_features]:
    data['tweet_len'] = data['text'].str.len()
    data['qty_strings'] = data['text'].apply(lambda x: len(str(x).split()))
    data['len_gt_mean'] = (data['tweet_len'] > data['tweet_len'].mean()).astype(int)
    
X2 = train_features[['text','tweet_len','qty_strings','len_gt_mean']]
y2 = train_features.target
test_features_notarget = test_features[['text','tweet_len','qty_strings','len_gt_mean']]

In [25]:
#Concateno los df de test y train
data_text2 = pd.concat([X2,test_features_notarget])

#Separo los datasets en test y train
X_train2,X_valid2,y_train2,y_valid2 = train_test_split(X2, y2, test_size=0.33, 
                                                  random_state=42)

#Tokenizacion
vector2 = CountVectorizer()
vector2 = vector2.fit(data_text2.text)

#Convierto los sparse matrix que devuelve el vectorizer a dataframe asi puedo agregarle las columnas con 
#los features numericos
X_train_vec2 = pd.DataFrame.sparse.from_spmatrix(vector2.transform(X_train2.text))
X_test_vec2 = pd.DataFrame.sparse.from_spmatrix(vector2.transform(X_valid2.text))

X_train_vec2['tweet_len'] = X_train2['tweet_len']
X_train_vec2['qty_strings'] = X_train2['qty_strings']
X_train_vec2['len_gt_mean'] = X_train2['len_gt_mean']

X_test_vec2['tweet_len'] = X_valid2['tweet_len']
X_test_vec2['qty_strings'] = X_valid2['qty_strings']
X_test_vec2['len_gt_mean'] = X_valid2['len_gt_mean']

#Entrenamiento del XGB Classifier
model2 = XGBClassifier()
model2.fit(X_train_vec2, y_train2)


XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.9, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='reg:logistic', random_state=0, reg_alpha=10,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [26]:
#Predicciones sobre set de training
pred1 = model2.predict(X_test_vec2)
print("Accuracy: ", np.round(accuracy_score(y_valid2,pred1),5))
print("F1 Score :", np.round(f1_score(y_valid2, pred1),5))

pred2 = model2.predict(X_train_vec2)
print("Accuracy: ", np.round(accuracy_score(y_train2,pred2),5))
print("F1 Score :", np.round(f1_score(y_train2, pred2),5))

Accuracy:  0.7306
F1 Score : 0.60059
Accuracy:  0.76333
F1 Score : 0.66313


In [27]:
#Predicciones sobre test y submit a kaggle -> score 0.76555 en kaggle con parametros default, 0.73 con 100 arboles
#de decision, tasa de aprendizaje de 0.9, maxima profundidad de cada arbol de 7.
test_vec2 = pd.DataFrame.sparse.from_spmatrix(vector2.transform(test_features.text))
test_vec2['tweet_len'] = test_features_notarget['tweet_len']
test_vec2['qty_strings'] = test_features_notarget['qty_strings']
test_vec2['len_gt_mean'] = test_features_notarget['len_gt_mean']
                                             
prediccion = model2.predict(test_vec2)

submit = pd.DataFrame(test_features['id'])
submit['target'] = prediccion


In [29]:
#submit.to_csv('SUBMITS/submission-xgboost-features2.csv',index=False)