# Construyendo un Bag of Words

## Importar librerías

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

## Leer Dataset

In [2]:
# Display full cell contents instead of truncating long text columns.
# None is the supported "no limit" value; the legacy -1 sentinel has been
# deprecated since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
# Load the dataset from the working directory.
df = pd.read_csv("data_lemmatized.csv")
# Target column, passed to train_test_split as the labels.
y = df['sentiment']

## Crear conjunto de entrenamiento y pruebas

### variables de salida (listas)
- **x_train**: conjunto de entrenamiento, solo con los 'features'. Se le conocen como 'features' o 'variables' a las columnas distintas de la que se quiere predecir ('sentiment'); aquí es la columna de texto 'data_lemmatized'
- **x_test**: conjunto de prueba, solo con los 'features'
- **y_train**: conjunto de entrenamiento, solo con la columna que se quiere predecir ('sentiment'). A esa columna se le llama 'labels'
- **y_test**: conjunto de prueba, solo con los 'labels'
--------------------------------------------------------
### parametros
- **test_size**: proporción de todo el dataset que se reserva para el conjunto de prueba (0.33 = 33 %); el resto se utiliza como conjunto de entrenamiento
- **random_state**: es la semilla de aleatoriedad. Permite revolver las filas antes de dividirlas, pero siempre dando el mismo resultado

In [3]:
# Split the text column and its labels into train/test partitions.
# train_test_split returns (train, test) for each input array in order, so
# x_* hold the documents and y_* hold the matching 'sentiment' labels.
x_train, x_test, y_train, y_test = train_test_split(
    df['data_lemmatized'],
    y,
    test_size=0.33,    # 33% of the rows go to the test set
    random_state=53,   # fixed seed so the shuffle is reproducible
)

## Transformar textos a Vectores de Bag of Words 

### Inicialización de la clase CountVectorizer
Aplica preprocesamiento removiendo stopwords en inglés (`stop_words='english'`)

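`.values.astype('U')`: convierte cada documento a cadena Unicode antes de vectorizar, para que los valores faltantes (NaN) no provoquen el error `ValueError: np.nan is an invalid document`. Referencia:
https://stackoverflow.com/questions/39303912/tfidfvectorizer-in-scikit-learn-valueerror-np-nan-is-an-invalid-document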

In [4]:
# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words.
count_vectorizer = CountVectorizer(stop_words='english')

### Fit y Transform

In [5]:
# Training set: learn the vocabulary from x_train and build its count matrix.
# .values.astype('U') casts every document to a unicode string first
# (see the StackOverflow link in the markdown above about np.nan documents).
count_train = count_vectorizer.fit_transform(x_train.values.astype('U'))

In [6]:
# Test set: transform only, reusing the vocabulary fitted on the training set.
count_test = count_vectorizer.transform(x_test.values.astype('U'))

In [7]:
# Show the first 30 vocabulary terms learned by the CountVectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns an ndarray, so .tolist() keeps the printed
# output a plain Python list of strings.
print(count_vectorizer.get_feature_names_out()[:30].tolist())

['aa', 'aadavantage', 'aadfw', 'aadv', 'aadvantage', 'aal', 'aand', 'aano', 'aas', 'aaso', 'aaus', 'ab', 'aback', 'abandon', 'abandonment', 'abassinet', 'abbreve', 'abc', 'abcdef', 'abcs', 'abduct', 'abilities', 'ability', 'able', 'aboard', 'aboout', 'abound', 'abq', 'abroad', 'absolute']


# Construyendo un TfidfVectorizer

In [8]:
# Importar TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Initialize the TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",  # drop the built-in English stop words
    max_df=0.7,            # ignore terms found in more than 70% of documents
)

In [10]:
# Fit on the training documents and transform them: tfidf_train
tfidf_train = tfidf_vectorizer.fit_transform(x_train.values.astype('U'))

# Transform the test documents with the already-fitted vectorizer: tfidf_test
tfidf_test = tfidf_vectorizer.transform(x_test.values.astype('U'))

In [11]:
# Print the first 10 features.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns an ndarray, so .tolist() keeps the printed
# output a plain Python list of strings.
print(tfidf_vectorizer.get_feature_names_out()[:10].tolist())

['aa', 'aadavantage', 'aadfw', 'aadv', 'aadvantage', 'aal', 'aand', 'aano', 'aas', 'aaso']


In [12]:
# Print the first 5 vectors of tfidf_train.
# Slice the sparse matrix before densifying so only 5 rows are materialized,
# and use .toarray() because the .A shorthand is deprecated in recent SciPy.
print(tfidf_train[:5].toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.23671055 0.         0.         ... 0.         0.         0.        ]]


# Inspeccionando los vectores (Bag of Words, Tfidf)

In [13]:
# Create the CountVectorizer DataFrame: count_df
# One row per training document, one column per vocabulary term.
# .toarray() replaces the deprecated .A shorthand, and get_feature_names_out()
# replaces get_feature_names(), which was removed in scikit-learn 1.2.
# NOTE: this densifies the whole matrix; fine for inspection, costly on large corpora.
count_df = pd.DataFrame(
    count_train.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

In [14]:
# Create the TfidfVectorizer DataFrame: tfidf_df
# One row per training document, one column per vocabulary term.
# .toarray() replaces the deprecated .A shorthand, and get_feature_names_out()
# replaces get_feature_names(), which was removed in scikit-learn 1.2.
# NOTE: this densifies the whole matrix; fine for inspection, costly on large corpora.
tfidf_df = pd.DataFrame(
    tfidf_train.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [15]:
# Print the head of count_df (first rows of the per-document term counts).
print(count_df.head())

   aa  aadavantage  aadfw  aadv  aadvantage  aal  aand  aano  aas  aaso  ...  \
0  0   0            0      0     0           0    0     0     0    0     ...   
1  0   0            0      0     0           0    0     0     0    0     ...   
2  0   0            0      0     0           0    0     0     0    0     ...   
3  0   0            0      0     0           0    0     0     0    0     ...   
4  1   0            0      0     0           0    0     0     0    0     ...   

   yyzua  zabsonre  zambia  zero  zip  zipper  zone  zoom  zurich  zurichnew  
0  0      0         0       0     0    0       0     0     0       0          
1  0      0         0       0     0    0       0     0     0       0          
2  0      0         0       0     0    0       0     0     0       0          
3  0      0         0       0     0    0       0     0     0       0          
4  0      0         0       0     0    0       0     0     0       0          

[5 rows x 7513 columns]


In [16]:
# Print the head of tfidf_df (first rows of the per-document TF-IDF weights).
print(tfidf_df.head())

         aa  aadavantage  aadfw  aadv  aadvantage  aal  aand  aano  aas  aaso  \
0  0.000000  0.0          0.0    0.0   0.0         0.0  0.0   0.0   0.0  0.0    
1  0.000000  0.0          0.0    0.0   0.0         0.0  0.0   0.0   0.0  0.0    
2  0.000000  0.0          0.0    0.0   0.0         0.0  0.0   0.0   0.0  0.0    
3  0.000000  0.0          0.0    0.0   0.0         0.0  0.0   0.0   0.0  0.0    
4  0.236711  0.0          0.0    0.0   0.0         0.0  0.0   0.0   0.0  0.0    

   ...  yyzua  zabsonre  zambia  zero  zip  zipper  zone  zoom  zurich  \
0  ...  0.0    0.0       0.0     0.0   0.0  0.0     0.0   0.0   0.0      
1  ...  0.0    0.0       0.0     0.0   0.0  0.0     0.0   0.0   0.0      
2  ...  0.0    0.0       0.0     0.0   0.0  0.0     0.0   0.0   0.0      
3  ...  0.0    0.0       0.0     0.0   0.0  0.0     0.0   0.0   0.0      
4  ...  0.0    0.0       0.0     0.0   0.0  0.0     0.0   0.0   0.0      

   zurichnew  
0  0.0        
1  0.0        
2  0.0        
3  0.0  

In [17]:
# Columns present in count_df but absent from tfidf_df: difference
# (one-directional check; an empty set means tfidf_df has every count_df column)
difference = set(count_df.columns).difference(tfidf_df.columns)
print(difference)

set()


In [18]:
# Check whether the DataFrames are equal.
# DataFrame.equals compares values cell by cell, so raw counts vs. TF-IDF
# weights are expected to differ even though the columns match.
print(count_df.equals(tfidf_df))

False


# Training and testing a classification model with Scikit-learn

In [None]:
## WIP