# Aprendizaje Supervisado

En el aprendizaje supervisado se pueden entrenar dos tipos de modelos:
- Clasificación
- Regresión

Vamos a utilizar 3 algoritmos de clasificación:
- Regresión Logística
- Naive Bayes
- KNN

## Regresión Logística

In [1]:
# Librerías
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from pylab import *
import nltk
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Load the working dataset: Amazon reviews of musical instruments.
# The file stores one JSON record per line, hence lines=True.
review_data = pd.read_json(
    'reviews_Musical_Instruments_5.json',
    lines=True,
)

# Preview the review text and star rating, the two raw columns used below.
review_data[['reviewText', 'overall']].head()

Unnamed: 0,reviewText,overall
0,"Not much to write about here, but it does exac...",5
1,The product does exactly as it should and is q...,5
2,The primary job of this device is to block the...,5
3,Nice windscreen protects my MXL mic and preven...,5
4,This pop filter is great. It looks and perform...,5


In [3]:
lemmatizer = WordNetLemmatizer()


def _clean_review(raw_text):
    """Return the review as a string of lowercase, lemmatized tokens.

    Every run of characters that are neither whitespace nor word
    characters (plus underscores) is replaced by a single space before
    tokenizing, and the cleaned tokens are joined back with spaces.
    """
    stripped = re.sub(r'([^\s\w]|_)+', ' ', str(raw_text))
    lemmas = []
    for token in word_tokenize(stripped):
        # NOTE(review): lemmatize() is called without a part-of-speech
        # argument; the recorded preview shows "does" -> "doe" and
        # "as" -> "a", so verbs and function words are not handled well.
        lemmas.append(lemmatizer.lemmatize(token.lower()))
    return ' '.join(lemmas)


# Store the cleaned text next to the raw review text.
review_data['cleaned_text'] = review_data['reviewText'].apply(_clean_review)

review_data[['cleaned_text', 'reviewText', 'overall']].head()

Unnamed: 0,cleaned_text,reviewText,overall
0,not much to write about here but it doe exactl...,"Not much to write about here, but it does exac...",5
1,the product doe exactly a it should and is qui...,The product does exactly as it should and is q...,5
2,the primary job of this device is to block the...,The primary job of this device is to block the...,5
3,nice windscreen protects my mxl mic and preven...,Nice windscreen protects my MXL mic and preven...,5
4,this pop filter is great it look and performs ...,This pop filter is great. It looks and perform...,5


In [5]:
# Build a TF-IDF representation of every cleaned review, limited to
# 500 features, and keep it in a DataFrame with one column per term.
tfidf_model = TfidfVectorizer(max_features=500)
tfidf_matrix = tfidf_model.fit_transform(review_data['cleaned_text'])

# Column labels are the vocabulary terms in sorted order.
# NOTE(review): this assumes the vectorizer assigns feature indices in
# sorted term order -- confirm for the scikit-learn version in use.
tfidf_df = pd.DataFrame(
    tfidf_matrix.todense(),
    columns=sorted(tfidf_model.vocabulary_),
)
tfidf_df.head()

Unnamed: 0,10,100,12,20,34,able,about,accurate,acoustic,actually,...,won,work,worked,worth,would,wrong,year,yet,you,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.159684,0.0,0.0,0.0,...,0.0,0.134327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.085436,0.0,0.0,0.0,0.0,0.0,0.0,0.067074,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.115312,0.0,0.0,0.0,0.07988,0.111989
3,0.0,0.0,0.0,0.0,0.0,0.339573,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.303608,0.0


In [6]:
# Add a binary target column for classification.
# Only 5-star reviews fall in class 1; every review rated 4 stars or
# lower falls in class 0 (so 4-star reviews are class 0, not class 1).
def _rating_to_class(stars):
    """Map a star rating to 0 when it is <= 4, otherwise to 1."""
    if stars <= 4:
        return 0
    return 1


review_data['target'] = review_data['overall'].apply(_rating_to_class)

# Show how many reviews ended up in each class.
review_data['target'].value_counts()

1    6938
0    3323
Name: target, dtype: int64

In [7]:
# With the dataset prepared, fit a logistic regression classifier on the
# TF-IDF features against the binary target.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(tfidf_df, review_data['target'])

# NOTE: predictions are made on the same rows the model was fitted on,
# so they describe fit on the training data, not held-out performance.
predicted_labels = logreg.predict(tfidf_df)

# Display the second probability column (index 1) for every review.
logreg.predict_proba(tfidf_df)[:, 1]

array([0.57146961, 0.68579907, 0.56068939, ..., 0.65979968, 0.5495679 ,
       0.21186011])

In [8]:
# Inspect the logistic regression predictions with a pandas
# cross-tabulation (rows: actual target, columns: predicted label).
review_data['predicted_labels'] = predicted_labels
pd.crosstab(
    index=review_data['target'],
    columns=review_data['predicted_labels'],
)

predicted_labels,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1543,1780
1,626,6312


## Naive Bayes Gaussiano (GaussianNB)

In [9]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the TF-IDF features.
nb = GaussianNB().fit(tfidf_df, review_data['target'])

# NOTE: predictions are made on the same rows used for fitting.
predicted_labels_nb = nb.predict(tfidf_df)

# Display the second probability column (index 1) for every review.
nb.predict_proba(tfidf_df)[:, 1]

array([9.97730158e-01, 3.63599675e-09, 9.45692105e-07, ...,
       2.46001047e-02, 3.43660991e-08, 1.72767906e-27])

In [11]:
# Inspect the Naive Bayes predictions with a pandas cross-tabulation
# (rows: actual target, columns: predicted label).
review_data['predicted_labels_nb'] = predicted_labels_nb
pd.crosstab(
    index=review_data['target'],
    columns=review_data['predicted_labels_nb'],
)

predicted_labels_nb,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2333,990
1,2380,4558


## KNN (K-Nearest Neighbors)

In [12]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier that uses 3 neighbours.
knn = KNeighborsClassifier(n_neighbors=3).fit(tfidf_df, review_data['target'])

# NOTE: predictions are made on the same rows used for fitting, so each
# review is presumably among its own nearest neighbours -- treat the
# table below as a training-set result, not held-out performance.
review_data['predicted_labels_knn'] = knn.predict(tfidf_df)

# Cross-tabulate actual target (rows) against predicted label (columns).
pd.crosstab(
    index=review_data['target'],
    columns=review_data['predicted_labels_knn'],
)

predicted_labels_knn,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2681,642
1,333,6605
