# Fase 0: Carga de datos

In [90]:
import pandas as pd
import nltk
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [57]:
# Fetch the NLTK resources the cleaning step relies on, so the notebook also
# runs on a fresh kernel/machine. nltk.download skips packages that are
# already up to date.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for the
# tokenizers - confirm against the installed NLTK version.
nltk.download('punkt')      # models behind nltk.sent_tokenize / nltk.word_tokenize
nltk.download('stopwords')  # English stopword list read by clean_tokens
nltk.download('wordnet')    # dictionary used by nltk.WordNetLemmatizer

[nltk_data] Downloading package wordnet to /home/alulab/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [111]:
# Only the first N_ROWS reviews are read; raise N_ROWS to use more of the file.
DATA_PATH = "Reviews.csv"
N_ROWS = 10000
df = pd.read_csv(DATA_PATH, nrows=N_ROWS)

In [112]:
# Build one document per review: "<Summary> <Text>".
# Missing values are replaced by an empty string first; concatenating a NaN
# (a float) with a str would otherwise raise TypeError.
summary_text = df['Summary'].fillna('').astype(str)
body_text = df['Text'].fillna('').astype(str)
# Object dtype stores plain Python strings instead of a fixed-width unicode
# array padded to the longest review.
docs = (summary_text + " " + body_text).to_numpy(dtype=object)
# Target labels: the review score column.
y = df['Score'].to_numpy()

# Fase 1: Limpieza de texto

In [124]:
# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
# Filled on first use so the stopword corpus is read and the lemmatizer is
# created once instead of once per document.
_CLEANING_RESOURCES = {}


def _get_cleaning_resources():
    """Return the cached ``(stopwords, lemmatizer)`` pair, building it on first use."""
    if not _CLEANING_RESOURCES:
        # A frozenset gives constant-time membership tests.
        _CLEANING_RESOURCES["stopwords"] = frozenset(nltk.corpus.stopwords.words('english'))
        _CLEANING_RESOURCES["lemmatizer"] = nltk.WordNetLemmatizer()
    return _CLEANING_RESOURCES["stopwords"], _CLEANING_RESOURCES["lemmatizer"]


def clean_tokens(text):
    """Normalise a raw document into a list of cleaned tokens.

    Steps, in order:
      1. lowercase the text;
      2. split it into sentences and then words with the NLTK tokenizers;
      3. drop tokens found in the NLTK English stopword list;
      4. delete every ``string.punctuation`` character from each token and
         discard tokens that become empty;
      5. lemmatize each token with ``nltk.WordNetLemmatizer``;
      6. keep only tokens whose length is between 3 and 19 characters.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    stopwords_eng, wnl = _get_cleaning_resources()

    tokens = [
        word
        for sent in nltk.sent_tokenize(text.lower())
        for word in nltk.word_tokenize(sent)
    ]
    # Stopwords are matched before punctuation is stripped, e.g. "n't" is
    # compared as-is and only afterwards reduced to "nt".
    tokens = [token for token in tokens if token not in stopwords_eng]
    tokens = [token.translate(_PUNCTUATION_TABLE) for token in tokens]
    # Tokens made only of punctuation are now empty and are skipped here.
    tokens = [wnl.lemmatize(token) for token in tokens if token]
    return [token for token in tokens if 2 < len(token) < 20]

In [125]:
# Show one review before and after cleaning as a quick sanity check.
i = 2
sample_review = docs[i]
print(sample_review)
print(clean_tokens(sample_review))

"Delight" says it all This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.
['delight', 'say', 'confection', 'around', 'century', 'light', 'pillowy', 'citrus', 'gelatin', 'nut', 'case', 'filbert', 'cut', 'tiny', 'square', 'liberally', 'coated', 'powdered', 'sugar', 'tiny', 'mouthful', 'heaven', 'chewy', 'flavorful', 'highly', 'recommend', 'yummy', 'treat', 'familiar', 'story', 'lewis', 'lion', 'witch', 'wardrobe', 'treat', 'seduces', 'edmund', 'selling', 'brother', 'sister', 'witch']


# Fase 2: Vectorizar

### TF-IDF

In [126]:
# TF-IDF features; clean_tokens is used as the tokenizer for every document.
# NOTE(review): clean_tokens already removes NLTK stopwords, so
# stop_words='english' applies a second stop list on top of it, which the
# bag-of-words features below do not get - confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(tokenizer=clean_tokens, stop_words='english')
X_fidf = tfidf_vectorizer.fit_transform(docs)
X_fidf.shape

(10000, 18912)

### Bag of words

In [127]:
# Run every document through clean_tokens and join the surviving tokens back
# into one space-separated string per document.
docs_clean = np.array([" ".join(clean_tokens(doc)) for doc in docs])
docs_clean.shape

(10000,)

In [128]:
# Bag-of-words features: term counts over the pre-cleaned documents, using
# CountVectorizer's default settings.
vectorizer = CountVectorizer()
X_bag = vectorizer.fit_transform(docs_clean)
# Displayed as the cell output: (number of documents, vocabulary size).
X_bag.shape

(10000, 19145)

# Fase 3: Entrenar

In [129]:
#Ejemplos de algoritmos que pueden probar (pueden usar otros si desean)
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [130]:
def run_model(clf, X, y, cv=5):
    """Cross-validate one classifier and print a one-line accuracy summary.

    Parameters
    ----------
    clf : estimator
        Classifier passed to ``cross_val_score``.
    X : feature matrix
        Passed through to ``cross_val_score`` unchanged.
    y : labels
        Passed through to ``cross_val_score`` unchanged.
    cv : optional
        Forwarded as the ``cv`` argument of ``cross_val_score``; defaults to 5.

    Returns
    -------
    The array of per-fold scores returned by ``cross_val_score``.
    """
    scores = cross_val_score(clf, X, y, cv=cv)
    # The printed spread is two standard deviations of the fold scores.
    print("%s accuracy: %0.2f (+/- %0.2f)" %
          (type(clf).__name__, scores.mean(), scores.std() * 2))
    return scores

In [131]:
def run_models(X, y, random_state=42):
    """Evaluate a fixed set of classifiers on ``(X, y)`` with ``run_model``.

    Parameters
    ----------
    X : feature matrix
        Forwarded to ``run_model`` for every classifier.
    y : labels
        Forwarded to ``run_model`` for every classifier.
    random_state : optional
        Seed handed to the estimators that are constructed with a
        ``random_state`` argument below, so repeated runs report the same
        accuracies. Pass ``None`` to leave them unseeded.
    """
    classifiers = [
        LinearSVC(random_state=random_state),
        SGDClassifier(random_state=random_state),
        Perceptron(random_state=random_state),
        PassiveAggressiveClassifier(random_state=random_state),
        BernoulliNB(),
        MultinomialNB(),
        KNeighborsClassifier(),
        NearestCentroid(),
        RandomForestClassifier(n_estimators=100, max_depth=10, random_state=random_state),
    ]
    # One summary line is printed per classifier, in the order listed above.
    for clf in classifiers:
        run_model(clf, X, y)

In [132]:
# Print each feature matrix's shape next to the label vector's shape so the
# row counts can be compared.
for feature_matrix in (X_fidf, X_bag):
    print(feature_matrix.shape, y.shape)

(10000, 18912) (10000,)
(10000, 19145) (10000,)


In [133]:
# Cross-validate every classifier on the TF-IDF feature matrix.
run_models(X_fidf, y)

LinearSVC accuracy: 0.67 (+/- 0.01)
SGDClassifier accuracy: 0.68 (+/- 0.02)
Perceptron accuracy: 0.61 (+/- 0.03)
PassiveAggressiveClassifier accuracy: 0.64 (+/- 0.02)
BernoulliNB accuracy: 0.61 (+/- 0.01)
MultinomialNB accuracy: 0.62 (+/- 0.00)
KNeighborsClassifier accuracy: 0.57 (+/- 0.03)
NearestCentroid accuracy: 0.56 (+/- 0.08)
RandomForestClassifier accuracy: 0.62 (+/- 0.00)


In [134]:
# Cross-validate every classifier on the bag-of-words (term count) matrix.
run_models(X_bag, y)

LinearSVC accuracy: 0.62 (+/- 0.03)
SGDClassifier accuracy: 0.63 (+/- 0.03)
Perceptron accuracy: 0.61 (+/- 0.02)
PassiveAggressiveClassifier accuracy: 0.64 (+/- 0.02)
BernoulliNB accuracy: 0.61 (+/- 0.02)
MultinomialNB accuracy: 0.64 (+/- 0.02)
KNeighborsClassifier accuracy: 0.59 (+/- 0.02)
NearestCentroid accuracy: 0.54 (+/- 0.05)
RandomForestClassifier accuracy: 0.62 (+/- 0.00)
