In [None]:
# Shell escape: list the contents of the Drive folder referenced by the data path below.
!ls "drive/MyDrive/Diplomatura Data Science"

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

# Remove pandas' display limit on the number of columns shown.
pd.options.display.max_columns = None
# NOTE(review): with the row limit removed as well, displaying a large frame or
# series prints every row.
pd.options.display.max_rows = None

In [None]:
# Path of the tab-separated reviews file and the columns to read from it.
yelp_path = 'drive/MyDrive/Diplomatura Data Science/reviews_yelp_nn.csv'
cols = ['business_id', 'cool', 'date', 'funny', 'review_id', 'stars', 'text', 'useful', 'user_id']

#def date_parser(serie):
#    return serie.astype(datetime64)

print('----Iniciando carga del dataset----')

# Earlier read_csv attempts are kept below as commented-out alternatives.
#df = pd.read_csv(yelp_path,sep='\t',header=0)
#df = pd.read_csv(yelp_path,sep='\t',header=0,low_memory=False)
# Read only the columns in `cols`; the first row of the file is the header.
# NOTE(review): the following cell reads the same file into `df` again (with a
# converter for 'cool'); confirm whether both loads are needed.
df = pd.read_csv(yelp_path,sep='\t',header=0,usecols=cols)
#df = pd.read_csv(yelp_path,sep='\t',usecols=cols,date_parser= date_parser,parse_dates=['date'])
#df = pd.read_csv(yelp_path,sep='\t',header=0,dtype={"business_id": "string", "cool": float, "date": "string", "funny": float, "review_id": "string", "stars": float, "text": "string", "useful": float, "user_id": "string"})

print('----Carga finalizada----')


In [None]:
# Columns to load from the reviews file, followed by the file location.
cols = [
    'business_id', 'cool', 'date', 'funny', 'review_id',
    'stars', 'text', 'useful', 'user_id',
]
yelp_path = 'drive/MyDrive/Diplomatura Data Science/reviews_yelp_nn.csv'

def conv(val):
    """Convert a raw CSV cell to ``np.float64``, falling back to zero.

    Parameters
    ----------
    val
        Raw cell value handed over by ``pd.read_csv`` through ``converters``.

    Returns
    -------
    np.float64
        The parsed number, or ``np.float64(0)`` when ``val`` is empty/falsy or
        cannot be converted.
    """
    zero = np.float64(0)
    if not val:
        # Same type as every other return path (the original returned int 0 here).
        return zero
    try:
        return np.float64(val)
    except (TypeError, ValueError):
        # Only conversion failures mean "no usable value"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return zero

print('----Iniciando carga del dataset----')

# Read only the columns in `cols`, passing every 'cool' cell through `conv`.
read_options = dict(sep='\t', header=0, usecols=cols, converters={'cool': conv})
df = pd.read_csv(yelp_path, **read_options)

print('----Carga finalizada----')


In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Non-null count per column.
df.count()

In [None]:
# Summary of columns, non-null counts and dtypes.
df.info()

In [None]:
# Column labels.
df.columns

In [None]:
# dtype of each column.
df.dtypes

In [None]:
# First rows of the frame.
df.head()

Chequeamos si hay datos faltantes

In [None]:
# True when at least one column has a missing value.
df.isna().sum().any()

In [None]:
# Per column: does it contain any missing value?
df.isna().sum() > 0

In [None]:
# Heatmap of the missing-value mask over every column. The previous slice
# df.columns[0:8] covered only eight of the nine loaded columns, so the last
# one was silently left out of the plot.
sns.heatmap(df.isnull())

In [None]:
# Number of missing values in one column; the cells below repeat the same
# check for the remaining columns.
df['cool'].isna().sum()

In [None]:
df['date'].isna().sum()

In [None]:
df['funny'].isna().sum()

In [None]:
df['review_id'].isna().sum()

In [None]:
df['stars'].isna().sum()

In [None]:
df['text'].isna().sum()

In [None]:
df['useful'].isna().sum()

In [None]:
df['user_id'].isna().sum()

In [None]:
# Absolute frequency of each distinct 'cool' value, NaN included.
df['cool'].value_counts(normalize=False,dropna=False)

In [None]:
# Frequency of the distinct rows whose 'cool' value is negative.
df[df.cool<0].value_counts()

In [None]:
# Absolute frequency of each distinct 'funny' value, NaN included.
df['funny'].value_counts(normalize=False,dropna=False)

In [None]:
# Absolute frequency of each distinct 'stars' value, NaN included.
df['stars'].value_counts(normalize=False,dropna=False)

In [None]:
# Absolute frequency of each distinct 'useful' value, NaN included.
df['useful'].value_counts(normalize=False,dropna=False)

In [None]:
# Frequency of the distinct rows whose 'useful' value is negative.
df[df.useful<0].value_counts()

Chequeamos qué columnas contienen un mismo dato en todas las filas

In [None]:
# Names of the columns that hold exactly one distinct (non-null) value.
[name for name, distinct in df.nunique().items() if distinct == 1]

Chequeamos si hay datos duplicados

In [None]:
# True when at least one row is an exact duplicate of an earlier row.
df.duplicated().any()

In [None]:
# For each numeric column, print how many rows lie above its 95th percentile.
for col in df.select_dtypes('number').columns:
    above_p95 = df[col] > df[col].quantile(0.95)
    # Series.sum() counts the True values in vectorised form; the builtin
    # sum() walked the whole column element by element in Python.
    print(col, above_p95.sum())

Limpieza de datos

In [None]:
# We can delete every row with missing data because the amount is not significant.
df.dropna(inplace=True)
#df.dropna(subset=['stars'], inplace=True)

In [None]:
# Remove, in place, the rows whose 'cool' value is negative.
df.drop(df[df.cool < 0].index, inplace=True)

In [None]:
# Remove, in place, the rows whose 'useful' value is negative.
df.drop(df[df.useful < 0].index, inplace=True)

In [None]:
# Frame summary after the cleaning steps above.
df.info()

In [None]:
# Descriptive statistics of the frame.
df.describe()

In [None]:
# Mean of the numeric columns for each star rating. numeric_only=True is needed
# on pandas >= 2.0, where mean() no longer drops the text columns silently and
# raises TypeError instead.
stval = df.groupby('stars').mean(numeric_only=True)
stval

In [None]:
# Pairwise correlation between the per-star means.
stval.corr()

In [None]:
# Same correlation matrix as an annotated heatmap.
sns.heatmap(data=stval.corr(), annot=True)

En este punto definimos con qué columnas vamos a continuar trabajando.

Para nuestro proyecto, las más relevantes pueden ser:
'review_id', 'stars', 'text', 'date'

In [None]:
# Drop, in place, the columns that are not used from here on.
unused_columns = ['business_id', 'cool', 'funny', 'useful', 'user_id']
df.drop(labels=unused_columns, axis='columns', inplace=True)

In [None]:
# Frame summary after dropping the unused columns.
df.info()

Chequeamos la distribución de los datos de la columna STARS para definir el TARGET

In [None]:
# Bar chart with the share of reviews for each star rating.
fig, ax = plt.subplots(figsize=(6, 4))
title = fig.suptitle("Rating distribution for stars", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.3)

ax.set_xlabel("Rating")
ax.set_ylabel("Proportion")
ax.tick_params(axis='both', which='major', labelsize=8.5)

# (ratings, shares) pair, ordered as value_counts returns it.
star_share = df['stars'].value_counts(normalize=True)
w_q = (list(star_share.index), list(star_share.values))
bar = ax.bar(w_q[0], w_q[1], color='steelblue')

In [None]:
def get_target(star):
    """Return 'positive' when `star` is greater than 3, otherwise 'negative'."""
    if star > 3:
        return 'positive'
    return 'negative'

In [None]:
# Label every review from its star rating.
df['target'] = df['stars'].apply(get_target)

In [None]:
# Last rows of the frame, now including the 'target' column.
df.tail()

In [None]:
# Absolute frequency of each target label, NaN included.
df['target'].value_counts(normalize=False,dropna=False)

In [None]:
# Share of each sentiment label. value_counts orders the labels by frequency,
# so the slice order depends on the data.
target = df['target'].value_counts(normalize=True)

# Look up label and colour by class name for each slice; positional tuples
# would be swapped whenever 'negative' is the most frequent class.
label_by_class = {'positive': 'Positivo', 'negative': 'Negativo'}
color_by_class = {'positive': 'lightblue', 'negative': 'yellow'}
etiquetas = tuple(label_by_class[name] for name in target.index)
colores = tuple(color_by_class[name] for name in target.index)
# Pull every slice after the first one slightly out of the pie; sized from the
# data so a single-class frame does not break the `explode` length check.
desplaz = tuple(0.1 if position else 0 for position in range(len(target)))

plt.title('Sentiment feedback')
plt.pie(target,explode=desplaz,labels=etiquetas,colors=colores,autopct='%.1f%%');

Para no trabajar con un volumen de datos tan grande, preparamos una muestra en base a los pesos específicos de cada estrella

In [None]:
print('Proporcion por cada cada tipo de estrella:')
# Rows per star value divided by the number of non-null star values.
rows_per_star = df.groupby(['stars'])['stars'].count()
rows_per_star / df['stars'].count().sum()

In [None]:
# One {'id': star, 'weight': value} entry per star rating.
weights = [
    {'id': star, 'weight': value}
    for star, value in (
        (1, 0.149914),
        (2, 0.081144),
        (3, 0.110502),
        (4, 0.219540),
        (5, 0.438900),
    )
]

def getWeight(rate):
    """Return the weight of the first entry whose id equals `rate`, or None if none matches."""
    matching = (entry['weight'] for entry in weights if entry['id'] == rate)
    return next(matching, None)
        

# Attach to every row the weight registered for its star rating.
df['weights'] = df['stars'].apply(getWeight)

In [None]:
# First rows, now including the 'weights' column.
df.head()

In [None]:
# Uniform random sample: every row is equally likely, so the star proportions
# of `df` are preserved in expectation. Passing weights='weights' made each row
# carry its own class share as selection weight, so a class was drawn in
# proportion to (rows in class) x (class share), i.e. its share squared, which
# over-represents the most frequent ratings.
# The size is capped so a frame with fewer than 100000 rows does not raise.
sample = df.sample(n=min(100000, len(df)), random_state=1)
sample.head()

In [None]:
# Validate the star distribution of the sample.

ratings = sample['stars'] # data from the sample

fig, ax = plt.subplots(figsize=(6, 4))
title = fig.suptitle("Distribucion de estrellas en la muestra", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.3)

ax.set_xlabel("Rating")
ax.set_ylabel("Proportion")
ax.tick_params(axis='both', which='major', labelsize=8.5)

# (ratings, shares) pair, ordered as value_counts returns it.
sample_share = ratings.value_counts(normalize=True)
w_q = (list(sample_share.index), list(sample_share.values))
bar = ax.bar(w_q[0], w_q[1], color='steelblue')

Calculamos la longitud del texto

In [None]:
# len() of every review text, stored as a new column.
sample['length'] = sample['text'].map(len)
sample.head()

In [None]:
# One histogram of text length (25 bins) per star rating, side by side.
graph = sns.FacetGrid(data=sample,col='stars')
graph.map(plt.hist,'length',bins=25,color='blue')
plt.show()

In [None]:
# Box plot of text length for each star rating.
sns.boxplot(x='stars', y='length', data=sample)

Vectorización

In [None]:
import re
import nltk
nltk.download('stopwords')
from wordcloud import WordCloud
from nltk.corpus import stopwords

In [None]:
# Cache for the default stop-word set, filled on first use.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_prep(text, stop_words=None):
    """Lowercase `text`, keep only a-z and whitespace, and drop stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        # Previously stopwords.words('english') was rebuilt for every token.
        stop_words = _english_stopwords()
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return ' '.join(word for word in letters_only.split() if word not in stop_words)

In [None]:
# CountVectorizers for unigrams, bigrams and trigrams, all sharing one token
# pattern (runs of one or more word characters).
token_regex = r'(?u)\b\w+\b'
bow_converter, bigram_converter, trigram_converter = (
    CountVectorizer(ngram_range=(size, size), token_pattern=token_regex)
    for size in (1, 2, 3)
)

#bow_converter = CountVectorizer(analyzer=text_prep) 
#bigram_converter = CountVectorizer(ngram_range=(2,2), analyzer=text_prep)
#trigram_converter = CountVectorizer(ngram_range=(3,3), analyzer=text_prep)

In [None]:
# Draw 10000 rows at random with a fixed seed; this rebinds `sample`.
sample = df.sample(n=10000, random_state=1)

In [None]:
# First rows of the new sample.
sample.head()

In [None]:
# Inputs are the review texts; labels are the star ratings.
X = sample['text']
y = sample['stars']

In [None]:
#print(sample['text'].iloc[0])
# First review text of the sample, as stored.
print(X.iloc[0])

In [None]:
# The same review after text_prep.
text_prep(X.iloc[0])

In [None]:
# Aplicamos Fit al transformador
# Fit the unigram vectorizer on the sample texts (cast to a unicode array).
bow_converter.fit(X.values.astype('U'))
# get_feature_names() was removed in scikit-learn 1.2. Its replacement returns
# an ndarray, converted here so `words` stays a plain list of str.
words = bow_converter.get_feature_names_out().tolist()

In [None]:
# Fit the bigram vectorizer on the sample texts (cast to a unicode array).
bigram_converter.fit(X.values.astype('U'))
# get_feature_names() was removed in scikit-learn 1.2. Its replacement returns
# an ndarray, converted here so `bigrams` stays a plain list of str.
bigrams = bigram_converter.get_feature_names_out().tolist()

In [None]:
# Fit the trigram vectorizer on the sample texts (cast to a unicode array).
trigram_converter.fit(X.values.astype('U'))
# get_feature_names() was removed in scikit-learn 1.2. Its replacement returns
# an ndarray, converted here so `trigrams` stays a plain list of str.
trigrams = trigram_converter.get_feature_names_out().tolist()

In [None]:
# Vocabulary size for unigrams, bigrams and trigrams.
print (len(words), len(bigrams), len(trigrams))

In [None]:
# Ten consecutive unigram entries.
words[1000:1010]

In [None]:
# NOTE(review): prints the complete unigram vocabulary; consider a slice.
print(words)

In [None]:
# Ten consecutive bigram entries.
bigrams[10000:10010]

In [None]:
# NOTE(review): prints the complete bigram vocabulary; consider a slice.
print(bigrams)

In [None]:
# Ten consecutive trigram entries (an empty result if the vocabulary is shorter).
trigrams[100000:100010]

In [None]:
# NOTE(review): fixed index; fails when fewer than 50001 unigrams were found.
words[50000]

In [None]:
import seaborn as sns

In [None]:
# Line-and-marker plot of the vocabulary size for each n-gram order.
sns.set_style("darkgrid")
counts = [len(words), len(bigrams), len(trigrams)]

fig, ax = plt.subplots()
ax.plot(counts, color='cornflowerblue')
ax.plot(counts, 'bo')
ax.margins(0.1)
ax.set_xticks(range(3))
ax.set_xticklabels(['unigram', 'bigram', 'trigram'])
ax.tick_params(labelsize=14)
ax.set_title('Numero de ngrams dentro del sample de 10mil reviews', {'fontsize':16})
plt.show()

In [None]:
#pd.set_option('mode.chained_assignment', None)
#X['text_processed'] = X.apply(text_prep)

In [None]:
#sample.head()

In [None]:
wc = WordCloud(width=1600, height=800, random_state=42, max_words=1000000)

# generation
# Build the cloud from the complete review texts. str(Series) is only the
# printed representation (index labels, cells cut at the display column width,
# a name/dtype footer), so it must not be used as the corpus.
# The texts are read from `sample` because `X` is rebound to the sparse matrix
# further down, which would break this cell on a re-run.
wc.generate(' '.join(sample['text'].astype(str)))

plt.figure(figsize=(15,10), facecolor='black')
plt.title("Reviews", fontsize=40, color='white')
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=10)

In [None]:
# Clean one review of the sample with text_prep and display it.
review = text_prep(X.iloc[1000])
review

In [None]:
# Vectorise the cleaned review with the fitted unigram vectorizer and print
# the resulting sparse row.
bow_review = bow_converter.transform([review])
print(bow_review)

In [None]:
# Vocabulary term stored at one column index (presumably read off the
# bow_review output above; confirm after any change to the sample).
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(bow_converter.get_feature_names_out()[26223])

In [None]:
# Vectorise the sample texts. `X` was bound to sample['text'], so the input is
# the same, but reading it from `sample` keeps this cell re-runnable:
# transform(X) fails on a second run, once X already holds the sparse matrix.
X = bow_converter.transform(sample['text'])

In [None]:
n_docs, n_terms = X.shape
print('Shape of Sparse Matrix: ', X.shape)
print('Amount of Non-Zero occurrences: ', X.nnz)
# Percentage of non-zero values
density = 100.0 * X.nnz / (n_docs * n_terms)
print('Density: {}'.format(density))

Training/Test data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

Training the model

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, fitted on the training split.
nb = MultinomialNB()
nb.fit(X_train, y_train)

Testing the model

In [None]:
# Predicted labels for the held-out rows.
preds = nb.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix followed by the per-class classification report.
print(confusion_matrix(y_test, preds))
print('\n')
print(classification_report(y_test, preds))

Predictions

In [None]:
# Free-text review to classify; the commented line is an alternative input.
review_to_predict = 'This is a great place'
#review_to_predict = 'This is a poor place'

review_to_predict

In [None]:
# Vectorise the text with the fitted unigram vectorizer (text_prep is not applied here).
review_to_predict_transformed = bow_converter.transform([review_to_predict])

# Label predicted for that single review.
nb.predict(review_to_predict_transformed)[0]