In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.tokenizer import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
# Fetch the small English spaCy model that the spacy.load('en_core_web_sm') call below needs.
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [0]:
# Load the English pipeline and build a standalone Tokenizer on its vocabulary.
# NOTE(review): no prefix/suffix/infix rules are passed to Tokenizer here, so it
# presumably splits on whitespace only — confirm against the spaCy docs.
nlp = spacy.load('en_core_web_sm')
tokenizer = Tokenizer(nlp.vocab)

In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
# Download the VADER lexicon that SentimentIntensityAnalyzer relies on.
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [0]:
def analyze_sentiment_vader_lexicon(review, threshold=0.4,
                                    verbose=False):
  """Score a text with VADER and return its negative proportion.

  Args:
    review: Text to analyse.
    threshold: Absolute cutoff on the compound score. It only affects the
      verbose report: at or beyond the cutoff the compound score is shown as
      "Predicted Sentiment", otherwise the neutral share is shown.
    verbose: When True, print a one-row table with the sentiment statistics.

  Returns:
    The 'neg' entry of the VADER polarity scores for `review`.
  """
  # This function is applied row by row; constructing a new analyzer for every
  # call repeats the same setup work, so build it once and reuse it.
  analyzer = getattr(analyze_sentiment_vader_lexicon, '_analyzer', None)
  if analyzer is None:
    analyzer = SentimentIntensityAnalyzer()
    analyze_sentiment_vader_lexicon._analyzer = analyzer
  scores = analyzer.polarity_scores(review)
  if verbose:
    def as_percent(value):
      # Round to two decimals, scale to percent, then round again so float
      # noise such as 28.999999999999996 does not leak into the report.
      return round(round(value, 2)*100, 1)

    agg_score = scores['compound']
    # Clearly positive or clearly negative text reports the compound score;
    # everything in between reports the neutral share.
    if abs(agg_score) >= threshold:
      final_sentiment = as_percent(agg_score)
    else:
      final_sentiment = as_percent(scores['neu'])
    # display detailed sentiment statistics
    positive = str(as_percent(scores['pos']))+'%'
    final = round(agg_score, 2)
    negative = str(as_percent(scores['neg']))+'%'
    neutral = str(as_percent(scores['neu']))+'%'
    sentiment_frame = pd.DataFrame(
        [[final_sentiment, final, positive, negative, neutral]],
        columns=pd.MultiIndex(
            levels=[['SENTIMENT STATS:'],
                    ['Predicted Sentiment', 'Polarity Score',
                     'Positive', 'Negative', 'Neutral']],
            codes=[[0,0,0,0,0],[0,1,2,3,4]]))
    print(sentiment_frame)
  return scores['neg']

In [6]:
# Smoke test: score a short, plainly negative phrase.
analyze_sentiment_vader_lexicon('bad comment')

0.778

In [0]:
#df = pd.read_csv('https://raw.githubusercontent.com/Saltiest-Hackers/ML-Engineering/master/data/raw/hn_0.csv')

In [34]:
# List the working directory to check which files are available.
%ls

largedatacomment.csv  latest_model.pkl  [0m[01;34msample_data[0m/


In [0]:
# Load the comment dump into a DataFrame.
# NOTE(review): this is an absolute path (presumably a Colab runtime location) —
# confirm, or make it configurable, before running elsewhere.
df = pd.read_csv('/content/largedatacomment.csv')

In [37]:
# Quick check on real data: score the comment stored under index label 0.
analyze_sentiment_vader_lexicon(df['Comment'][0])

0.0

In [39]:
# Discard rows whose Comment is missing, then preview the frame.
df = df.dropna(subset=['Comment'])
df.head()

Unnamed: 0,Comment_ID,Comment,UserName,StoryId,Deleted
0,23341390,&gt; Java 8 language support update: APIs you ...,pjmlp,23340887,
1,23342194,I’m still amazed and somewhat annoyed by the f...,krm01,23340887,
2,23341996,"1. The selected Java SDK API desugaring, thoug...",pcx,23340887,
3,23341990,What program did they use to do the SpeakerVid...,TuringNYC,23340887,
4,23341822,&gt; Clangd support for C++<p>That’s interesti...,dgellow,23340887,


In [0]:
# Work on an independent copy so the column assignment below does not touch
# a view of the frame produced by dropna.
df = df.copy()
df['Comment'] = df['Comment'].astype(str)
# Fixed seed so a fresh Restart & Run All draws the same rows (and therefore
# the same labels and model); cap the size so a frame with fewer than 1000
# rows does not make sample() raise.
subset = df.sample(n=min(1000, len(df)), random_state=42)

In [0]:
# Score every sampled comment. The scoring function returns VADER's 'neg'
# value, so sent_score is the negative share of each comment.
subset['sent_score'] = subset['Comment'].apply(analyze_sentiment_vader_lexicon)

In [47]:
# Preview the sampled rows together with their new sent_score column.
subset.head()

Unnamed: 0,Comment_ID,Comment,UserName,StoryId,Deleted,sent_score
1971,23332590,Why?,edem,23321096,,0.0
227,23334868,Seems interesting how he predicted how social ...,lanevorockz,23334463,,0.0
2648,23326631,"Honestly, it&#x27;s hard for me to take anythi...",dvdhnt,23322793,,0.048
403,23339233,I actually ditched Docker on Windows with the ...,yndoendo,23337898,,0.032
2455,23323168,"Crybaby cries, news at 11.",baggachipz,23322112,True,0.403


In [0]:
# Average negative score per author. Every sampled row has a non-null Comment
# (cast to str earlier), so sum / count is just the group mean — compute it
# with a single groupby instead of two.
authors_rank = subset.groupby('UserName')['sent_score'].mean()

In [51]:
# Number of distinct UserName values in the sample.
len(subset['UserName'].unique())

905

In [0]:
def toint(content):
  """Convert a fractional score to an integer number of hundredths.

  Args:
    content: A float score, e.g. 0.29.

  Returns:
    The score rounded to two decimals and scaled by 100, as an int
    (0.29 -> 29).
  """
  # round(x, 2) * 100 is still a float and can land just below the intended
  # integer (0.29 * 100 == 28.999999999999996); int() alone would truncate
  # that to 28, so round the product to the nearest integer first.
  return int(round(round(content, 2) * 100))

In [0]:
# Features are the raw comment strings; labels are the negative scores
# converted to integers by toint.
x_train = subset['Comment'].values
y_train = subset['sent_score'].apply(toint).values

In [17]:
# Inspect the integer labels.
y_train

array([ 0, 14,  0, 15, 19, 17, 10, 12,  4,  4,  0,  0,  0,  0, 21,  4, 28,
        9,  3,  0,  3, 19,  0,  6,  0,  0,  7,  9, 11,  0, 12,  0, 13, 42,
        0, 12,  0,  3,  4,  7,  4,  7,  9,  5, 18,  4,  0,  0, 16, 10,  7,
        8,  0,  6,  6, 20,  6, 43,  0,  0,  7,  7,  1,  0, 13,  0,  0, 10,
        7, 22,  0,  2, 12,  0, 10,  0,  8,  4,  0, 12,  5,  5,  0,  4,  0,
       12, 17, 17,  0,  9, 19,  5,  0, 13,  0, 22, 12, 10,  0, 18,  0,  0,
        4,  0,  0,  6,  3,  0,  6,  8, 10,  0, 17, 12,  0, 27,  7,  0,  9,
       15,  0, 11, 16,  0,  6,  0,  0, 12,  0,  3,  7,  0, 17,  9,  2,  6,
        4,  0,  0,  2,  0,  3,  5,  3,  4,  0,  9, 13,  7, 12,  9,  8,  3,
        5,  1, 14,  5,  3, 18,  0,  0,  0,  0,  4,  0, 10,  0, 26,  0,  0,
       24, 13,  4, 13,  3,  7, 12,  0,  0,  0,  0,  0, 20,  0,  0,  0,  0,
       35,  0,  0,  0,  0,  3, 18,  6,  0,  4,  0,  4,  6,  0, 10,  0,  0,
        3,  0,  5,  0, 15, 21,  0,  0,  0,  0,  4, 28, 10,  0, 27,  0, 36,
        0,  0, 17, 11, 13

In [0]:
# Run the comments through the tokenizer in batches of 500 and keep the
# lower-cased text of every token, one list per document.
tokens = [
  [token.text.lower() for token in doc]
  for doc in tokenizer.pipe(x_train, batch_size=500)
]
x_train = tokens

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF with English stop words removed.
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
# The vectorizer expects one string per document, so re-join the token lists.
x_train = [" ".join(row) for row in x_train]

x_train = vect.fit_transform(x_train)
# get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(vect, 'get_feature_names_out'):
  feature_names = vect.get_feature_names_out()
else:
  feature_names = vect.get_feature_names()
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
x_train = pd.DataFrame(x_train.toarray(), columns=feature_names)
x_train.head()

Unnamed: 0,00,00 application,000,000 adults,000 depending,000 different,000 employees,000 nerds,000 people,000 person,000 preventable,01,01 unnecessary,01 variable,01 x27,01 x2f,0134555,0134555 rel,02,02 x2f,021,021 025h,025h,025h 170,02684527,02684527 2020,03,03 31,039162761,039162761 ebola,0391627617,0391627617 ci23x11,04,04 12,04 x2f,04f5osxk4vw,04f5osxk4vw rel,05,05 19,05 27,...,zettelkasten,zettelkasten href,zettelkasten imho,zettelkasten x2f,zhfpbw,zhfpbw nuwk,zim,zim past,zionists,zionists jews,zip,zip links,zone,zone status,zones,zones place,zoom,zoom built,zoom disabled,zoom look,zoom meetings,zoom x2f,zoox,zoox billion,zoox dumped,zoox x27,zotero,zotero physicists,zotero switched,zotero track,zuckerberg,zuckerberg facebook,ωb,ωb 051,карта,карта спутниковой,обстановки,обстановки rooms,спутниковой,спутниковой обстановки
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
from sklearn.linear_model import LogisticRegression
# L2-penalised logistic regression fitted on the TF-IDF feature frame.
# NOTE(review): y_train holds integer-scaled sentiment scores, so this fits a
# classifier with one class per distinct integer rather than a regressor —
# confirm that is intended.
lr = LogisticRegression(penalty='l2', max_iter=100)

lr.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Vectorise two hand-written examples with the already-fitted TF-IDF vocabulary.
test = vect.transform(['really bad comment','I love this'])

In [59]:
# Predicted integer label for each of the two examples.
lr.predict(test)

array([0, 0])

In [0]:
# Rebuild x_train / y_train from `subset`: x_train was overwritten above with
# the TF-IDF DataFrame, while the pipeline below starts from raw strings.
x_train = subset['Comment'].values
y_train = subset['sent_score'].apply(toint).values

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# End-to-end text pipeline: raw strings -> counts -> TF-IDF -> classifier.
pipeline = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', LogisticRegression(random_state=101)),  # logistic-regression classifier trained on the TF-IDF vectors
])

pipeline.fit(x_train,y_train)
# NOTE(review): predictions are made on the same rows the pipeline was fitted
# on, so they do not measure how well the model generalises.
predictions = pipeline.predict(x_train)

In [0]:
# Store the pipeline's predicted label next to each sampled comment.
subset['pred'] = predictions

In [65]:
# Show the comments whose predicted label is above 6 (labels are scores
# scaled by 100 via toint, so this is above 0.06 on the original scale).
subset[subset['pred'] > 6]['Comment'].values

array(['It&#x27;s not that I want to visit the store it&#x27;s that I can&#x27;t trust the store people to properly pick out fresh produce or good meat.<p>When the stock person just takes a bucket of apples and dumps them in without care so most of them are bruised, why would I want that same person selecting which apples to send me?  Often times to find 3 apples I have to examine 10+.  Most of the produce selection is this way.<p>Meat selection is not much different.  Selecting chicken without careful examination you&#x27;ll get broken legs or wings.<p>Now you have to consider the automatic substitution of equivalent items when something is out of stock.  My dibetic friend was telling yesterday that they substituted regular mt dew for his order of mt dew zero sugar.<p>Until the store starts employing people who care about product selection as much as I do, then I&#x27;ll continue to make time to go to the store and pick it myself.',
       '<i>They won&#x27;t spit it out directly of c

In [0]:
import pickle
import os

In [0]:
# File the fitted pipeline is pickled to and loaded from; a bare file name,
# so it resolves against the current working directory.
MODEL_FILEPATH = "latest_model.pkl"

In [68]:
# Persist the fitted pipeline, wrapped in a dict under the "model" key, to MODEL_FILEPATH.
print("SAVING THE MODEL...")
with open(MODEL_FILEPATH, "wb") as model_file:
  pickle.dump({"model": pipeline}, model_file)

SAVING THE MODEL...


In [0]:
def load_model():
    """Unpickle and return whatever object is stored at MODEL_FILEPATH.

    Prints a short status line before reading the file.
    """
    print("LOADING THE MODEL...")
    with open(MODEL_FILEPATH, "rb") as handle:
        return pickle.load(handle)


In [70]:
# Round-trip check: reload the pickled package, pull out the pipeline stored
# under "model", and predict on the training comments again.
package = load_model()
model = package['model']
pred = model.predict(x_train)

LOADING THE MODEL...


In [71]:
# Inspect the labels predicted by the reloaded pipeline.
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 9, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,