In [0]:
import pandas as pd
import numpy as np
import spacy
from spacy.tokenizer import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

In [0]:
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [0]:
# Load the small English spaCy pipeline fetched by the download cell above.
nlp = spacy.load('en_core_web_sm')
# Stand-alone tokenizer built from the pipeline's vocabulary only; no prefix,
# suffix or infix rules are passed to it here.
# NOTE(review): presumably this splits on whitespace only -- confirm against
# the spaCy Tokenizer documentation.
tokenizer = Tokenizer(nlp.vocab)

In [0]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [0]:
# Shared VADER analyzer, created lazily on the first call. Building a
# SentimentIntensityAnalyzer per review is wasted work when this function is
# applied to every row of a DataFrame column.
_VADER_ANALYZER = None


def analyze_sentiment_vader_lexicon(review, threshold=0.4,
                                    verbose=False):
  """Score ``review`` with VADER and return its negative-sentiment proportion.

  Args:
    review: Text to analyze.
    threshold: Minimum absolute compound score for the compound score to be
      reported as the predicted sentiment in the verbose table; below it the
      neutral proportion is reported instead. It does not affect the return
      value.
    verbose: When True, print a one-row DataFrame with the predicted
      sentiment, the compound polarity score and the positive / negative /
      neutral percentages.

  Returns:
    The ``'neg'`` entry of the VADER polarity scores for ``review``.
  """
  global _VADER_ANALYZER
  if _VADER_ANALYZER is None:
    _VADER_ANALYZER = SentimentIntensityAnalyzer()
  scores = _VADER_ANALYZER.polarity_scores(review)

  if verbose:
    def as_percent(value):
      # Round to two decimals first (as before), then re-round after scaling
      # so float noise such as 7.000000000000001 never reaches the output.
      return round(round(value, 2) * 100, 2)

    agg_score = scores['compound']
    # A strong score in either direction reports the compound value;
    # anything weaker falls back to the neutral proportion.
    if abs(agg_score) >= threshold:
      final_sentiment = as_percent(agg_score)
    else:
      final_sentiment = as_percent(scores['neu'])

    # display detailed sentiment statistics
    positive = str(as_percent(scores['pos']))+'%'
    final = round(agg_score, 2)
    negative = str(as_percent(scores['neg']))+'%'
    neutral = str(as_percent(scores['neu']))+'%'
    sentiment_frame = pd.DataFrame(
        [[final_sentiment, final, positive, negative, neutral]],
        columns=pd.MultiIndex(
            levels=[['SENTIMENT STATS:'],
                    ['Predicted Sentiment', 'Polarity Score',
                     'Positive', 'Negative', 'Neutral']],
            codes=[[0,0,0,0,0],[0,1,2,3,4]]))
    print(sentiment_frame)
  return scores['neg']

In [0]:
# Smoke test: the function returns the VADER 'neg' proportion for a short phrase.
analyze_sentiment_vader_lexicon('bad comment')

0.778

In [0]:
# Raw comment dump (hn_0.csv) hosted in the Saltiest-Hackers ML-Engineering repo.
HN_COMMENTS_URL = 'https://raw.githubusercontent.com/Saltiest-Hackers/ML-Engineering/master/data/raw/hn_0.csv'
df = pd.read_csv(HN_COMMENTS_URL)

In [0]:
# Score the comment stored under index label 0 of the freshly loaded frame
# (this runs before rows with missing text are dropped).
analyze_sentiment_vader_lexicon(df['text'][0])

0.062

In [0]:
# Keep only the rows that actually carry comment text, then preview them.
has_text = df['text'].notna()
df = df[has_text]
df.head(5)

Unnamed: 0.1,Unnamed: 0,id,by,author,time,time_ts,text,parent,deleted,dead,ranking
0,0,10379195,rhaps0dy,rhaps0dy,1444725842,2015-10-13 08:44:02+00:00,I&#x27;m not sure how much of it has been prov...,10379167,,,0
1,1,10379193,pixelHD,pixelHD,1444725804,2015-10-13 08:43:24+00:00,Minecraft eh? I did the same with GTA San Andr...,10377203,,,0
2,2,10379192,test1235,test1235,1444725705,2015-10-13 08:41:45+00:00,I think the closest universal interest for me ...,10372063,,,0
3,3,10379190,copsarebastards,copsarebastards,1444725645,2015-10-13 08:40:45+00:00,&gt; Why do you care about the success of the ...,10376640,,,0
4,4,10379189,spike021,spike021,1444725623,2015-10-13 08:40:23+00:00,What kinds of ramifications would there be if ...,10378759,,,1


In [0]:
# Cast the text column to str on a fresh frame rather than writing into the
# filtered frame in place.
df = df.assign(text=df['text'].astype(str))

# Draw a fixed-size working sample. The seed makes a Restart & Run All
# reproduce the same rows, the size is capped so a short frame cannot raise,
# and .copy() lets later cells add columns without touching `df`.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42
subset = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED).copy()

In [0]:
# One VADER negative-sentiment score per sampled comment.
comment_texts = subset['text']
subset['sent_score'] = comment_texts.map(analyze_sentiment_vader_lexicon)

In [0]:
# Preview the sample, now including the new sent_score column.
subset.head()

Unnamed: 0.1,Unnamed: 0,id,by,author,time,time_ts,text,parent,deleted,dead,ranking,sent_score
97122,97122,10261144,cdvonstinkpot,cdvonstinkpot,1442951176,2015-09-22 19:46:16+00:00,The big companies might be on one of these occ...,10261043,,,2,0.0
115095,115095,10239230,cryoshon,cryoshon,1442584964,2015-09-18 14:02:44+00:00,"Of course the data was held back, it would hur...",10238112,,,4,0.13
8866,8866,10368501,dpifke,dpifke,1444550751,2015-10-11 08:05:51+00:00,"There&#x27;s a DNSBL for that:<p><a href=""http...",10367653,,,0,0.0
60053,60053,10306460,Fiahil,Fiahil,1443641397,2015-09-30 19:29:57+00:00,"Usually, festivals require that the movie is n...",10306360,,,0,0.0
61802,61802,10304292,bradleyjg,bradleyjg,1443624127,2015-09-30 14:42:07+00:00,Eight restaurants reviewed in Queens in twenty...,10303402,,,1,0.0


In [0]:
# Average negative-sentiment score per author: the summed scores divided by
# the number of comments that author has in the sample.
_by_author = subset.groupby('author')
_score_totals = _by_author['sent_score'].sum()
_comment_counts = _by_author['text'].count()
authors_rank = _score_totals / _comment_counts

In [0]:
# Number of distinct authors in the sample (a missing author counts as one value).
subset['author'].nunique(dropna=False)

875

In [0]:
def toint(content):
  """Convert a proportion into an integer percentage (e.g. 0.13 -> 13).

  The value is rounded to two decimals, scaled by 100 and then rounded to the
  nearest integer. The final rounding matters: ``int()`` alone truncates, and
  products such as ``0.29 * 100 == 28.999999999999996`` would come out one
  too low.

  Args:
    content: Numeric score, e.g. a VADER proportion between 0 and 1.

  Returns:
    The score as an ``int`` percentage.
  """
  return int(round(round(content, 2) * 100))

In [0]:
# Features are the raw comment strings; targets are the negative-sentiment
# scores converted to integer percentages.
x_train = subset['text'].to_numpy()
y_train = subset['sent_score'].map(toint).to_numpy()

In [0]:
# Inspect the integer percentage targets.
y_train

array([ 0, 13,  0,  0,  0,  0,  0,  0,  2,  5,  0,  0,  0,  0,  0,  0,  2,
       11, 14, 10, 18,  9,  0,  0, 13, 35,  4,  7,  3,  0,  0,  2,  0,  9,
        0, 15,  6,  6,  5,  0,  0,  0,  0, 15, 17,  2,  0, 12,  1,  4,  0,
       20,  0,  0,  9,  0,  0,  0,  0,  0,  4, 22,  2,  0,  0,  0,  7,  7,
        0,  0,  0, 10,  0,  0,  6,  0,  9, 11, 12,  0,  0,  0,  6, 13, 26,
        0, 19, 16, 11,  0,  0, 12,  6,  0,  8,  0, 19,  0,  0,  0, 19,  0,
       14,  4,  2,  0, 16, 13,  0,  0,  4,  0,  8,  0, 42,  0,  3, 12, 11,
        9,  2, 15, 12,  6,  0,  0, 21,  6,  0,  0, 21,  0,  7,  0, 10,  6,
       28,  0, 15,  0,  0,  6,  0,  7,  0,  0, 22,  0, 23,  7,  1,  9, 25,
        5,  0,  8,  0,  6,  0,  0, 10,  0,  3,  0, 19,  0,  0,  7, 12, 24,
        0,  7,  0,  0,  0,  1, 12,  8,  0,  7,  0,  5,  2,  6, 15,  0,  5,
        0,  6,  0,  6, 13,  7,  0,  0, 13,  0,  0,  3,  9,  4,  8,  5, 31,
        5,  0,  0, 12, 16,  0,  0,  0,  0,  0,  3,  0,  3, 10,  7,  0,  6,
        8, 14,  0,  7,  0

In [0]:
# Tokenize the comments in batches of 500 and keep each token's lower-cased
# text, giving one list of strings per comment.
tokens = [
    [tok.text.lower() for tok in parsed]
    for parsed in tokenizer.pipe(x_train, batch_size=500)
]
x_train = tokens

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features with English stop words removed.
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))

# The vectorizer expects one string per document, so re-join the token lists.
# NOTE: any held-out texts must be joined the same way and passed through
# vect.transform(), never fit_transform().
x_train = [" ".join(row) for row in x_train]

x_train = vect.fit_transform(x_train)

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# .toarray() yields a plain ndarray (rather than np.matrix from .todense()).
x_train = pd.DataFrame(x_train.toarray(), columns=feature_names)
x_train.head()

Unnamed: 0,00,00 1983,00 using,000,000 00,000 000,000 fully,000 guess,000 lbs,000 noticeable,000 objects,000 place,000 probably,000 released,000 times,000 vacant,000 year,003,003 code,00s,00s images,01,01 x2f,01191,01191 pdf,015,015 1e5,015 city,02,02 x2f,02551,02551 rel,02551 relevant,03,03 x2f,04,04 sure,04 x2f,05,05 x2f,...,zealand,zealand large,zealand time,zephyr,zephyr directions,zero,zero add,zero caffeine,zero desire,zero nrz,zero phone,zero result,zero school,zero transferred,zeromq,zeromq mongodb,zeros,zeros x27,zfs,zfs perform,zillow,zillow fall,zillow hand,zip,zip file,zombie,zombie cookie,zones,zones supposed,zoning,zoning ordinance,zoomermag,zoomermag com,zzzzzzzzzzzzzzzzzzzz,ʌmma,ʌmma 아빠,아빠,아빠 appa,엄마,엄마 eomma
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
from sklearn.linear_model import LogisticRegression
# L2-penalised logistic regression, capped at 100 solver iterations.
lr = LogisticRegression(penalty='l2', max_iter=100)

# y_train holds the integer percentages produced by toint, so every distinct
# percentage value is fitted as its own class label.
lr.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Two hand-written probes, vectorized with the already-fitted TF-IDF vocabulary.
sample_comments = ['really bad comment', 'I love this']
test = vect.transform(sample_comments)

In [0]:
# Predicted class (integer negative-sentiment percentage) for the two probes.
lr.predict(test)

array([0, 0])

In [0]:
# Reset the training inputs: x_train was overwritten with the TF-IDF frame by
# the earlier cells, and the pipeline below needs the raw comment strings.
x_train = subset['text'].to_numpy()
y_train = subset['sent_score'].map(toint).to_numpy()

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# End-to-end text pipeline: raw strings -> token counts -> TF-IDF weights ->
# logistic-regression classifier over the integer percentage labels.
pipeline = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', LogisticRegression(random_state=101)),  # logistic-regression classifier trained on the TF-IDF vectors
])

pipeline.fit(x_train,y_train)
# NOTE(review): these predictions are made on the same texts the pipeline was
# fitted on, so they do not measure held-out performance.
predictions = pipeline.predict(x_train)

In [0]:
# Store the pipeline's predictions for the sampled comments alongside their text.
subset['pred'] = predictions

In [0]:
# Show the comments whose predicted negative-sentiment percentage is above 6.
subset[subset['pred'] > 6]['text'].values

array(['As someone who is reasonably familiar with term sheets, I feel embarrassed that I never gave much thought to the moral hazard that liquidation-preference clauses create.<p>Viewed from the perspective of a founder, they are a serious annoyance. But from the perspective of the public at large, they are being exploited to engage in pseudo-fraud.<p>Right now, the world treats a company&#x27;s &#x27;valuation&#x27; as a reliable signal of information about how much investors actually believe the company is worth. That information is then integrated into heuristics that influence various people&#x27;s decisions: Will a paper write about a startup? Will a reader pay attention? Will a recruit take the company seriously? A billion dollar valuation goes a long way in each circumstance.<p>The problem is that our collective intuitions are using an outdated algorithm for assessing value. In theory, VCs who invest at a given valuation are providing reliable information by putting their money

In [0]:
import pickle
import os

In [0]:
# Pickle file for the trained pipeline, relative to the working directory.
# (os.path.join with a single component returns it unchanged, so the bare
# string is equivalent.)
MODEL_FILEPATH = "latest_model.pkl"

In [0]:
# Persist the fitted pipeline, wrapped in a dict under the "model" key so the
# loader can look it up by name.
print("SAVING THE MODEL...")
payload = {"model": pipeline}
with open(MODEL_FILEPATH, "wb") as model_file:
  pickle.dump(payload, model_file)

SAVING THE MODEL...


In [0]:
def load_model(filepath=None):
    """Load a pickled model package from disk.

    Args:
        filepath: Path of the pickle file to read. Defaults to
            ``MODEL_FILEPATH`` when omitted, so existing ``load_model()``
            calls behave as before.

    Returns:
        The unpickled object. For files written by the save cell this is a
        dict holding the fitted pipeline under the ``"model"`` key.
    """
    if filepath is None:
        filepath = MODEL_FILEPATH
    print("LOADING THE MODEL...")
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # this notebook (or another trusted source) produced.
    with open(filepath, "rb") as model_file:
        saved_model = pickle.load(model_file)
    return saved_model


In [0]:
# Round-trip check: reload the pickled package and predict with the restored
# pipeline.
package = load_model()
model = package['model']
# x_train is the array of raw comment texts the pipeline was fitted on.
pred = model.predict(x_train)

LOADING THE MODEL...


In [0]:
# Inspect the predictions produced by the reloaded pipeline.
pred

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  4,  0,  0,  0,  0,  0,  3,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  6,
        0,  0,  0,  0,  0,  6,  0,  0,  0,  0,  6,  0,  0,  0,  0,  0,  0,
        0,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0