In [1]:
import re
import pandas as pd
import emoji
import string
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def clean_title(x):
    tag = r"([\s\(\)\[\],\/-]*\b((\d{1,2})[\s\(\)\[\],\/-]*([mf]|female|male))\b[\s\(\)\[\],\/-]*|\b[\s\(\)\[\],\/-]*(([mf]|female|male)[\s\(\)\[\],\/-]*(\d{1,2}))\b)"
    age = r"\d{1,2}"
    gender = r"(\b|\d)([mMfF])"
    match = re.sub(tag, '', x, flags=re.I)
    match = emoji.get_emoji_regexp().sub(r'', match)
    match = match.translate(str.maketrans('', '', string.punctuation))
    return match

In [12]:
df = pd.read_csv("../data/posts.csv", index_col="id")
ratings = df[["title", "average_rating"]].dropna(subset=["average_rating"])

X = ratings.title
X = X.apply(clean_title)
y = ((ratings.average_rating - ratings.average_rating.mean()) / ratings.average_rating.std())
y = y > 0

vectorizer = CountVectorizer(min_df=5, ngram_range=(3, 4))
X = vectorizer.fit_transform(X)
print(X.shape)

(27454, 6314)


In [13]:
from sklearn.linear_model import LinearRegression, Ridge

In [14]:
reg = LinearRegression().fit(X, y)
print(reg.score(X, y))
print("most negative")
print("\n".join([str((vectorizer.get_feature_names()[i], reg.coef_[i])) for i in reg.coef_.argsort()[:10]]))
print("most positive")
print("\n".join([str((vectorizer.get_feature_names()[i], reg.coef_[i])) for i in (-reg.coef_).argsort()[:10]]))


0.22176161837916128
most negative
('of what others', -2.9494446584391345)
('not having much luck', -2.3051935231436667)
('open to suggestions on', -2.0460379743827244)
('lost some weight recently', -1.9452607893586504)
('would love to get', -1.8634962680878104)
('should do to improve', -1.8150424535037728)
('lose weight and', -1.6980477548008965)
('to look my best', -1.6857245392193496)
('for ways to improve', -1.6805423646361892)
('anything like this before', -1.6731685937545036)
most positive
('been going to the', 2.252834387973641)
('change up my hair', 2.0432869294169578)
('curious of what others', 1.9828167777791539)
('idea what people', 1.9644397127177198)
('of what others think', 1.9642896776969638)
('really know where stand', 1.9420387050656225)
('on what people', 1.8040222175072504)
('me what you got', 1.7811485414478139)
('im wondering if its', 1.7791824609903149)
('be honest can handle', 1.751504900500844)


In [15]:
ridge = Ridge().fit(X, y)
print(ridge.score(X, y))

print("most negative")
print("\n".join([str((vectorizer.get_feature_names()[i], ridge.coef_[i])) for i in ridge.coef_.argsort()[:10]]))
print("most positive")
print("\n".join([str((vectorizer.get_feature_names()[i], ridge.coef_[i])) for i in (-ridge.coef_).argsort()[:10]]))


0.19846450703363194
most negative
('comes to my', -0.862983655346476)
('its time to', -0.6081485241767967)
('honestly think of', -0.5914928381833302)
('look better without', -0.5688491724896164)
('decided to post here', -0.5619540267589686)
('and still have', -0.5619491599925692)
('is the only', -0.5601745512441181)
('the process of losing', -0.5588970398251094)
('since last post', -0.5517987065304352)
('pull any punches', -0.5385273167955745)
most positive
('what you would rate', 0.5969205165125211)
('im wondering if its', 0.5895612279483424)
('love to get some', 0.5420970766578329)
('verification pic is', 0.5350903388825903)
('me also do', 0.5332445811538595)
('been going to the', 0.5323343926810438)
('people say look like', 0.5247666120493227)
('and im not sure', 0.5207030233194735)
('version of myself', 0.5067437639397483)
('see what others think', 0.4984005793340002)


In [16]:
from sklearn.svm import LinearSVC

clf = LinearSVC()
clf.fit(X, y)

print("most negative")
print("\n".join([str((vectorizer.get_feature_names()[i], clf.coef_[0][i])) for i in clf.coef_[0].argsort()[:10]]))
print("most positive")
print("\n".join([str((vectorizer.get_feature_names()[i], clf.coef_[0][i])) for i in (-clf.coef_[0]).argsort()[:10]]))

most negative
('comes to my', -2.092183155986977)
('its time to', -1.8917139205868054)
('look better without', -1.8870581755046518)
('comment on my appearance', -1.839073112718657)
('honestly think of', -1.7752841341467007)
('before want to', -1.6871411219661958)
('pull any punches', -1.5673035223992189)
('the process of losing', -1.5549577427998864)
('to cut my hair', -1.5398031951730102)
('since last post', -1.5371954196791227)
most positive
('and im not sure', 1.7767214630169692)
('what you would rate', 1.7662444457886612)
('im wondering if its', 1.7323504398018408)
('verification pic is', 1.7182538252485915)
('been going to the', 1.7097599059245445)
('love to get some', 1.6220351769063548)
('bad at taking', 1.6100155336814947)
('me what you got', 1.5799349186148854)
('me to do', 1.575397187114585)
('me but be', 1.5512665662408673)


In [17]:
from sklearn.metrics import *

print("Accuracy:", accuracy_score(y, clf.predict(X)))
print("AUROC:", roc_auc_score(y, clf.decision_function(X)))

Accuracy: 0.6911196911196911
AUROC: 0.7629669266783874


In [None]:
scores = df[["title", "score"]].dropna(subset=["score"])
X = scores.title
X = X.apply(clean_title)
y = ((scores.score - scores.score.mean()) / scores.score.std())
y = y > 0

vectorizer = CountVectorizer(min_df=5, ngram_range=(3, 4))
X = vectorizer.fit_transform(X)
print(X.shape)

In [None]:
from sklearn.svm import LinearSVC

clf = LinearSVC()
clf.fit(X, y)

print("most negative")
print("\n".join([str((vectorizer.get_feature_names()[i], clf.coef_[0][i])) for i in clf.coef_[0].argsort()[:10]]))
print("most positive")
print("\n".join([str((vectorizer.get_feature_names()[i], clf.coef_[0][i])) for i in (-clf.coef_[0]).argsort()[:10]]))

In [19]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

In [5]:
df = pd.read_csv("../data/posts.csv", index_col="id")

In [27]:
df.iloc[1].title

'[20M] What do you think of the Arabic /Russian mix ☺️?'

In [156]:
doc = nlp(clean_title(df.iloc[1].title))

In [158]:
displacy.render(doc)

In [66]:
frags

['do think', 'think', 'think of']

In [95]:

def extract_fragments(text):
    """
    Following the intuition that the
    bulk of this functional information is contained in
    the root of a question’s dependency parse along
    with its outgoing arcs (Iyyer et al., 2014a), we
    take the fragments of a question to be the root of
    its parse tree, along with each (root, child) pair.
    To capture cases when the operational word in
    the question is not connected to its root (such as
    “What...”), we also consider the initial unigram
    and bigram of a question as fragments. The following
    question has 5 fragments:
        what, what is, going→*, is←going and going→do
        
        What is the minister going to do about ... ?
    """
    doc = nlp(clean_title(text))
    if len(doc) == 0:
        return []
    fragments = [str(doc[0]), str(doc[:2])]
    # We take as NPs subtrees connected to the root with the
    # following: nsubj, nsubjpass, dobj, iobj, pobj, attr.

    np_deps = ["nsubj", "nsubjpass", "dobj", "iobj", "pobj", "attr"]
    root = None
    for tok in doc:

        if tok.head.dep_ == "ROOT" and tok.dep_ not in np_deps:
            if tok.dep_ == "ROOT":
                root = tok
                fragments.append(str(tok))
                continue
            if not root:
                fragments.append("{} {}".format(tok, tok.head))
                continue
            else:
                fragments.append("{} {}".format(tok.head, tok))
                continue
    return fragments

In [96]:
fragments = df.title.apply(extract_fragments)

In [97]:
fragments.head()

id
dh3ha2                                [How, Hows, How s, s]
dh3b61           [What, What do, do think, think, think of]
dh37sw    [Kinda, Kinda low, Kinda esteem, self esteem, ...
dh368y    [Got, Got a, Got are, are, are with, are some,...
dh32ch                             [RateMe, RateMe, RateMe]
Name: title, dtype: object

In [98]:
frag_list = []
for frags in fragments.to_numpy():
    frag_list += frags

In [106]:
frag_list = pd.Series(frag_list).str.strip().str.lower()

In [153]:
frag_list[frag_list.apply(lambda x: len(x.split()) >= 2)].value_counts()[:50]

rate me            8430
be honest          3735
just curious       2809
looking for        2688
please rate        1390
rate please        1155
please be          1146
do think           1083
rate and            911
give me             892
think do            889
do look             739
what do             693
honest opinions     640
just wondering      638
let know            626
rate my             603
always been         594
want know           577
just want           555
never had           521
just got            514
be please           511
m curious           492
how look            465
just looking        465
look do             460
curious about       429
curious think       428
am i                424
first time          416
not sure            410
how do              405
rate honestly       396
rate 110            380
i have              374
would like          356
would love          334
lost and            330
been told           330
’m curious          326
know do         