In [42]:
import re
import pandas as pd
import numpy as np
import emoji
import string
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
def clean_title(x):
    tag = r"([\s\(\)\[\],\/-]*\b((\d{1,2})[\s\(\)\[\],\/-]*([mf]|female|male))\b[\s\(\)\[\],\/-]*|\b[\s\(\)\[\],\/-]*(([mf]|female|male)[\s\(\)\[\],\/-]*(\d{1,2}))\b)"
    age = r"\d{1,2}"
    gender = r"(\b|\d)([mMfF])"
    match = re.sub(tag, '', x, flags=re.I)
    match = emoji.get_emoji_regexp().sub(r'', match)
    match = match.translate(str.maketrans('', '', string.punctuation))
    return match

In [43]:
df = pd.read_csv("../data/all_posts.csv", index_col="id")
ratings = df[["title", "average_rating"]].dropna(subset=["average_rating"])

X = ratings.title
X = X.apply(clean_title)
y = ((ratings.average_rating - ratings.average_rating.mean()) / ratings.average_rating.std())
y = y > 0

vectorizer = CountVectorizer(min_df=5, ngram_range=(3, 4))
X = vectorizer.fit_transform(X)
print(X.shape)

(93444, 17598)


In [44]:
from sklearn.linear_model import LinearRegression, Ridge

In [45]:
reg = LinearRegression().fit(X, y)
print(reg.score(X, y))
print("most negative")
print("\n".join([str((vectorizer.get_feature_names()[i], reg.coef_[i])) for i in reg.coef_.argsort()[:10]]))
print("most positive")
print("\n".join([str((vectorizer.get_feature_names()[i], reg.coef_[i])) for i in (-reg.coef_).argsort()[:10]]))


0.18329785002861387
most negative
('too old to be', -2.386560932893037)
('been confident in my', -2.0687785410986)
('to 10 where', -1.9818034446777986)
('of my mind', -1.9175953447576555)
('love some opinions', -1.8829268356040516)
('idea where im', -1.8547945632267673)
('be honest looking for', -1.837554806436986)
('come across to', -1.7856437161235565)
('just like everyone', -1.7430371696061298)
('what am working with', -1.7389873658286346)
most positive
('the one in the', 2.108853163317652)
('from to 10 where', 2.052526811839095)
('curious just like everyone', 1.9293187742632634)
('didnt get any responses', 1.9001685550140879)
('me also beard', 1.8393692423026138)
('of year relationship', 1.7577413358757115)
('do stand on', 1.7398420985621876)
('never thought of myself', 1.7218931071038983)
('looks out of 10', 1.7116420298801946)
('for any suggestions', 1.7031411733182118)


In [46]:
ridge = Ridge().fit(X, y)
print(ridge.score(X, y))

print("most negative")
print("\n".join([str((vectorizer.get_feature_names()[i], ridge.coef_[i])) for i in ridge.coef_.argsort()[:10]]))
print("most positive")
print("\n".join([str((vectorizer.get_feature_names()[i], ridge.coef_[i])) for i in (-ridge.coef_).argsort()[:10]]))


0.16586406381299823
most negative
('should keep the facial', -0.6736803094510072)
('months and want', -0.6562664256432262)
('any advice and', -0.6371390861370904)
('much luck on tinder', -0.5983626687224751)
('the process of losing', -0.5972198090455605)
('id give this shot', -0.5917366322780354)
('critical as possible', -0.5754834802589426)
('ratings would be appreciated', -0.5746687523542411)
('as can be', -0.5736031001647823)
('too old to be', -0.5622066903955212)
most positive
('love to get some', 0.6750423659072798)
('just went through break', 0.6717121144050228)
('wondering if should keep', 0.6663041924420904)
('really need to know', 0.6492171895642987)
('for too long', 0.6253323913770503)
('but was curious', 0.6223437998693262)
('im the guy on', 0.6190783683174077)
('just curious any suggestions', 0.6030674784994611)
('heading off to college', 0.5914939516525364)
('please be kind but', 0.587187329162937)


In [47]:
from sklearn.svm import LinearSVC

clf = LinearSVC()
clf.fit(X, y)

print("most negative")
print("\n".join([str((vectorizer.get_feature_names()[i], clf.coef_[0][i])) for i in clf.coef_[0].argsort()[:10]]))
print("most positive")
print("\n".join([str((vectorizer.get_feature_names()[i], clf.coef_[0][i])) for i in (-clf.coef_[0]).argsort()[:10]]))

most negative
('much luck on tinder', -2.027531948724353)
('should keep the facial', -1.8380284902942767)
('too old to be', -1.8277826055832151)
('where fall in', -1.7801503736040174)
('any advice and', -1.731995396997816)
('what league im in', -1.7183002936461433)
('the process of losing', -1.7059620174873744)
('months and want', -1.7030400295771904)
('looks or not', -1.6725544172676103)
('what should fix', -1.5864952240397696)
most positive
('my first date', 2.030433813530835)
('stand be honest', 1.973880594709696)
('girlfriend dumped me', 1.9408872817446576)
('wondering if should keep', 1.9191990039870683)
('maybe some advice on', 1.914586809446145)
('always been curious of', 1.9045255544205408)
('think im just', 1.8175592103540836)
('really need to know', 1.7856043687590302)
('but was curious', 1.7610920489493844)
('the outside world', 1.7564948219819096)


In [48]:
from sklearn.metrics import *

print("Accuracy:", accuracy_score(y, clf.predict(X)))
print("AUROC:", roc_auc_score(y, clf.decision_function(X)))

Accuracy: 0.6772505457814306
AUROC: 0.7332552429416533


In [49]:
scores = df[["title", "score"]].dropna(subset=["score"])
X = scores.title
X = X.apply(clean_title)
y = np.log((scores.score - scores.score.mean()) / scores.score.std())
y = y > 0

vectorizer = CountVectorizer(min_df=5, ngram_range=(3, 4))
X = vectorizer.fit_transform(X)
print(X.shape)

  result = getattr(ufunc, method)(*inputs, **kwargs)


(218000, 45072)


In [50]:
from sklearn.svm import LinearSVC

clf = LinearSVC()
clf.fit(X, y)

print("most negative")
print("\n".join([str((vectorizer.get_feature_names()[i], clf.coef_[0][i])) for i in clf.coef_[0].argsort()[:10]]))
print("most positive")
print("\n".join([str((vectorizer.get_feature_names()[i], clf.coef_[0][i])) for i in (-clf.coef_[0]).argsort()[:10]]))

most negative
('feeling like myself', -1.5286725612312713)
('got dumped after years', -1.4218421282327847)
('some photos of', -1.2643778274539603)
('advice would be useful', -1.2041068706703506)
('rate me amp be', -1.1602074659239865)
('of want to know', -1.1374138381797005)
('and wanted some', -1.1247105003846514)
('and curious for', -1.1225102106773828)
('id try this out', -1.121505947484366)
('fluctuating self esteem', -1.11390433123655)
most positive
('ago but have', 1.5348732929438653)
('ive gone from', 1.4022525416416947)
('my makeup and', 1.3945819668096722)
('pretty down about', 1.393117933151576)
('if its harsh', 1.3883864844949974)
('and feel pretty', 1.380859237611832)
('like how look in', 1.3620316556842618)
('im curious also', 1.3605044427934234)
('curly vs straight', 1.336835180748292)
('the time but', 1.3280336104322887)


In [28]:
scores.score.describe()

count    79000.000000
mean         4.913671
std         29.860329
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max       1155.000000
Name: score, dtype: float64

In [19]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

In [5]:
df = pd.read_csv("../data/posts.csv", index_col="id")

In [27]:
df.iloc[1].title

'[20M] What do you think of the Arabic /Russian mix ☺️?'

In [156]:
doc = nlp(clean_title(df.iloc[1].title))

In [158]:
displacy.render(doc)

In [66]:
frags

['do think', 'think', 'think of']

In [95]:

def extract_fragments(text):
    """
    Following the intuition that the
    bulk of this functional information is contained in
    the root of a question’s dependency parse along
    with its outgoing arcs (Iyyer et al., 2014a), we
    take the fragments of a question to be the root of
    its parse tree, along with each (root, child) pair.
    To capture cases when the operational word in
    the question is not connected to its root (such as
    “What...”), we also consider the initial unigram
    and bigram of a question as fragments. The following
    question has 5 fragments:
        what, what is, going→*, is←going and going→do
        
        What is the minister going to do about ... ?
    """
    doc = nlp(clean_title(text))
    if len(doc) == 0:
        return []
    fragments = [str(doc[0]), str(doc[:2])]
    # We take as NPs subtrees connected to the root with the
    # following: nsubj, nsubjpass, dobj, iobj, pobj, attr.

    np_deps = ["nsubj", "nsubjpass", "dobj", "iobj", "pobj", "attr"]
    root = None
    for tok in doc:

        if tok.head.dep_ == "ROOT" and tok.dep_ not in np_deps:
            if tok.dep_ == "ROOT":
                root = tok
                fragments.append(str(tok))
                continue
            if not root:
                fragments.append("{} {}".format(tok, tok.head))
                continue
            else:
                fragments.append("{} {}".format(tok.head, tok))
                continue
    return fragments

In [96]:
fragments = df.title.apply(extract_fragments)

In [97]:
fragments.head()

id
dh3ha2                                [How, Hows, How s, s]
dh3b61           [What, What do, do think, think, think of]
dh37sw    [Kinda, Kinda low, Kinda esteem, self esteem, ...
dh368y    [Got, Got a, Got are, are, are with, are some,...
dh32ch                             [RateMe, RateMe, RateMe]
Name: title, dtype: object

In [98]:
frag_list = []
for frags in fragments.to_numpy():
    frag_list += frags

In [106]:
frag_list = pd.Series(frag_list).str.strip().str.lower()

In [153]:
frag_list[frag_list.apply(lambda x: len(x.split()) >= 2)].value_counts()[:50]

rate me            8430
be honest          3735
just curious       2809
looking for        2688
please rate        1390
rate please        1155
please be          1146
do think           1083
rate and            911
give me             892
think do            889
do look             739
what do             693
honest opinions     640
just wondering      638
let know            626
rate my             603
always been         594
want know           577
just want           555
never had           521
just got            514
be please           511
m curious           492
how look            465
just looking        465
look do             460
curious about       429
curious think       428
am i                424
first time          416
not sure            410
how do              405
rate honestly       396
rate 110            380
i have              374
would like          356
would love          334
lost and            330
been told           330
’m curious          326
know do         