In [22]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib notebook

# Apply seaborn's "darkgrid" look to the figures drawn in this notebook.
sns.set_style(style="darkgrid")

In [16]:
# Load the posts table; the CSV's `id` column becomes the row index.
posts = pd.read_csv(
    filepath_or_buffer="../data/all_posts.csv",
    index_col="id",
)

In [17]:
# Peek at five random rows. A fixed random_state makes the preview
# reproducible, so a Restart & Run All shows the same rows every time.
posts.sample(5, random_state=0)

Unnamed: 0_level_0,author,full_link,num_comments,title,gilded,score,selftext,age,gender,average_rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
8a67r4,__tom_hanks__,https://www.reddit.com/r/Rateme/comments/8a67r...,7.0,M 28.,0.0,1.0,,28.0,M,5.75
5g1w88,[deleted],https://www.reddit.com/r/Rateme/comments/5g1w8...,2.0,(18M) go for it!,0.0,3.0,[deleted],18.0,M,6.0
2p7i55,[deleted],https://www.reddit.com/r/Rateme/comments/2p7i5...,0.0,[M19] Let's try this again.,0.0,1.0,Hi guys.\nThis is the album: http://imgur.com/...,19.0,M,
707tqh,SoNotMarshall,https://www.reddit.com/r/Rateme/comments/707tq...,3.0,20 [M] it's a boring Thursday night and I'm cu...,,1.0,http://imgur.com/a/wfJC5 \n\nI'm 20M in colleg...,20.0,M,8.0
3zk1w9,[deleted],https://www.reddit.com/r/Rateme/comments/3zk1w...,23.0,[20F] Nigerian girl here,0.0,58.0,[deleted],20.0,F,7.594444


In [18]:
# Posts with more than five comments. Rows whose num_comments is missing
# compare False and are therefore left out.
# NOTE(review): this subset is taken before the gender column is upper-cased
# in the next cell, and it is not referenced again in the cells visible here.
gt5 = posts.loc[posts["num_comments"] > 5]

In [19]:
# Normalise the gender labels to upper case so the later lookups and
# group-bys on "M" / "F" match regardless of the original casing.
posts["gender"] = posts["gender"].str.upper()

# Age Distribution

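The age distribution looks very much exponential: on a log-scaled histogram the counts decrease roughly linearly from about age 20 onward. (Note: the plot below only keeps ages up to 50, so the 20–60 range originally cited here is not fully visible.)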

In [25]:
# Keep only posts where both age and gender are known, and cap the age at 50.
age_gender_posts = (
    posts
    .dropna(subset=["age", "gender"])
    .loc[lambda frame: frame["age"] <= 50]
)

# Histogram of ages with a log-scaled count axis.
age_gender_posts.age.plot.hist(log=True, bins=30, title="Age Distribution (Log Scale)")


<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1260029d0>

In [26]:
# One point per post: poster age against the post's average rating.
age_gender_posts.plot(kind="scatter", x="age", y="average_rating")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x12886fa10>

In [27]:
# Median of the per-post average rating for each age (one point per age).
# The group-by is evaluated once, so the ages and their medians come from the
# same result and cannot get out of step.
median_rating_by_age = (
    age_gender_posts
    .groupby("age")["average_rating"]
    .median()
    .reset_index()
)

# Draw on an explicitly created figure instead of relying on the implicit
# current axes, and pass x / y by keyword: recent seaborn releases no longer
# accept them as positional arguments.
fig, ax = plt.subplots()
sns.scatterplot(data=median_rating_by_age, x="age", y="average_rating", ax=ax)
ax.set_title("Average Rating by Age")

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Average Rating by Age')

# Rating Distribution

In [28]:
# Only posts that actually have an average rating.
rating_posts = posts.loc[posts["average_rating"].notna()]

In [29]:
# Histogram of the average rating across all rated posts.
rating_posts["average_rating"].plot(kind="hist", title="Rating Distribution")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x12889ed10>

In [30]:
# Stacked histogram of average ratings, with the "M" and "F" groups layered
# on top of each other in a single set of axes.
genders = rating_posts.groupby("gender")
fig, ax = plt.subplots()
ax.hist(
    [genders.get_group(label)["average_rating"] for label in ("M", "F")],
    bins=20,
    stacked=True,
    label=["M", "F"],
)
ax.legend()
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Frequency')

In [31]:
# Point colour per gender label; any other value (including a missing
# gender) falls back to black.
colors = dict(M="blue", F="yellow")

# Average rating against post score, coloured by gender.
rating_posts.plot.scatter(
    x="average_rating",
    y="score",
    alpha=0.4,
    color=[colors.get(label, "black") for label in rating_posts.gender],
)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x129dd0710>

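The arc of high-scoring / low-rating posts corresponds to progress posts. (This sentence was cut off after "progress p" in the original; it presumably referred to progress pictures/posts, such as the weight-loss post inspected at the end of the notebook.)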

In [32]:
# Restrict the rated posts to ages of at most 50. Rows with a missing age
# compare False and are dropped as well.
# NOTE: this rebinds `rating_posts`, so every later cell sees the age-capped
# frame; re-running an earlier cell after this one gives different results.
rating_posts = rating_posts.loc[rating_posts["age"] <= 50]

# Poster age against post score, coloured by gender (black when the gender
# is not one of the keys in `colors`).
rating_posts.plot.scatter(
    x="age",
    y="score",
    alpha=0.4,
    color=[colors.get(label, "black") for label in rating_posts.gender],
)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x125d48890>

Roughly, higher-scoring males tend to be older than higher-scoring females.

In [33]:
# One rating-vs-score scatter plot per gender group.
rating_posts.groupby("gender").plot(kind="scatter", x="average_rating", y="score")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

gender
F    AxesSubplot(0.125,0.11;0.775x0.77)
M    AxesSubplot(0.125,0.11;0.775x0.77)
dtype: object

In [34]:
# Summary statistics of the average rating, one row per gender.
rating_posts.groupby("gender")["average_rating"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,26886.0,6.455179,1.231987,0.0,5.891304,6.666667,7.27128,10.0
M,61360.0,6.109193,1.594242,0.0,5.2,6.4,7.25,9.99


In [35]:
# Summary statistics of the post score, one row per gender.
rating_posts.groupby("gender")["score"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,26886.0,27.828684,56.91716,0.0,2.0,9.0,29.0,1155.0
M,61360.0,3.568008,16.697968,0.0,1.0,1.0,3.0,1786.0


In [38]:
# Start a fresh figure so both gender histograms share one new set of axes.
plt.figure()

# Log-scaled histogram of scores above 1, one series per gender.
(
    rating_posts
    .loc[rating_posts["score"] > 1]
    .groupby("gender")["score"]
    .plot.hist(log=True, legend=True, bins=30)
)

<IPython.core.display.Javascript object>

gender
F    AxesSubplot(0.125,0.11;0.775x0.77)
M    AxesSubplot(0.125,0.11;0.775x0.77)
Name: score, dtype: object

In [118]:
import webbrowser

# Open the link of one hand-picked post in the default web browser.
# NOTE(review): 42604 is an integer row label, but `posts` is loaded with
# index_col="id" (string ids), so this lookup presumably only worked against
# an earlier, integer-indexed load of the data. Confirm before re-running.
webbrowser.open(rating_posts.loc[42604, "full_link"])

True

In [111]:
# Load the comments table. low_memory=False makes pandas parse the file in a
# single pass, so every column gets one consistently inferred dtype instead of
# per-chunk guesses.
# NOTE(review): the warning this cell used to emit looks like pandas'
# mixed-type DtypeWarning; confirm, and consider passing an explicit `dtype=`
# for the affected columns once they are known.
comments = pd.read_csv("../data/comments.csv", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [115]:
# Text of one hand-picked comment. `comments` is read without an index
# column, so 159440 is a label in the default integer index.
comments.loc[159440, "body"]

'Speaking objectively: 9.5/10 :) \n\nSpeaking as a fan of nose rings: 10/10 :)'

In [147]:
# First post that combines a low average rating (< 5) with a very high
# score (> 600). `.iloc[0]` raises IndexError when no post matches.
posts.loc[posts["average_rating"].lt(5) & posts["score"].gt(600)].iloc[0]

id                                                                  bxu93p
author                                                             HY4ever
full_link                https://www.reddit.com/r/Rateme/comments/bxu93...
num_comments                                                           125
title                    Im a 25 year old 4' 10" mom who recently lost ...
gilded                                                                 NaN
score                                                                  734
selftext                                                               NaN
total_awards_received                                                    0
age                                                                    NaN
gender                                                                 NaN
average_rating                                                     4.30165
Name: 18131, dtype: object