In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import time
import math

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Read the raw CSV and discard every row that contains a missing value.
data_path = '../raw_data/100k_data.csv'
df_all = pd.read_csv(data_path).dropna()
# Fixed-seed 10,000-row subsample. df_all has no missing values left at this
# point, so it can be sampled directly.
df_sample = df_all.sample(n=10000, random_state=0)

In [3]:
def binary_cat_upvotes(original_df, threshold=30):
    """
    Return a copy of ``original_df`` with a binary 'cat_upvotes' column added.

    'cat_upvotes' is 1 where 'upvotes' is greater than or equal to
    ``threshold`` and 0 otherwise. The input frame is not modified.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Frame that must contain an 'upvotes' column.
    threshold : number, default 30
        Inclusive cut-off: a row whose upvotes equal the threshold gets 1.

    Returns
    -------
    pandas.DataFrame
        Copy of ``original_df`` with the extra 'cat_upvotes' column.

    Raises
    ------
    ValueError
        If ``original_df`` has no 'upvotes' column.
    """
    # Validate first so a bad input fails before the frame is duplicated.
    if 'upvotes' not in original_df.columns:
        raise ValueError("df has no column named 'upvotes'")
    df = original_df.copy()
    # Vectorised comparison instead of a per-row Python callback; a float NaN
    # compares False and is therefore labelled 0.
    df['cat_upvotes'] = (df['upvotes'] >= threshold).astype('int64')
    return df

In [6]:
# Keep only the 'title' and 'upvotes' columns of the cleaned frame.
df_new = df_all.loc[:, ['title', 'upvotes']]
df_new

Unnamed: 0,title,upvotes
0,i'm going to walmart,1
1,After suffering from 2 cherry eye look at how ...,16
2,Pets please,31
3,Enjoying some Ice Cream :-),13
4,What do you guys think of my grandma’s dog?,1
...,...,...
99260,It's so green here!,1019
99262,My vicious beast trying to catch his toy.,12
99263,"Kiley, 14 years old and more regal than ever.",139
99264,My boys Kylo (left) and Peanut (right) love St...,30


In [7]:
# Add the binary 'cat_upvotes' label using the function's default threshold.
df_new = df_new.pipe(binary_cat_upvotes)
df_new

Unnamed: 0,title,upvotes,cat_upvotes
0,i'm going to walmart,1,0
1,After suffering from 2 cherry eye look at how ...,16,0
2,Pets please,31,1
3,Enjoying some Ice Cream :-),13,0
4,What do you guys think of my grandma’s dog?,1,0
...,...,...,...
99260,It's so green here!,1019,1
99262,My vicious beast trying to catch his toy.,12,0
99263,"Kiley, 14 years old and more regal than ever.",139,1
99264,My boys Kylo (left) and Peanut (right) love St...,30,1


In [8]:
# The raw count is no longer needed once the binary label exists.
df_new = df_new.drop('upvotes', axis=1)
df_new

Unnamed: 0,title,cat_upvotes
0,i'm going to walmart,0
1,After suffering from 2 cherry eye look at how ...,0
2,Pets please,1
3,Enjoying some Ice Cream :-),0
4,What do you guys think of my grandma’s dog?,0
...,...,...
99260,It's so green here!,1
99262,My vicious beast trying to catch his toy.,0
99263,"Kiley, 14 years old and more regal than ever.",1
99264,My boys Kylo (left) and Peanut (right) love St...,1


In [9]:
def title_len(string):
    """Return the length of ``string`` as reported by ``len``."""
    n_chars = len(string)
    return n_chars

# Store the length of each title in a new 'title_len' column.
df_new['title_len'] = df_new['title'].map(title_len)
df_new

Unnamed: 0,title,cat_upvotes,title_len
0,i'm going to walmart,0,20
1,After suffering from 2 cherry eye look at how ...,0,74
2,Pets please,1,11
3,Enjoying some Ice Cream :-),0,27
4,What do you guys think of my grandma’s dog?,0,43
...,...,...,...
99260,It's so green here!,1,19
99262,My vicious beast trying to catch his toy.,0,41
99263,"Kiley, 14 years old and more regal than ever.",1,45
99264,My boys Kylo (left) and Peanut (right) love St...,1,56


In [12]:
# Show every row of the summary instead of pandas' truncated preview.
# NOTE: this changes the display option for the rest of the session.
pd.set_option('display.max_rows', None)
# Mean of 'cat_upvotes' for each title length. The column is selected
# explicitly because the first positional argument of GroupBy.mean is
# `numeric_only`, not a column name; .mean('cat_upvotes') only worked
# because a non-empty string is truthy. The double brackets keep the
# result a one-column DataFrame.
df_new.groupby('title_len')[['cat_upvotes']].mean()

Unnamed: 0_level_0,cat_upvotes
title_len,Unnamed: 1_level_1
1,0.014706
2,0.022727
3,0.093264
4,0.219451
5,0.231707
6,0.224335
7,0.216599
8,0.224204
9,0.216783
10,0.266289
