In [2]:
import pandas as pd
import numpy as np

# Useful function for making training and testing sets
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [3]:
all_comments_labeled = pd.read_csv("comments_labeled.csv", sep='\t')
# all_comments = pd.read_csv("datasets/reddit/comments.csv", sep='\t')
# all_subs = pd.read_csv("datasets/reddit/submissions.csv", sep='\t')

In [4]:
# I change this before processsing because it's too hard to do in numpy later.
# Really, these things should be figured out in the extraction step anyway.
all_comments_labeled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250112 entries, 0 to 250111
Data columns (total 19 columns):
label                 250112 non-null object
sub_title             250112 non-null object
sub_domain            250112 non-null object
sub_score             250112 non-null int64
sub_ratio             250112 non-null float64
sub_total_comments    250112 non-null int64
comment_id            250112 non-null object
submission_id         250112 non-null object
user_id               250112 non-null object
user_flair            250112 non-null object
upvotes               250112 non-null int64
controversiality      250112 non-null int64
gold                  250112 non-null int64
depth                 250112 non-null int64
parent_id             250112 non-null object
created               250112 non-null object
edited                13267 non-null object
body                  250099 non-null object
deleted               250112 non-null int64
dtypes: float64(1), int64(7), object(11)


In [5]:
comments = all_comments_labeled.copy()
comments['edited'] = pd.notnull(comments['edited']).map({True:1,False:0}).astype(int)
comments = comments.drop(comments[comments['body'].isnull()].index)

In [6]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250099 entries, 0 to 250111
Data columns (total 19 columns):
label                 250099 non-null object
sub_title             250099 non-null object
sub_domain            250099 non-null object
sub_score             250099 non-null int64
sub_ratio             250099 non-null float64
sub_total_comments    250099 non-null int64
comment_id            250099 non-null object
submission_id         250099 non-null object
user_id               250099 non-null object
user_flair            250099 non-null object
upvotes               250099 non-null int64
controversiality      250099 non-null int64
gold                  250099 non-null int64
depth                 250099 non-null int64
parent_id             250099 non-null object
created               250099 non-null object
edited                250099 non-null int32
body                  250099 non-null object
deleted               250099 non-null int64
dtypes: float64(1), int32(1), int64(7), o

In [7]:
# Named all the attributes. The only one not here is the label
# 'upvotes' is the current label, should be in num_attribs
# 'created' is not included
num_attribs = ['sub_score','sub_ratio','sub_total_comments']
cat_attribs = ['sub_domain','user_flair','gold','depth']
bool_attribs = ['edited','deleted','controversiality']
id_attribs = ['submission_id','comment_id','user_id','parent_id']
str_attribs = ['sub_title','body']

In [8]:
# Drop the columns we won't use for regular predicition
def drop_non_data_col(ac):
    ac = ac.drop(id_attribs, axis=1)
    ac = ac.drop(str_attribs, axis=1)
    ac = ac.drop(['created'], axis=1)
    return ac

ac_data = drop_non_data_col(comments)
ac_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250099 entries, 0 to 250111
Data columns (total 12 columns):
label                 250099 non-null object
sub_domain            250099 non-null object
sub_score             250099 non-null int64
sub_ratio             250099 non-null float64
sub_total_comments    250099 non-null int64
user_flair            250099 non-null object
upvotes               250099 non-null int64
controversiality      250099 non-null int64
gold                  250099 non-null int64
depth                 250099 non-null int64
edited                250099 non-null int32
deleted               250099 non-null int64
dtypes: float64(1), int32(1), int64(7), object(3)
memory usage: 23.9+ MB


In [9]:
ac_train, ac_test = split_train_test(ac_data,0.2)

In [10]:
ac_tr = ac_train.drop('upvotes',axis=1)
ac_tr_label = ac_train['upvotes'].copy()

## Split data after this

Don't look at test data at all once all the data has been imported and tested properly

In [11]:
ac_tr['edited'].value_counts()

0    189435
1     10645
Name: edited, dtype: int64

In [12]:
ac_tr['edited'].value_counts()

0    189435
1     10645
Name: edited, dtype: int64

In [13]:
ac_tr['controversiality'].value_counts()

0    183924
1     16156
Name: controversiality, dtype: int64

In [14]:
ac_tr['deleted'].value_counts()

0    192086
1      7994
Name: deleted, dtype: int64

The boolean values all seem to be worthwhile using.

In [15]:
for x in cat_attribs:
    print('-------------')
    print(ac_tr[x].value_counts())

-------------
cbc.ca                         42076
self.canada                    20567
theglobeandmail.com            14608
i.redd.it                       9433
globalnews.ca                   8724
thestar.com                     7869
ctvnews.ca                      5703
imgur.com                       4923
nationalpost.com                4554
youtube.com                     4532
macleans.ca                     4162
news.nationalpost.com           3987
i.imgur.com                     3980
twitter.com                     3918
huffingtonpost.ca               3241
thebeaverton.com                2883
business.financialpost.com      2843
ottawacitizen.com               2327
pbs.twimg.com                   2324
vancouversun.com                2251
ipolitics.ca                    1775
bloomberg.com                   1549
beta.theglobeandmail.com        1364
montrealgazette.com             1349
youtu.be                        1269
angusreid.org                   1253
pm.gc.ca                

Gold seems to be useless, may want to drop it. Massive outliers.

sub_domain is very wild: large outliers that may need to be changed.

user_flair has one large outlier, but the rest are pretty close (this is correlated to the population of canadians)

depth is fine. Interesting that there are more replies then posted comments.

## Format the data

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values

In [17]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline

domain_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs[0])),
        ('label_binarizer', LabelBinarizer())
    ])
user_flair_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs[1])),
        ('label_binarizer', LabelBinarizer())
    ])
gold_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs[2])),
        ('label_binarizer', LabelBinarizer())
    ])
depth_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs[3])),
        ('label_binarizer', LabelBinarizer())
    ])

In [18]:
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler())
    ])

In [19]:
bool_pipeline = Pipeline([
        ('selector', DataFrameSelector(bool_attribs))
])

In [20]:
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
        ("bool_pipepline",bool_pipeline),
        ("num_pipepline",num_pipeline),
        ("domain_pipeline", domain_pipeline),
        ("user_flair_pipeline", user_flair_pipeline),
        ("gold_pipeline", gold_pipeline),
        ("depth_pipeline", depth_pipeline),
    ])

In [21]:
ac_tr_prepared = full_pipeline.fit_transform(ac_train)
ac_tr_prepared

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.]])

In [22]:
ac_tr_prepared.shape

(200080, 249)

In [23]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(ac_tr_prepared, ac_tr_label)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [24]:
some_data = ac_tr.iloc[:5]
some_labels = ac_tr_label.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("predictions:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))

predictions:	 [  1.95736694   2.89160156   6.73477173   3.56982422  19.75454712]
Labels:		 [6, 15, 5, 18, 3]


In [25]:
from sklearn.metrics import mean_squared_error

ac_predictions = lin_reg.predict(ac_tr_prepared)
lin_mse = mean_squared_error(ac_tr_label, ac_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

43.463991464096416

In [26]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(ac_tr_prepared, ac_tr_label)

ac_predictions = tree_reg.predict(ac_tr_prepared)
tree_mse = mean_squared_error(ac_tr_label, ac_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

36.435088431477709

In [29]:
some_data = ac_tr.iloc[:10]
some_labels = ac_tr_label.iloc[:10]
some_data_prepared = full_pipeline.transform(some_data)
print("predictions:\t", tree_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))

predictions:	 [  2.6         15.           3.5          9.33333333   2.5         10.14285714
   5.61538462   0.25        17.83333333 -16.        ]
Labels:		 [6, 15, 5, 18, 3, 17, 2, 1, 43, -16]


## Search for parameters

In [28]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

fores_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)

NameError: name 'RandomForestRegressor' is not defined