In [1]:
import pandas as pd
import numpy as np

# Useful function for making training and testing sets
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [2]:
all_comments_labeled = pd.read_csv("comments_labeled.csv", sep='\t')

In [3]:
all_comments_labeled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267945 entries, 0 to 267944
Data columns (total 19 columns):
label                 267945 non-null object
sub_title             267945 non-null object
sub_domain            267945 non-null object
sub_score             267945 non-null int64
sub_ratio             267945 non-null float64
sub_total_comments    267945 non-null int64
comment_id            267945 non-null object
submission_id         267945 non-null object
user_id               267945 non-null object
user_flair            267945 non-null object
upvotes               267945 non-null int64
controversiality      267945 non-null int64
gold                  267945 non-null int64
depth                 267945 non-null int64
parent_id             267945 non-null object
created               267945 non-null object
edited                267945 non-null int64
body                  267932 non-null object
deleted               267945 non-null int64
dtypes: float64(1), int64(8), object(10)


In [5]:
comments = all_comments_labeled.copy()
comments = comments.drop(comments[comments['body'].isnull()].index)

In [6]:
comments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 267932 entries, 0 to 267944
Data columns (total 19 columns):
label                 267932 non-null object
sub_title             267932 non-null object
sub_domain            267932 non-null object
sub_score             267932 non-null int64
sub_ratio             267932 non-null float64
sub_total_comments    267932 non-null int64
comment_id            267932 non-null object
submission_id         267932 non-null object
user_id               267932 non-null object
user_flair            267932 non-null object
upvotes               267932 non-null int64
controversiality      267932 non-null int64
gold                  267932 non-null int64
depth                 267932 non-null int64
parent_id             267932 non-null object
created               267932 non-null object
edited                267932 non-null int64
body                  267932 non-null object
deleted               267932 non-null int64
dtypes: float64(1), int64(8), object(10)


In [8]:
# Named all the attributes. The only one not here is the label
# 'upvotes' is the current label, should be in num_attribs
num_attribs = ['sub_score','sub_ratio','sub_total_comments']
cat_attribs = ['sub_domain','user_flair','gold','depth','created']
bool_attribs = ['edited','deleted','controversiality']
id_attribs = ['submission_id','comment_id','user_id','parent_id']
str_attribs = ['sub_title','body']

In [12]:
# Drop the columns we won't use for regular predicition
def drop_non_data_col(ac):
    ac = ac.drop(id_attribs, axis=1)
    ac = ac.drop(str_attribs, axis=1)
    return ac

ac_data = drop_non_data_col(comments)
ac_data.head()

Unnamed: 0,label,sub_domain,sub_score,sub_ratio,sub_total_comments,user_flair,upvotes,controversiality,gold,depth,created,edited,deleted
0,News,globalnews.ca,68,0.9,108,,1,0,0,0,2017-07-17,0,0
1,News,globalnews.ca,68,0.9,108,,1,0,0,0,2017-07-17,0,0
2,News,globalnews.ca,68,0.9,108,,1,0,0,0,2017-07-17,0,0
3,News,globalnews.ca,68,0.9,108,,1,0,0,0,2017-07-17,0,0
4,News,globalnews.ca,68,0.9,108,,1,0,0,0,2017-07-17,0,0


In [13]:
ac_train, ac_test = split_train_test(ac_data,0.2)

In [14]:
ac_tr = ac_train.drop('upvotes',axis=1)
ac_tr_label = ac_train['upvotes'].copy()

In [15]:
ac_tr['edited'].value_counts()

0    203114
1     11232
Name: edited, dtype: int64

In [16]:
ac_tr['edited'].value_counts()

0    203114
1     11232
Name: edited, dtype: int64

In [17]:
ac_tr['controversiality'].value_counts()

0    197027
1     17319
Name: controversiality, dtype: int64

In [18]:
ac_tr['deleted'].value_counts()

0    206068
1      8278
Name: deleted, dtype: int64

The boolean values all seem to be worthwhile using.

In [20]:
for x in cat_attribs:
    print('-------------')
    print(ac_tr[x].value_counts())

-------------
cbc.ca                         45338
self.canada                    21459
theglobeandmail.com            14819
i.redd.it                      11147
globalnews.ca                   9290
thestar.com                     8366
ctvnews.ca                      6550
nationalpost.com                5666
imgur.com                       5372
i.imgur.com                     4660
youtube.com                     4571
macleans.ca                     4231
news.nationalpost.com           3995
twitter.com                     3927
huffingtonpost.ca               3256
thebeaverton.com                2968
business.financialpost.com      2934
vancouversun.com                2481
pbs.twimg.com                   2417
ottawacitizen.com               2339
ipolitics.ca                    1988
bloomberg.com                   1520
montrealgazette.com             1448
beta.theglobeandmail.com        1382
angusreid.org                   1360
nationalobserver.com            1299
youtu.be                

Gold seems to be useless, may want to drop it. Massive outliers.

sub_domain is very wild: large outliers that may need to be changed.

user_flair has one large outlier, but the rest are pretty close (this is correlated to the population of canadians)

depth is fine. Interesting that there are more replies then posted comments.

## Format the data

In [21]:
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values

In [22]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline

domain_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs[0])),
        ('label_binarizer', LabelBinarizer())
    ])
user_flair_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs[1])),
        ('label_binarizer', LabelBinarizer())
    ])
gold_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs[2])),
        ('label_binarizer', LabelBinarizer())
    ])
depth_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs[3])),
        ('label_binarizer', LabelBinarizer())
    ])

In [23]:
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler())
    ])

In [24]:
bool_pipeline = Pipeline([
        ('selector', DataFrameSelector(bool_attribs))
])

In [25]:
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
        ("bool_pipepline",bool_pipeline),
        ("num_pipepline",num_pipeline),
        ("domain_pipeline", domain_pipeline),
        ("user_flair_pipeline", user_flair_pipeline),
        ("gold_pipeline", gold_pipeline),
        ("depth_pipeline", depth_pipeline),
    ])

In [26]:
ac_tr_prepared = full_pipeline.fit_transform(ac_train)

In [27]:
ac_tr_prepared.shape

(214346, 258)

In [28]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(ac_tr_prepared, ac_tr_label)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [24]:
some_data = ac_tr.iloc[:5]
some_labels = ac_tr_label.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("predictions:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))

predictions:	 [  1.95736694   2.89160156   6.73477173   3.56982422  19.75454712]
Labels:		 [6, 15, 5, 18, 3]


In [25]:
from sklearn.metrics import mean_squared_error

ac_predictions = lin_reg.predict(ac_tr_prepared)
lin_mse = mean_squared_error(ac_tr_label, ac_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

43.463991464096416

In [26]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(ac_tr_prepared, ac_tr_label)

ac_predictions = tree_reg.predict(ac_tr_prepared)
tree_mse = mean_squared_error(ac_tr_label, ac_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

36.435088431477709

In [29]:
some_data = ac_tr.iloc[:10]
some_labels = ac_tr_label.iloc[:10]
some_data_prepared = full_pipeline.transform(some_data)
print("predictions:\t", tree_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))

predictions:	 [  2.6         15.           3.5          9.33333333   2.5         10.14285714
   5.61538462   0.25        17.83333333 -16.        ]
Labels:		 [6, 15, 5, 18, 3, 17, 2, 1, 43, -16]


## Search for parameters

In [28]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

fores_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)

NameError: name 'RandomForestRegressor' is not defined