In [127]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import json
%matplotlib inline
from sklearn.linear_model import LogisticRegression

# Widen pandas' notebook rendering: show up to 50 columns and up to 300
# characters per cell so long titles are not truncated in displayed frames.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 300)

## Predicting "Greenness" Of Content

This dataset comes from [StumbleUpon](https://www.stumbleupon.com/), a web page recommender, and was made available [here](https://www.kaggle.com/c/stumbleupon/download/train.tsv).

A description of the columns is below

FieldName|Type|Description
---------|----|-----------
url|string|Url of the webpage to be classified
urlid|integer| StumbleUpon's unique identifier for each url
boilerplate|json|Boilerplate text
alchemy_category|string|Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score|double|Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize| double|Average number of words in each link
commonLinkRatio_1|double|# of links sharing at least 1 word with 1 other link / # of links
commonLinkRatio_2|double|# of links sharing at least 1 word with 2 other links / # of links
commonLinkRatio_3|double|# of links sharing at least 1 word with 3 other links / # of links
commonLinkRatio_4|double|# of links sharing at least 1 word with 4 other links / # of links
compression_ratio|double|Compression achieved on this page via gzip (measure of redundancy)
embed_ratio|double|Count of number of <embed> usage
frameBased|integer (0 or 1)|A page is frame-based (1) if it has no body markup but has a frameset markup
frameTagRatio|double|Ratio of iframe markups over total number of markups
hasDomainLink|integer (0 or 1)|True (1) if it contains an <a> with an url with domain
html_ratio|double|Ratio of tags vs text in the page
image_ratio|double|Ratio of <img> tags vs text in the page
is_news|integer (0 or 1) | True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain| integer (0 or 1)|True (1) if at least 3 <a> 's text contains more than 30 alphanumeric characters
linkwordscore|double|Percentage of words on the page that are in hyperlink's text
news_front_page| integer (0 or 1)|True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters|integer| Page's text's number of alphanumeric characters
numberOfLinks|integer|Number of <a> markups
numwords_in_url| double|Number of words in url
parametrizedLinkRatio|double|A link is parametrized if its url contains parameters or has an attached onClick event
spelling_errors_ratio|double|Ratio of words not found in wiki (considered to be a spelling mistake)
label|integer (0 or 1)|User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only

### What are 'evergreen' sites?
- These are websites that are always relevant, like recipes or reviews (as opposed to current events)
- Look at some examples

In [128]:
# Load the tab-separated training set; '?' entries are read as missing (NaN).
data = pd.read_csv('../../assets/datasets/train.tsv', sep='\t', na_values='?')

# Extract the title and body from the boilerplate JSON text.
# Each JSON string is decoded once and the parsed dict is reused for both
# fields, instead of running json.loads twice per row.
boilerplate = data.boilerplate.map(json.loads)
data['title'] = boilerplate.map(lambda doc: doc.get('title', ''))
data['body'] = boilerplate.map(lambda doc: doc.get('body', ''))

In [129]:
# Peek at the first rows of the parsed titles next to their label.
preview_columns = ['title', 'label']
data[preview_columns].head()

Unnamed: 0,title,label
0,"IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries",0
1,"The Fully Electronic Futuristic Starting Gun That Eliminates Advantages in Races the fully electronic, futuristic starting gun that eliminates advantages in races the fully electronic, futuristic starting gun that eliminates advantages in races",1
2,Fruits that Fight the Flu fruits that fight the flu | cold & flu | men's health,1
3,10 Foolproof Tips for Better Sleep,1
4,The 50 Coolest Jerseys You Didn t Know Existed coolest jerseys you haven't seen,0


#### Build a feature matrix where X contains useful features and y contains the target variable `label`

In [142]:
import os
# Debugging aid: show the kernel's working directory, which is what the
# relative '../../assets/...' path given to pd.read_csv is resolved against.
os.getcwd()

'/Users/ugp/DSI-CHI-1/lessons/week-04/3.4-model-fit-and-sklearn-logistic/code/starter-code'

In [143]:
# Summary statistics for the numeric columns. A count lower than the other
# columns' marks a column with missing values ('?' was read as NaN on load).
data.describe()

Unnamed: 0,urlid,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
count,7395.0,5053.0,7395.0,7395.0,7395.0,7395.0,7395.0,7395.0,7395.0,7395.0,7395.0,7395.0,7395.0,7395.0,4552.0,7395.0,7395.0,6147.0,7395.0,7395.0,7395.0,7395.0,7395.0,7395.0
mean,5305.704665,0.603334,2.761823,0.46823,0.21408,0.092062,0.049262,2.255103,-0.10375,0.0,0.056423,0.021231,0.233778,0.275709,1.0,0.660311,30.077079,0.047828,5716.598242,178.754564,4.960649,0.172864,0.101221,0.51332
std,3048.384114,0.212864,8.619793,0.203133,0.146743,0.095978,0.072629,5.704313,0.306545,0.0,0.041446,0.144162,0.052487,1.91932,0.0,0.473636,20.393101,0.21342,8875.43243,179.466198,3.233111,0.183286,0.079231,0.499856
min,1.0,0.070833,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.045564,-1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,2688.5,,1.602062,0.34037,0.105263,0.022222,0.0,0.442616,0.0,0.0,0.028502,0.0,0.201061,0.0259,,0.0,14.0,,1579.0,82.0,3.0,0.040984,0.068739,0.0
50%,5304.0,,2.088235,0.481481,0.202454,0.068627,0.022222,0.48368,0.0,0.0,0.045775,0.0,0.230564,0.083051,,1.0,25.0,,3500.0,139.0,5.0,0.113402,0.089312,1.0
75%,7946.5,,2.627451,0.616604,0.3,0.133333,0.065065,0.578227,0.0,0.0,0.073459,0.0,0.26077,0.2367,,1.0,43.0,,6377.0,222.0,7.0,0.241299,0.112376,1.0
max,10566.0,0.999426,363.0,1.0,1.0,0.980392,0.980392,21.0,0.25,0.0,0.444444,1.0,0.716883,113.333333,1.0,1.0,100.0,1.0,207952.0,4997.0,22.0,1.0,1.0,1.0


In [144]:
# Keep only the rows that hold a finite value in each of these columns,
# filtering one column at a time in the listed order.
for column in ['news_front_page', 'is_news', 'alchemy_category_score']:
    data = data[np.isfinite(data[column])]

# Re-check the summary: every column should now report the same count.
data.describe()

Unnamed: 0,urlid,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
count,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0,3566.0
mean,5373.242569,0.608843,2.36182,0.480185,0.222428,0.097553,0.054697,1.236298,-0.056567,0.0,0.058061,0.019349,0.232715,0.234122,1.0,0.720415,29.998598,0.055805,5870.556366,185.278744,5.323331,0.16446,0.096689,0.517667
std,3074.384342,0.212684,3.502241,0.184105,0.138361,0.093458,0.073537,3.803542,0.232502,0.0,0.039592,0.137769,0.043971,1.169576,0.0,0.448858,19.351194,0.229577,7497.752539,154.052199,3.241569,0.162414,0.053385,0.499758
min,5.0,0.075,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.075224,-1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,2696.25,0.460736,1.640449,0.363636,0.12191,0.029126,0.0,0.439223,0.0,0.0,0.031682,0.0,0.204238,0.033561,1.0,0.0,15.0,0.0,1821.0,93.0,3.0,0.045918,0.071806,0.0
50%,5442.5,0.631032,2.1,0.491427,0.21413,0.077754,0.028856,0.475962,0.0,0.0,0.047908,0.0,0.23065,0.085928,1.0,1.0,26.0,0.0,3834.5,147.0,5.0,0.117483,0.089765,1.0
75%,8046.75,0.787194,2.646896,0.612397,0.301518,0.138889,0.07517,0.529072,0.0,0.0,0.074576,0.0,0.2584,0.222761,1.0,1.0,43.0,0.0,6798.5,229.0,7.0,0.22755,0.109997,1.0
max,10566.0,0.999426,161.538462,1.0,0.982906,0.948718,0.948718,21.0,0.214286,0.0,0.421053,1.0,0.506146,52.0,1.0,1.0,100.0,1.0,87948.0,3283.0,21.0,1.0,1.0,1.0


In [145]:
# Create the classifier with scikit-learn's default settings (nothing is
# overridden here); it is fitted in a later cell.
logit = LogisticRegression()

In [146]:
# Numeric predictor columns handed to the model, in column order.
feature_columns = [
    'alchemy_category_score',
    'avglinksize',
    'commonlinkratio_1',
    'commonlinkratio_2',
    'commonlinkratio_3',
    'commonlinkratio_4',
    'compression_ratio',
    'embed_ratio',
    'framebased',
    'frameTagRatio',
    'hasDomainLink',
    'html_ratio',
    'image_ratio',
    'is_news',
    'lengthyLinkDomain',
    'linkwordscore',
    'news_front_page',
    'non_markup_alphanum_characters',
    'numberOfLinks',
    'numwords_in_url',
    'parametrizedLinkRatio',
    'spelling_errors_ratio',
]

# Target (evergreen label) and feature matrix taken from the filtered frame.
y = data['label']
x = data[feature_columns]

In [147]:
# Display the feature matrix to eyeball the selected columns.
x

Unnamed: 0,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio
0,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,0.443783,0.000000,0,0.090774,0,0.245831,0.003883,1.0,1,24,0.0,5424,170,8,0.152941,0.079130
1,0.574147,3.677966,0.508021,0.288770,0.213904,0.144385,0.468649,0.000000,0,0.098707,0,0.203490,0.088652,1.0,1,40,0.0,4973,187,9,0.181818,0.125448
2,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,0.525448,0.000000,0,0.072448,0,0.226402,0.120536,1.0,1,55,0.0,2240,258,11,0.166667,0.057613
3,0.801248,1.543103,0.400000,0.100000,0.016667,0.000000,0.480725,0.000000,0,0.095861,0,0.265656,0.035343,1.0,0,24,0.0,2737,120,5,0.041667,0.100858
4,0.719157,2.676471,0.500000,0.222222,0.123457,0.043210,0.446143,0.000000,0,0.024908,0,0.228887,0.050473,1.0,1,14,0.0,12032,162,10,0.098765,0.082569
6,0.221110,0.773810,0.215054,0.053763,0.043011,0.043011,0.579596,0.000000,0,0.039568,0,0.218978,0.311377,1.0,0,21,0.0,1287,93,3,0.548387,0.064327
10,0.816604,2.506527,0.637755,0.293367,0.091837,0.048469,0.592322,0.000000,0,0.056497,0,0.223004,0.511364,1.0,1,53,0.0,4401,392,0,0.160714,0.073684
12,0.872323,3.056911,0.595588,0.227941,0.044118,0.014706,0.573109,0.000000,0,0.074576,0,0.236281,0.084112,1.0,1,64,0.0,1062,136,9,0.169118,0.180328
16,0.598149,0.929825,0.068966,0.000000,0.000000,0.000000,0.478355,0.000000,0,0.157576,0,0.310009,0.276316,1.0,0,50,0.0,268,58,2,0.137931,0.107143
17,0.772920,2.083333,0.421053,0.178947,0.042105,0.000000,0.462995,0.000000,0,0.099778,0,0.232048,0.275862,1.0,0,49,0.0,852,95,8,0.305263,0.139738


In [148]:
# Display the target labels; the index has gaps because rows were filtered
# out of `data` before `y` was taken from it.
y

0       0
1       1
2       1
3       1
4       0
6       1
10      0
12      1
16      0
17      0
18      1
19      1
20      0
25      0
26      0
31      1
34      0
37      0
41      1
43      1
45      0
50      0
51      1
53      1
54      1
56      1
58      0
59      1
61      0
62      1
       ..
7329    0
7331    0
7340    1
7341    0
7342    1
7346    0
7347    1
7348    0
7349    1
7351    0
7352    0
7354    1
7355    1
7359    0
7360    1
7362    0
7364    0
7372    0
7373    0
7374    0
7375    0
7377    0
7379    1
7382    1
7383    1
7387    1
7388    0
7390    0
7391    0
7393    1
Name: label, dtype: int64

In [154]:
# Fit the classifier and report its accuracy.
# NOTE: the score is computed on the same rows used for fitting, so it is an
# in-sample figure rather than an estimate of performance on unseen pages.
model = logit.fit(x, y)
print('This is our accuracy ')
print(model.score(x, y))  # This is your accuracy!

from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix.
# The predictions must be computed before they are compared with y; this
# assignment was previously commented out, which left y_pred undefined on a
# fresh kernel.
y_pred = model.predict(x)

# Wrap the raw counts in a labelled DataFrame for a readable display
# (rows are the true labels, columns the predicted labels).
conmat = np.array(confusion_matrix(y, y_pred))

confusion = pd.DataFrame(conmat, index=['Y=0', 'Y=1'],
                         columns=['Y-hat = 0', 'Y-hat = 1'])

print('\nThis is our confusion matrix')
print(confusion)

# Precision and recall.
# The blank line is now actually printed; the original bare '\n' expression
# had no effect.
print('\nThis is our Classification Report')
print(classification_report(y, y_pred))


This is our accuracy 
0.61862030286

This is our confusion matrix
     Y-hat = 0  Y-hat = 1
Y=0        884        836
Y=1        524       1322
             precision    recall  f1-score   support

          0       0.63      0.51      0.57      1720
          1       0.61      0.72      0.66      1846

avg / total       0.62      0.62      0.61      3566



In [150]:
# Fit the estimator on x and y again (the same call the evaluation cell above
# makes); the displayed repr lists the hyperparameters in effect.
logit.fit(x, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [151]:
# Accuracy measured on the data the model was fitted on (no held-out split),
# so it matches the figure printed by the evaluation cell above.
logit.score(x, y)

0.61862030286034775

In [152]:
# Hard class predictions for every row of the feature matrix; show the first 100.
predictions = logit.predict(x)
predictions[:100]

array([1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0])

In [153]:
# Per-class probabilities for every row; show the first 10.
# Presumably column 0 is P(label=0) and column 1 is P(label=1) - confirm via logit.classes_.
predict_proba = logit.predict_proba(x)
predict_proba[:10]

array([[ 0.46288971,  0.53711029],
       [ 0.57160153,  0.42839847],
       [ 0.64651521,  0.35348479],
       [ 0.48667062,  0.51332938],
       [ 0.45725389,  0.54274611],
       [ 0.46445781,  0.53554219],
       [ 0.47519875,  0.52480125],
       [ 0.70366818,  0.29633182],
       [ 0.63266544,  0.36733456],
       [ 0.6712944 ,  0.3287056 ]])

#### Build a Logistic Regression model using scikit-learn and then:
- Examine the coefficients using the `examine_coefficients` function provided
- Evaluate the AUC, precision and recall of the model using cross-validation
- Plot the ROC curve
- Iterate on the model by adding in additional features in `X` above

In [120]:
def examine_coefficients(model, df):
    """Pair each feature name with its fitted coefficient, sorted ascending.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``coef_``; only its first row
        (``coef_[0]``) is read.
    df : pandas.DataFrame
        Frame whose columns name the features. Its columns must line up
        one-to-one with the entries of ``model.coef_[0]``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Coefficient'`` and ``'Feature'``, sorted by
        ``'Coefficient'`` from most negative to most positive.
    """
    # pd.DataFrame is the constructor; `pd.data` does not exist and raised
    # AttributeError on every call.
    return pd.DataFrame(
        {'Coefficient': model.coef_[0], 'Feature': df.columns}
    ).sort_values(by='Coefficient')

In [6]:
# TODO