In [11]:
import pandas as pd
import numpy as np
import scipy
import sklearn
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Download files, set up folder, put files into folder

In [5]:
# training data: ../data/train.csv
# test data:     ../data/test.csv

# Load data, split into training and validation sets

In [12]:
# Location of the labelled training CSV on the local machine.
filepath = 'C:/data/funnyHeadlines/train.csv'
dataframe = pd.read_csv(filepath)
# Report how many labelled rows were loaded.
print(dataframe.shape[0])
# print(dataframe)

7239


In [13]:
train_ratio = 0.7 # 70% for training, 30% for validation
random_seed = 100 # a fixed random seed allows fixed random runs (for controlled debugging). set to None to be random.

# Sample the training rows at random. The seed is taken from random_seed (rather
# than a repeated literal) so that editing the constant above really changes the split.
train_dataframe = dataframe.sample(frac=train_ratio, random_state=random_seed)
# Every row that was not sampled for training goes to the validation set.
valid_dataframe = dataframe.drop(train_dataframe.index)
print('training set size:', len(train_dataframe))
print('validation set size:', len(valid_dataframe))
# print(train_dataframe)

training set size: 5067
validation set size: 2172


## Also load test data (no splitting needed)

In [8]:
# Location of the test CSV on the local machine.
test_filepath = 'C:/data/funnyHeadlines/test.csv'
test_dataframe = pd.read_csv(test_filepath)
# Report how many test rows were loaded.
print('test set size:', test_dataframe.shape[0])
# print(test_dataframe)

test set size: 2413


# Try the trivial baseline: always predicting the average meanGrade (of training data)

In [9]:
# Prediction targets for both splits: the 'meanGrade' column of each dataframe.
train_Y, valid_Y = train_dataframe['meanGrade'], valid_dataframe['meanGrade']

In [10]:
# Average target value over the training set (np.mean computes the arithmetic mean).
train_Y_avg = np.mean(train_dataframe['meanGrade'])
print('average meanGrade on training set:', train_Y_avg)

# Constant prediction: the same training average for every validation row.
avg_pred_valid = [train_Y_avg] * len(valid_dataframe)

# Root mean squared error (RMSE) of the constant prediction on the validation set.
baseline_mse = mean_squared_error(valid_Y, avg_pred_valid)
rmse = np.sqrt(baseline_mse)
print('RMSE on validation set:', rmse)

average meanGrade on training set: 0.94183277416
RMSE on validation set: 0.586252839148


In [15]:
def write_test_prediction(df, pred, filepath):
    """Write prediction values to a CSV file with the header ``id,pred``.

    Predictions are matched to rows by position (first prediction goes with the
    first row of df, and so on), not by the dataframe's index labels, so the
    function also works on dataframes whose index is not 0..n-1.

    Args:
        df: dataframe, where each row is a test example, with column 'id' as data id
        pred: a list or 1-d array of prediction values, one per row of df, in row order
        filepath: the output file path

    Returns:
        None

    Raises:
        ValueError: if pred does not contain exactly one value per row of df.
    """
    ids = df['id'].tolist()
    if len(ids) != len(pred):
        raise ValueError(
            'expected {} predictions (one per row), got {}'.format(len(ids), len(pred)))

    with open(filepath, 'w') as outfile:
        outfile.write('{},{}\n'.format('id', 'pred'))
        # zip pairs the i-th id with the i-th prediction, independent of df.index
        for row_id, value in zip(ids, pred):
            outfile.write('{},{}\n'.format(row_id, value))

In [16]:
# Constant prediction for the test set: the training average repeated once per test row.
avg_pred_test = [train_Y_avg] * len(test_dataframe)
write_test_prediction(test_dataframe, avg_pred_test, 'C:/data/690/average_constant_baseline.csv')

# Build feature extractor from training data (here we use a TFIDF extractor)

In [17]:
def get_raw_text(df):
    """Build the raw text corpus used to learn the vocabulary (and IDF).

    For each row, the '<' and '/>' markers are removed from the 'original'
    title and the 'edit' word is appended after a single space.

    Args:
        df: dataframe, with 'original' and 'edit' columns

    Returns:
        A list of text strings, one per row of df, in row order.
    """
    return [
        original.replace('<', '').replace('/>', '') + ' ' + edit_word
        for original, edit_word in zip(df['original'], df['edit'])
    ]

In [23]:
# Raw training text (titles plus edit words) from which the vocabulary is learned.
train_corpus = get_raw_text(train_dataframe)
#print (train_corpus)

# Alternative extractor: TfidfVectorizer(stop_words = None).fit(train_corpus)

# Learn the vocabulary with a plain count vectorizer (no stop-word removal).
vectorizer = CountVectorizer(stop_words = None)
vectorizer.fit(train_corpus)
print (vectorizer.vocabulary_)



# Extract features of both training and validation data

In [24]:
def separate_original_word_from_title(df):
    """Split each title into (original_word, context).

    The original word is the text between the '<' marker and the following
    '/>' marker in the 'original' column; the context is the title with the
    whole '<word/>' span removed.

    A title without a complete '<.../>' marker yields an empty original word
    and the unchanged title as context, instead of a silently mis-sliced string.

    Args:
        df: dataframe, with an 'original' column of title strings

    Returns:
        original_words: a list of original word strings before editing
        contexts:       a list of context strings
    """
    original_words = []
    contexts = []
    for title in df['original']:
        start_position = title.find('<')
        # look for the closing marker only after the opening one
        end_position = title.find('/>', start_position + 1) if start_position != -1 else -1
        if start_position == -1 or end_position == -1:
            # str.find returned -1: no usable marker, so there is nothing to cut out
            original_words.append('')
            contexts.append(title)
            continue
        original_words.append(title[start_position + 1 : end_position])
        # +2 skips the two characters of the closing '/>'
        contexts.append(title[:start_position] + title[end_position + 2 :])
    return original_words, contexts

In [25]:
def construct_feature_matrix(df, vectorizer):
    """Turn the edit words of df into a feature matrix for a predictive model.

    Only the 'edit' column is vectorized. The shape of the resulting matrix is
    printed as a side effect.

    Args:
        df: dataframe, with an 'edit' column
        vectorizer: fitted sklearn text vectorizer, either TfidfVectorizer or CountVectorizer

    Returns:
        The matrix returned by vectorizer.transform, with one row per row of df.
    """
    # dimensionality of the result is len(df) x |V|
    feature_matrix = vectorizer.transform(df['edit'].tolist())
    print (feature_matrix.shape)
    return feature_matrix

In [26]:
# Construct feature matrices for the training, validation and test data (in that order).
train_X, valid_X, test_X = (
    construct_feature_matrix(split_dataframe, vectorizer)
    for split_dataframe in (train_dataframe, valid_dataframe, test_dataframe)
)

(5067, 8900)
(2172, 8900)
(2413, 8900)


# Train model on training set, evaluate model on validation set

In [30]:
# Train a linear regression model. It is called a "ridge regression" model here
# because it further alleviates overfitting by using so-called L2 regularization,
# with regularization strength alpha = 1.

model = Ridge(alpha=1)
model.fit(train_X, train_Y)
print (model.intercept_)
print (model.coef_)

0.896435866304
[ 0.  0.  0. ...,  0.  0.  0.]


In [28]:
# Evaluate model on validation set (RMSE: lower is better)
valid_Y_hat = model.predict(valid_X)
valid_mse = mean_squared_error(valid_Y, valid_Y_hat)
rmse = np.sqrt(valid_mse)
print('RMSE on validation set:', rmse)

RMSE on validation set: 0.575431244311


In [31]:
# Evaluate model on training set:
# expect to see unrealistically good performance! (for RMSE: lower is better)
# It is unrealistic because the model was trained on exactly these data.
# Treat it as an optimistic reference point for validation/test performance with this model.

train_Y_hat = model.predict(train_X)
train_mse = mean_squared_error(train_Y, train_Y_hat)
rmse = np.sqrt(train_mse)
print('RMSE on training set:', rmse)

RMSE on training set: 0.391973861198


In [33]:
# Apply the model to the test data and write the predictions to a csv file.
test_Y_hat = model.predict(test_X)
ridge_output_path = 'C:/data/690/ridge-regression_alpha=1_baseline.csv'
write_test_prediction(test_dataframe, test_Y_hat, ridge_output_path)

# Investigate what the model has learned and where it failed (A.K.A. error analysis)

In [34]:
# print(vectorizer.vocabulary_)

In [35]:
# Print the learned coefficient array of the fitted model.
print(model.coef_)

[ 0.  0.  0. ...,  0.  0.  0.]


## Look at learned parameters (for linear model: weight of each dimension)

In [36]:
# Mapping: word -> learned weight of this word.
# vocabulary_ maps each word to its column index, which is also its position in model.coef_.
feature_weight = {
    term: model.coef_[column]
    for term, column in vectorizer.vocabulary_.items()
}
# print(feature_weight)

In [37]:
# words that correlate positively with funniness (the 10 largest weights)
weights_descending = sorted(feature_weight.items(), key = lambda x: x[1], reverse = True)
for k, v in weights_descending[:10]:
    print (k, v)

buttock 0.95178602346
bathe 0.951785784402
mistresses 0.951781114457
wig 0.935703566742
tanning 0.869041352986
dealer 0.851795831652
midlife 0.851787234764
spanks 0.851785768119
biceps 0.851785529764
sexy 0.851780780706


In [38]:
# words that correlate negatively with funniness (the 10 smallest weights)
weights_ascending = sorted(feature_weight.items(), key = lambda x: x[1])
for k, v in weights_ascending[:10]:
    print (k, v)

sale -0.597627836919
opposition -0.597627363234
years -0.597615035739
border -0.597606361227
skip -0.572311613885
energy -0.557140024155
hates -0.53096868336
soups -0.530959598442
decision -0.530958791428
salmon -0.530953721575


## Look at how the model makes predictions on individual examples

# We pick a set of examples from the validation set (we predicted scores for those).
# We usually don't pick from training data (since the good performance may be unrealistic).
# We cannot do error analysis on test data (because no true target value is provided).

In [39]:
def explain_linear_prediction(df, model, idx2feature, X, Y, Y_hat, idx_list):
    """Print, for selected examples, how the linear model arrived at its prediction.

    For every position in idx_list this prints the example's text fields, its
    true and predicted score, the model intercept, and one line per non-zero
    feature of that example showing value * weight.

    Args:
        df: dataframe with 'original', 'edit', 'grades' and 'meanGrade' columns
        model: fitted linear model exposing intercept_ and coef_
        idx2feature: dict mapping feature (column) index -> feature name
        X: sparse feature matrix whose rows are aligned by position with df
        Y: true target values (accepted for interface compatibility; not used here)
        Y_hat: predicted values, indexable by row position
        idx_list: row positions (not index labels) of the examples to explain

    Returns:
        None
    """
    print('indices:', idx_list)
    for idx in idx_list:
        example = df.iloc[idx]  # positional lookup, consistent with X[idx] and Y_hat[idx]
        print ('==============', idx, '================')
        print ('original:', example['original'])
        print ('edit:', example['edit'])
        print ('grades:', example['grades'])
        print ('TRUE score:', example['meanGrade'])
        print ('PRED score:', Y_hat[idx])

        print ('\nPRED breakdown:')
        print ('\tINTERCEPT', model.intercept_)
        # COO form exposes the column index and value of every stored entry of the row
        row = X[idx, :].tocoo()
        if row.nnz == 0:
            print ('\tFEATURE', '[EMPTY]')
        else:
            # report every non-zero feature, not just the first one
            for feature_dim, feature_value in zip(row.col, row.data):
                feature_weight_value = model.coef_[feature_dim]
                print ('\tFEATURE', idx2feature[feature_dim], ':', 'f_value', feature_value, '*', 'f_weight', feature_weight_value, '=', feature_value*feature_weight_value)
        

In [42]:
# Inverse of the vocabulary: feature index -> word.
idx2feature = {column: term for term, column in vectorizer.vocabulary_.items()}

# Squared error of every validation example.
errors = (valid_Y - valid_Y_hat)**2
# (row position, squared error) pairs, sorted from low to high error
sorted_errors = sorted(enumerate(errors.tolist()), key = lambda x: x[1])
# print(sorted_errors)

### prediction on random examples

In [43]:
# pick a random set of examples from validation set:
K = 5
# Use a dedicated generator seeded with random_seed so that re-running the notebook
# explains the same examples (random_seed = None gives a fresh draw each time).
example_rng = np.random.RandomState(random_seed)
# Sample row positions without replacement so that no example is explained twice;
# never ask for more examples than the validation set contains.
num_valid_rows = valid_X.shape[0]
random_indices = example_rng.choice(num_valid_rows, size=min(K, num_valid_rows), replace=False)
explain_linear_prediction(valid_dataframe, model, idx2feature, valid_X, valid_Y, valid_Y_hat, random_indices)

indices: [ 980 1115 1034  295    1]
original: Eric Trump : My <father/> has ' zero conflicts of interest '
edit: nose
grades: 22210
TRUE score: 1.4
PRED score: 0.716072509647

PRED breakdown:
	INTERCEPT 0.896435866304
	FEATURE nose : f_value 1 * f_weight -0.180441242603 = -0.180441242603
original: Trump 's <State/> Department denies jobs to winners of prestigious scholarship for disadvantaged and minority students
edit: Nasty
grades: 31000
TRUE score: 0.8
PRED score: 0.896422996952

PRED breakdown:
	INTERCEPT 0.896435866304
	FEATURE [EMPTY]
original: Trump ’s Muslim ban is no surprise : Our new president ’s agenda is fueled by white <nationalism/> 
edit: Chocolate
grades: 22220
TRUE score: 1.6
PRED score: 0.985199972572

PRED breakdown:
	INTERCEPT 0.896435866304
	FEATURE chocolate : f_value 1 * f_weight 0.0887774815324 = 0.0887774815324
original: North Korea is ' on an aggressive schedule ' to <develop/> a ballistic missile submarine
edit: imagine
grades: 21100
TRUE score: 0.8
PRED sco

### examples with closest prediction

In [44]:
K = 5
# row positions of the K examples with the lowest prediction error
low_error_indices = [position for position, _squared_error in sorted_errors[:K]]
explain_linear_prediction(valid_dataframe, model, idx2feature, valid_X, valid_Y, valid_Y_hat, low_error_indices)

indices: [2030, 1862, 999, 1063, 189]
original:  <Mexicans/> weigh the daunting prospect of deportee camps
edit: Cats
grades: 21100
TRUE score: 0.8
PRED score: 0.799512685315

PRED breakdown:
	INTERCEPT 0.896435866304
	FEATURE cats : f_value 1 * f_weight -0.0969479383303 = -0.0969479383303
original: Obama 's $ 400,000 Wall Street speech is completely in <character/> ; Ask all the bankers he jailed for fraud .
edit: gibberish
grades: 32110
TRUE score: 1.4
PRED score: 1.39880331449

PRED breakdown:
	INTERCEPT 0.896435866304
	FEATURE gibberish : f_value 1 * f_weight 0.502378497255 = 0.502378497255
original: Republican Lindsey Graham says firing Robert Mueller would be ‘ beginning of the end ’ of Donald Trump ’s <presidency/> 
edit: dictatorship
grades: 3311100000
TRUE score: 0.9
PRED score: 0.896422996952

PRED breakdown:
	INTERCEPT 0.896435866304
	FEATURE dictatorship : f_value 1 * f_weight 0.0 = 0.0
original: Bottled <water/> is bullshit !
edit: cheese
grades: 31110
TRUE score: 1.2
PRED

### examples with worst predictions

In [45]:
K = 5
# row positions of the K examples with the highest prediction error
high_error_indices = [position for position, _squared_error in sorted_errors[-K:]]
explain_linear_prediction(valid_dataframe, model, idx2feature, valid_X, valid_Y, valid_Y_hat, high_error_indices)

indices: [2146, 2031, 665, 310, 198]
original: Trump to North Korean leader Kim : My ‘ Nuclear <Button/> ’ is ‘ much bigger &amp; more powerful ’
edit: Belly
grades: 33222
TRUE score: 2.4
PRED score: 0.648210474997

PRED breakdown:
	INTERCEPT 0.896435866304
	FEATURE belly : f_value 1 * f_weight -0.24822413569 = -0.24822413569
original: Letting Obamacare Fail Would Break Trump 's <Oath/> 
edit: heart
grades: 33321
TRUE score: 2.4
PRED score: 0.632106812843

PRED breakdown:
	INTERCEPT 0.896435866304
	FEATURE heart : f_value 1 * f_weight -0.264282949913 = -0.264282949913
original: Comey memo : Trump complained about Flynn ’s ‘ <judgment/> issues ’
edit: gas
grades: 32222
TRUE score: 2.2
PRED score: 0.424100940945

PRED breakdown:
	INTERCEPT 0.896435866304
	FEATURE gas : f_value 1 * f_weight -0.472329279449 = -0.472329279449
original: Charlotte Pence : I Bought The Gay <Bunny/> Book
edit: republican
grades: 33322
TRUE score: 2.6
PRED score: 0.748202962147

PRED breakdown:
	INTERCEPT 0.8964