## Linear Regression implementation 

In [54]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.cross_validation import train_test_split
import random

In [55]:
user_data = pd.read_csv('data/user_data.tsv', sep='\t', index_col=0)

In [56]:
user_data.head()

Unnamed: 0_level_0,review_count,elite,account_age,fans,profile,useful_votes,num_tips,num_likes
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
---teJGnwK07UO6_oJfbRw,2,0,2017,0,0,0,0.0,0.0
--0HEXd4W6bJI8k7E0RxTA,34,0,1651,2,0,15,0.0,0.0
--0KsjlAThNWua2Pr4HStQ,60,2,1010,2,0,191,0.0,0.0
--0mI_q_0D1CdU4P_hoImQ,31,0,1956,1,0,17,1.0,0.0
--106arHH4D3fLenTl3YZA,1,0,921,0,0,1,0.0,0.0


In [68]:
feature_matrix = pd.read_json('data/feature_matrix.json')
feature_matrix = feature_matrix[['account_age', 'elite', 'fans', 'num_likes', 'num_tips', 'profile', 'review_count', 'useful_votes']]
feature_matrix['user_id'] = feature_matrix.index
#feature_train, feature_test = train_test_split(feature_matrix, test_size=0.3)

user_weights = pd.read_csv('user_weights.csv', index_col=0)

data = pd.merge(feature_matrix, user_weights, how='inner', on=['user_id'])
data.head()

Unnamed: 0,account_age,elite,fans,num_likes,num_tips,profile,review_count,useful_votes,user_id,Active Life,...,Financial Services,Food,Health & Medical,Home Services,Hotels & Travel,Local Services,Nightlife,Pets,Restaurants,Shopping


In [42]:
y = user_weights.loc[user_weights.index,:]
y = y['Restaurants'].as_matrix()

X = feature_matrix.as_matrix()
rows, columns = X.shape
X = np.insert(X, 0, 1, axis=1)

In [43]:
def cost(theta, X, y):
    """
    Calculates the sum of squares cost for a given set of parameters
    """
    X = np.matrix( X )
    y = np.matrix( y )
    theta = np.matrix( theta )
    
    # Predicted labels
    y_hat = X.dot( theta.T )
    
    # Sum of error terms in prediction
    SOS = np.power( np.subtract( y_hat, y ), 2 ).sum()
    
    N = y.size
    cost = ( 1.0 / ( 2 * N ) ) * SOS
    return cost

In [44]:
class LinearRegression(object):
    def __init__(self):
        self.theta = None
        
    def train(self, X, y):
        """
        Estimates linear regression parameters
        """
        rows, columns = X.shape
        # Initialize random values for weights
        self.theta = np.matrix([random.random() for _ in xrange(columns)])
        res = minimize( cost, self.theta, (X, y) )
        self.theta = res.x
        
    def predict(self, X):
        """
        Predict output for a given vector X
        """
        return X.dot(self.theta)

In [45]:
linReg = LinearRegression()


linReg.train(X, y)

MemoryError: 

In [66]:
def unit_test():
    """
    Tests functionality of linear regression class
    """
    csv = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)      
    train = csv.sample( frac = 0.8, random_state = 0 )
    test = csv.drop( train.index )
    
    testReg = LinearRegression()
    
    X = train[['TV', 'Radio', 'Newspaper']].as_matrix()
    
    rows, columns = X.shape
    X = np.insert(X, 0, 1, axis=1)
    y = np.matrix(train['Sales']).T
    
    testReg.train(X, y)
    
    X_test = test[['TV', 'Radio', 'Newspaper']].as_matrix()
    X_test = np.insert(X_test, 0, 1, axis=1)
    y_test = np.matrix(test['Sales']).T
        
    if np.isclose( cost( testReg.theta, X_test, y_test ), 1.57881917886 ):
        print "Unit test passed!"
        
    else:
        print "Unit test failed : ("

In [67]:
unit_test()

Unit test passed!
