In [63]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
data = pd.read_csv("data.csv",names=["x","y"])
data.head()

Unnamed: 0,x,y
0,32.502345,31.707006
1,53.426804,68.777596
2,61.530358,62.562382
3,47.47564,71.546632
4,59.813208,87.230925


In [24]:
def best_fit_line(x,y):
    y_ = np.mean(y)
    x_ = np.mean(x)
    numerator = np.sum(x*y-y_*x)
    denominator = np.sum(x*x-x_*x)
    B = numerator/denominator
    a = y_ - B*x_
    return B,a

def best_fit_line2(x,y):
    x = x.reshape(-1,1)
    x = np.insert(x,1,1,axis=1)
    y = y.reshape(-1,1)
    return np.matmul(np.linalg.pinv(np.matmul(x.T,x)),np.matmul(x.T,y))

print(best_fit_line(data["x"].values,data["y"].values))
print(best_fit_line2(data["x"].values,data["y"].values))

(1.3224310227553542, 7.991020982270669)
[[1.32243102]
 [7.99102098]]


In [56]:
def error(x,y,B,a):
    x = x.reshape(-1,1)
    y = y.reshape(-1,1)
    return np.mean((y-x*B+a)**2)

def gradient_descent(X,Y,lr,delta):
    B,a = 0,0
    n = len(X)
    error_old = 0
    error_new = error(X,Y,B,a)
    while abs(error_old-error_new) > delta:
        B_,a_ = 0,0
        for x,y in zip(X,Y):
            B_ = B_ + lr*(y-(B*x+a))*x
            a_ = a_ + lr*(y-(B*x+a))*1
        B += B_/n
        a += a_/n
        error_old = error_new
        error_new = error(X,Y,B,a)
        print(error_old-error_new)
    return B,a

gradient_descent(data["x"].values,data["y"].values,0.00005,0.00001)

1273.138684865341
975.760682518001
747.8562988264703
573.1935186721175
439.3330137046937
336.74193934511163
258.11480583396053
197.85303713590588
151.66613776924885
116.26605198301394
89.13293947934062
68.33568015537338
52.39431965145167
40.17465566372536
30.80745491347622
23.62654551785934
18.121374308642146
13.900649212253597
10.664476562540955
8.1830084361128
6.280078899846643
4.820663485947307
3.7012685851937306
2.842566179867603
2.1837492958247964
1.6782061348455528
1.2902047873022298
0.9923524141603366
0.7636479552871975
0.5879896984610298
0.453031442407422
0.3493058158665008
0.26955234201300016
0.208202419051716
0.16098456231692637
0.1246218167900679
0.09659981167229148
0.07498895803756511
0.05830814490070679
0.045420242844571135
0.03545198804626182
0.02773255438320632
0.021746450840950615
0.017097400386575146
0.013480637381377392
0.010661659097436882
0.008459925591040474
0.00673635373745185
0.005383720672810455
0.004319298400972116
0.003479199607966166
0.0028140360549144816
0.0

(1.4788726628830617, 0.030314143442741262)

In [61]:
reviews_data = pd.read_csv("chicago_hotel_reviews.csv")
reviews_data.head()

Unnamed: 0.1,Unnamed: 0,docid,review,rating
0,0,usa_illinois_chicago_the_talbott_hotel,\tWonderful\tI had a sore throat and the hotel...,4.672131
1,1,usa_illinois_chicago_sofitel_chicago_water_tower,Nov 20 2009 \tDidn't want to check out...\tPer...,4.601329
2,2,usa_illinois_chicago_trump_international_hotel...,\tWOW!!\tGreat hotel! Stayed there with my wif...,4.696296
3,3,usa_illinois_chicago_hampton_inn_majestic_chicago,\tStaff Went Above \tWhile waiting for our gue...,4.595455
4,4,usa_illinois_chicago_residence_inn_chicago_dow...,\tWonderful Hotel Will Definately Stay Here Ag...,4.645


In [76]:
n = len(reviews_data)
n_train = int(0.8*n)
n_test = n-n_train

index = np.random.choice(n,n,replace=False)
trainSet = reviews_data.iloc[index[:n_train]]
testSet = reviews_data.iloc[index[n_train:]]

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
Xtotal = vectorizer.fit_transform(reviews_data["review"])
Xtrain = Xtotal[index[:n_train]]
Xtest = Xtotal[index[n_train:]]

(125, 3319)


In [143]:
print(vectorizer.get_feature_names())



In [144]:
# from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(Xtrain,trainSet["rating"])
print(model)
result = model.predict(Xtest)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)


In [108]:
result

array([3.85874182, 3.39954554, 3.56727667, 3.77566212, 3.64299127,
       3.62054867, 3.40688425, 3.90766914, 4.24189634, 3.60724375,
       4.19698343, 4.29704535, 3.73577052, 3.6342023 , 3.84764935,
       3.95702287, 1.0625    , 3.35273611, 4.11196055, 3.87354009,
       1.0625    , 3.83111144, 3.60211379, 3.40669211, 1.0625    ,
       3.85080167, 3.56160078, 4.48581816, 3.76208716, 4.13370562,
       4.196473  , 3.67085621])

In [115]:
model.coef_

array([ 0.04056363,  0.06807067,  0.        , ..., -0.10846741,
        0.        ,  0.06466693])

In [118]:
c = np.absolute(model.coef_)

In [135]:
selected_features = np.where(c>0.5)[0]
print("selected features:",np.array(vectorizer.get_feature_names())[selected_features])

selected features: ['about' 'are' 'area' 'bum' 'chateau' 'chicago' 'don' 'drugs' 'great'
 'guy' 'halfway' 'her' 'hours' 'incoming' 'kids' 'live' 'needed' 'never'
 'not' 'office' 'on' 'placeholder' 'plastic' 'prostition' 'put' 'rampant'
 'refund' 'said' 'service' 'sheets' 'stay' 'travelers' 'trying' 'type'


In [136]:
model = LinearRegression().fit(Xtrain[:,selected_features],trainSet["rating"])
result = model.predict(Xtest[:,selected_features])

In [137]:
result

array([ 3.92246232,  3.44928927,  3.0894422 ,  3.98615185,  4.05930271,
        4.16069415,  4.00896462,  4.01116858,  4.25644691,  3.99521471,
        4.39577984,  3.45127819,  3.67700502,  3.30966998,  4.73296404,
        3.90674733,  1.0625    ,  2.96111234,  2.96291476,  3.97642537,
        1.0625    ,  4.24758791,  4.4222018 ,  4.02253179,  1.0625    ,
        4.17145887, -0.0765281 ,  4.24461816,  4.33460353,  3.76675329,
        4.38486534,  3.41257804])

In [138]:
mse = np.mean((result-testSet["rating"])**2)

In [139]:
print(mse)

2.7356735262584317


In [142]:
result[np.argsort(result)][::-1]

array([ 4.73296404,  4.4222018 ,  4.39577984,  4.38486534,  4.33460353,
        4.25644691,  4.24758791,  4.24461816,  4.17145887,  4.16069415,
        4.05930271,  4.02253179,  4.01116858,  4.00896462,  3.99521471,
        3.98615185,  3.97642537,  3.92246232,  3.90674733,  3.76675329,
        3.67700502,  3.45127819,  3.44928927,  3.41257804,  3.30966998,
        3.0894422 ,  2.96291476,  2.96111234,  1.0625    ,  1.0625    ,
        1.0625    , -0.0765281 ])