In [1]:
import time, random
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from scipy.sparse import csr_matrix 	#Compressed Sparse Row matrix
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import gc #Manually collect garbage

phrasedict = pd.read_table("dictionary.txt", names= ["phrase", "phrase ids"], sep="|")
sentilabels = pd.read_table("sentiment_labels.txt", sep="|") # names = ][phrase ids, sentiment values]

print(len(phrasedict), len(sentilabels))

239232 239232


In [2]:
terms_duplicated = [word for line in phrasedict["phrase"] for word in line.split(' ')]
terms = set(terms_duplicated)

phrasedict["length"] = phrasedict["phrase"].str.len()
phrase_20000_ = phrasedict.sort_values("length", ascending=False)[:20000]
phrase_20000 = phrase_20000_.sample(frac=1)


del [phrasedict, phrase_20000_] #memory release
gc.collect()

print(phrase_20000)

                                                   phrase  phrase ids  length
219092  to capitalize on Hopkins ' inclination to play...      216926     143
181641  represents the worst kind of filmmaking , the ...      131377     138
34627   It 's hard to imagine anybody ever being `` in...      224183     230
106903  easily become a cold , calculated exercise in ...       78725     139
55849   This is a movie that is what it is : a pleasan...      226649     115
41775   Nicholson . Gone are the flamboyant mannerisms...      107992     226
10198   , the film is so mired in juvenile and near-xe...      142483     170
17599   Aggressive self-glorification and a manipulati...      143559     162
103586  director Robert J . Siegel allows the characte...      118295     109
191410  something that is so meditative and lyrical ab...      132968     228
89570   blend politics and drama , an admirable ambiti...      155295     138
93404   can get past the fantastical aspects and harsh...       

In [3]:
# Merge table with score
#merged_phrase_20000 = pd.concat(phrase_20000,sentilabels join="inner")
merged_phrase_20000 = phrase_20000.merge(sentilabels, how="inner", on="phrase ids")
#print(merged_phrase_20000)

In [4]:
# Extract tf-idf, will be normalized between 0-1
columns = merged_phrase_20000["sentiment values"]

vectorizer   = TfidfVectorizer(norm = 'l2', vocabulary=terms) #len(terms) == 22348
tfidf_matrix = vectorizer.fit_transform(merged_phrase_20000["phrase"])
tfidf_matrix = tfidf_matrix.T
row, column  = tfidf_matrix.shape
print(row, column, type(tfidf_matrix)) # 22348 terms, 20000 tweets(Documents)


22348 20000 <class 'scipy.sparse.csc.csc_matrix'>


In [5]:
pd_tfidf = pd.DataFrame(tfidf_matrix.toarray(), index=terms, columns=columns)
del tfidf_matrix
#pd_tfidf = pd.SparseDataFrame(tfidf_matrix, index=terms, columns=columns)

In [6]:
pd_tfidf.head()

sentiment values,0.44444,0.22222,0.375,0.93056,0.73611,0.43056,0.27778,0.38889,0.75,0.875,...,0.70833,0.5,0.44444.1,0.47222,0.70833.1,0.81944,0.91667,0.52778,0.79167,0.40278
concocts,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
entrepreneurial,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Twin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ill-constructed,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
#pd_tfidf.to_csv("tf-idf_sungmin.csv", sep=",", encoding="utf-8") # about 1.7GB
#if any((0,0,0,0)) : print(1) >> print nothing
#if any((0,0,0,1)) : print(1) >> print 1

#drop_list = list()

#for index, row in pd_tfidf.iterrows():
#    if any(row.values): drop_list.append(index)
#     #else : print("find some values")
# print(drop_list)

#updated_tfidf = pd_tfidf[(pd_tfidf.T != 0).any()]
nonzero_tfidf = (pd_tfidf!=0).astype(int).sum(axis=1)


print(nonzero_tfidf) # 22348 rows × 20000 columns => 13185 rows × 20000 columns


concocts            0
Jean                0
entrepreneurial     0
Twin                0
ill-constructed     0
published           0
1970                0
menacing            0
spare               0
underestimate       0
well-meaning        0
islanders           0
styled              0
terrorizing         0
genres              0
empathize           0
Hjejle              0
indoctrinated       0
tight               0
Who                 0
clocked             0
non-techies         0
unabashedly         0
filmgoers           0
heavy-handed        0
upping              0
mystique            0
Utter               0
preordained         0
evolves             0
                   ..
forces             10
Murder              0
motorcycles         0
anticipated         6
selfish             0
Standing            9
donde               1
Rosemary           18
stereotypes         2
frequent            2
ruffle              1
jump               10
Book                0
whiplash            2
dorkier   

In [12]:
#print(type(nonzero_tfidf), nonzero_tfidf.shape)
FEATURE_NUMBER = 500
sorted_nonzero_tfidf = pd.DataFrame(nonzero_tfidf.sort_values(ascending=False)[:FEATURE_NUMBER])
print(sorted_nonzero_tfidf)
features_N = sorted_nonzero_tfidf.index
try : del [nonzero_tfidf]
except : pass
gc.collect()

print(features_N)

NameError: name 'nonzero_tfidf' is not defined

In [13]:
#tfidf = pd.merge(sorted_nonzero_tfidf, pd_tfidf, left_index=True, right_index=True)
#tfidf = pd.concat([sorted_nonzero_tfidf, pd_tfidf], axis=1)
#or pd.merge(df1, df2, left_index=True, right_index=True)
#or df1.join(df2) #by default there is left join:
#or pd.concat([df1, df2], axis=1) #by default there is outer join


#delete last coulmn : it was count of tfidf scores from above.
#tfidf.drop(tfidf.columns[len(tfidf.columns)-1], axis=1, inplace=True)
#tfidf.drop(tfidf.columns[0], axis=1, inplace=True)

tfidf = pd_tfidf.loc[features_N]

#print(tfidf)
assert [i for i in tfidf.columns if isinstance(i, str)] == []
#tfidf.drop(tfidf.columns["0.0_y"], axis=0, inplace=True)

NameError: name 'pd_tfidf' is not defined

In [14]:
#for test if it has all zero value.
try : del [pd_tfidf] #memory release
except : pass
gc.collect()

for i in tfidf.iloc[0,:]: # term "great" has more.
    if i!=0:
        print("tf-idf, somewhere is not 0 but value : ", tfidf.index[0], i)
        break   

tf-idf, somewhere is not 0 but value :  murders 0.102462656456


In [15]:
tfidf.head()

sentiment values,0.44444,0.22222,0.375,0.93056,0.73611,0.43056,0.27778,0.38889,0.75,0.875,...,0.70833,0.5,0.44444.1,0.47222,0.70833.1,0.81944,0.91667,0.52778,0.79167,0.40278
murders,0.0,0.102463,0.106051,0.0,0.0,0.07405,0.142135,0.052635,0.060462,0.042652,...,0.0,0.047635,0.059666,0.239772,0.171837,0.0,0.052894,0.0,0.0,0.0
Fork,0.0,0.116821,0.040304,0.057683,0.0,0.042213,0.054017,0.120022,0.0,0.145887,...,0.072877,0.0,0.0,0.109348,0.065305,0.154359,0.0,0.113634,0.0,0.066869
zips,0.0,0.059957,0.0,0.05921,0.0,0.086662,0.055448,0.0,0.0,0.049917,...,0.0,0.055747,0.069828,0.112244,0.067035,0.079223,0.0,0.116643,0.0,0.0
beanie,0.200049,0.069078,0.190658,0.0,0.080801,0.0,0.063882,0.0,0.163049,0.05751,...,0.0,0.0,0.08045,0.0,0.077232,0.0,0.0,0.0,0.173164,0.079081
Episode,0.0,0.079875,0.0,0.0,0.093431,0.057726,0.073868,0.082064,0.0,0.132999,...,0.099658,0.074267,0.093025,0.0,0.0,0.0,0.0,0.0,0.200231,0.0


In [16]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
tfidf = pd.DataFrame(ss.fit_transform(tfidf), index=tfidf.index, columns = tfidf.columns)
gc.collect()

10

In [17]:
tfidf.head()

sentiment values,0.44444,0.22222,0.375,0.93056,0.73611,0.43056,0.27778,0.38889,0.75,0.875,...,0.70833,0.5,0.44444.1,0.47222,0.70833.1,0.81944,0.91667,0.52778,0.79167,0.40278
murders,-0.132271,4.095418,3.50898,-0.115273,-0.122867,3.603677,5.607905,2.604622,2.106192,1.584598,...,-0.153923,2.636377,1.45908,7.725541,5.787911,-0.103191,1.644815,-0.129532,-0.158405,-0.138936
Fork,-0.132271,4.686995,1.224493,2.556131,-0.122867,2.00211,2.016743,6.098329,-0.129487,5.83108,...,1.931197,-0.131485,-0.192112,3.432014,2.090826,5.491158,-0.162513,4.520671,-0.158405,2.578604
zips,-0.132271,2.344078,-0.17593,2.626864,-0.122867,4.238125,2.075032,-0.124293,-0.129487,1.883412,...,-0.153923,3.107788,1.740304,3.527327,2.150835,2.768047,-0.162513,4.643799,-0.158405,-0.138936
beanie,8.117193,2.719876,6.448788,-0.115273,3.335907,-0.121451,2.418787,-0.124293,5.899456,2.195762,...,-0.153923,-0.131485,2.034268,-0.167713,2.504728,-0.103191,-0.162513,-0.129532,4.837859,3.074904
Episode,-0.132271,3.164752,-0.17593,-0.115273,3.876539,2.78248,2.82573,4.130369,-0.129487,5.300924,...,2.697442,4.183899,2.382268,-0.167713,-0.175538,-0.103191,-0.162513,-0.129532,5.618813,-0.138936


In [18]:
#terms are features, sentiment values are extract from phrase_20000
row, column = tfidf.shape
print(row, column)
test_percentage = 20
boundary = int(column*(1-test_percentage/100)) #16000

#change to np.array, (16000,), (4000,) 
phrases_train  = np.array(tfidf.columns[:boundary])
phrases_test   = np.array(tfidf.columns[boundary:])

print("phrases_train, test : ", phrases_train.shape, phrases_test.shape)


tfidf_train_    = tfidf.iloc[:,:boundary]
tfidf_test_    = tfidf.iloc[:,boundary:]

tfidf_train = tfidf_train_.T
tfidf_test  = tfidf_test_.T

del [tfidf_train_, tfidf_test_] #memory release
gc.collect()

print("tfidf_train, test : ", tfidf_train.shape, tfidf_test.shape)

500 20000
phrases_train, test :  (16000,) (4000,)
tfidf_train, test :  (16000, 500) (4000, 500)


In [33]:
time_start = time.time()
clf = SVR(kernel="rbf", degree=3, gamma="auto", coef0=0.0, tol=0.001, C=1.0, 
          epsilon=0.1, shrinking=True, cache_size=200, verbose=True, max_iter=-1)
clf.fit(tfidf_train, phrases_train)

print(phrases_train[:20])
print("time cose : %.3f s" %(time.time()-time_start))


[LibSVM][ 0.875    0.19444  0.45833  0.81944  0.875    0.90278  0.45833  0.33333
  0.77778  0.30556  0.75     0.70833  0.47222  0.61111  0.47222  0.75
  0.38889  0.51389  0.27778  0.61111]
time cose : 12.235 s


In [35]:
time_start = time.time()
clf = SVR(kernel="rbf", degree=3, gamma="auto", coef0=0.0, tol=0.001, C=1.0, 
          epsilon=0.1, shrinking=True, cache_size=200, verbose=True, max_iter=-1)
clf.fit(tfidf_train, phrases_train)

print(phrases_train[:20])
print("time cose : %.3f s" %(time.time()-time_start))


[LibSVM][ 0.56944  0.36111  0.19444  0.125    0.33333  0.88889  0.       0.83333
  0.76389  0.54167  0.16667  0.5      0.20833  0.18056  0.88889  0.43056
  0.47222  0.70833  0.68056  0.33333]
time cose : 333.613 s


In [36]:
#what I have to do is,
#predict(X=[some_Values])
#score(X, y) y is true for X.
#from sklearn.metrics import accuracy_score
time_start  = time.time()
#rbf_predict = clf.predict(tfidf_test)
rbf_score = clf.score(tfidf_test, phrases_test)
print("RBF prediction is :  %.3f%% " % (rbf_score*100), " \ntime cose : %.3f s" %(time.time()-time_start))

RBF prediction is :  64.757%   
time cose : 19.219 s


In [34]:
#what I have to do is,
#predict(X=[some_Values])
#score(X, y) y is true for X.
#from sklearn.metrics import accuracy_score
time_start  = time.time()
#rbf_predict = clf.predict(tfidf_test)
rbf_score = clf.score(tfidf_test, phrases_test)
print("RBF prediction is :  %.3f%% " % (rbf_score*100), " \ntime cose : %.3f s" %(time.time()-time_start))

RBF prediction is :  60.473%   
time cose : 2.909 s


In [35]:
#del [clf] #memory release
gc.collect()

time_start = time.time()
clf2 = SVR(kernel="linear", degree=3, gamma="auto", coef0=0.0, tol=0.001, C=1.0, 
          epsilon=0.1, shrinking=True, cache_size=200, verbose=True, max_iter=-1)
clf2.fit(tfidf_train, phrases_train)
print("time cose : %.3f s" %(time.time()-time_start))
#took about 65minutes

[LibSVM]time cose : 19.154 s


In [36]:
time_start = time.time()
linear_score = clf2.score(tfidf_test, phrases_test)
print("Linear prediction is :  %.3f%% " % (linear_score*100), " \ntime cose : %.3f s" %(time.time()-time_start))
#score : 0.2588100489097338 for 500 terms.

Linear prediction is :  27.984%   
time cose : 2.609 s


In [37]:
clf3 = SVR(kernel="poly", degree=3, gamma="auto", coef0=0.0, tol=0.001, C=1.0, 
          epsilon=0.1, shrinking=True, cache_size=200, verbose=True, max_iter=-1)
clf3.fit(tfidf_train, phrases_train)

[LibSVM]

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=True)

In [38]:
time_start = time.time()
poly_score = clf3.score(tfidf_test, phrases_test)
print("Linear prediction is :  %.3f%% " % (poly_score*100), " \ntime cose : %.3f s" %(time.time()-time_start))
#score : 0.2588100489097338 for 500 terms.

Linear prediction is :  55.959%   
time cose : 2.664 s


In [25]:
from sklearn.grid_search import GridSearchCV
print("Fitting the classifier to the training set")
#param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['rbf', 'linear']}
param_grid = {'C': [0.01, 1, 100], 'kernel': ['rbf', 'linear']}

clf4 = GridSearchCV(SVR(), param_grid)
clf4 = clf4.fit(tfidf_train, phrases_train)
print("Best estimator found by grid search:")
print(clf4.best_estimator_)
#it took more than 30min

Fitting the classifier to the training set
Best estimator found by grid search:
SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


In [26]:
clf4.score(tfidf_test,phrases_test)

0.27422574329624794

In [19]:
time_start = time.time()

from sklearn.svm import LinearSVR
clf5 = LinearSVR()
clf5.fit(tfidf_train, phrases_train)
print("time cose : %.3f s" %(time.time()-time_start))

time cose : 25.680 s


In [20]:
time_start = time.time()
linear_score = clf5.score(tfidf_test, phrases_test)
print("Linear model data can explain data with :  %.3f%% " % (linear_score*100), " \ntime cose : %.3f s" %(time.time()-time_start))
#score : 0.2588100489097338 for 500 terms.

Linear model data can explain data with :  -36.057%   
time cose : 0.074 s


In [24]:
from sklearn.grid_search import GridSearchCV
print("Fitting the classifier to the training set")
#param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['rbf', 'linear']}
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

clf6 = GridSearchCV(LinearSVR(random_state=42), param_grid)
clf6 = clf6.fit(tfidf_train, phrases_train)
print("Best estimator found by grid search:")
print(clf6.best_estimator_)
#it took more than 30min

Fitting the classifier to the training set
Best estimator found by grid search:
LinearSVR(C=0.01, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=42, tol=0.0001, verbose=0)


In [26]:
linear_score = clf6.score(tfidf_test, phrases_test)
print("Linear model data can explain data with :  %.3f%% " % (linear_score*100))

Linear model data can explain data with :  23.274% 


In [None]:
#save model
from sklearn.externals import joblib
>>> joblib.dump(clf, 'filename.pkl') 

#load model
import pickle
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
clf2.predict(X[0:1])