In [1]:
import pandas as pd

# Load the movie reviews and key every row by its `id` column.
movies = pd.read_csv('hutto_movies.csv').set_index(['id'])

# Plain-text preview of the first rows, rounded to two decimals.
print(movies.head().round(2))

# Summary statistics (two decimals), shown as the cell's rich output.
movies.describe().round(2)

    sentiment                                               text
id                                                              
1        2.27  The Rock is destined to be the 21st Century's ...
2        3.53  The gorgeously elaborate continuation of ''The...
3       -0.60                     Effective but too tepid biopic
4        1.47  If you sometimes like to go to the movies to h...
5        1.73  Emerges as something rare, an issue movie that...


Unnamed: 0,sentiment
count,10605.0
mean,0.0
std,1.92
min,-3.88
25%,-1.77
50%,-0.08
75%,1.83
max,3.94


In [2]:
# p.66: split every review into tokens with casual_tokenize, count how often
# each token occurs, and display the resulting bag-of-words table.
import pandas as pd
from collections import Counter
from nltk.tokenize import casual_tokenize  # copes with emoticons, punctuation and slang

pd.set_option('display.width', 75)  # display width (in characters) used when printing DataFrames

# One Counter per review: a dict-like {token: number of occurrences} mapping.
bags_of_words = [Counter(casual_tokenize(text)) for text in movies.text]

# from_records turns every distinct token into a column and leaves NaN where a
# review does not contain that token. NaN forces the columns to float, so the
# gaps are filled with 0 and the counts are cast back to integers.
df_bows = pd.DataFrame.from_records(bags_of_words).fillna(0).astype(int)
print(df_bows.shape)
print(df_bows.head())

# Show only the columns for tokens that occur in the first review; the Counter's
# keys have to be materialised as a list before they can be used to select columns.
df_bows.head()[list(bags_of_words[0])]

(10605, 20756)
   The  Rock  is  destined  to  be  the  21st  Century's  new  ...  Ill  \
0    1     1   1         1   2   1    1     1          1    1  ...    0   
1    2     0   1         0   0   0    1     0          0    0  ...    0   
2    0     0   0         0   0   0    0     0          0    0  ...    0   
3    0     0   1         0   4   0    1     0          0    0  ...    0   
4    0     0   0         0   0   0    0     0          0    0  ...    0   

   slummer  Rashomon  dipsticks  Bearable  Staggeringly  ’  ve  \
0        0         0          0         0             0  0   0   
1        0         0          0         0             0  0   0   
2        0         0          0         0             0  0   0   
3        0         0          0         0             0  0   0   
4        0         0          0         0             0  0   0   

   muttering  dissing  
0          0        0  
1          0        0  
2          0        0  
3          0        0  
4          0        0  

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Schwarzenegger,",",Jean,Claud,Van,Damme,or,Steven,Segal,.
0,1,1,1,1,2,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [4]:
# p.67: score sentiment with a machine-learned naive Bayes classifier.
from sklearn.naive_bayes import MultinomialNB

# Train on a boolean target: True when the labelled sentiment is positive.
nb = MultinomialNB().fit(df_bows, movies['sentiment'] > 0)

# predict() returns the False/True class labels; scale them to -4/+4 so they can
# be compared with the sentiment scores, which lie roughly between -4 and 4.
movies['predicted_sentiment'] = nb.predict(df_bows) * 8 - 4

# Absolute difference between the scaled prediction and the labelled score.
movies['error'] = (movies['predicted_sentiment'] - movies['sentiment']).abs()
# The mean is wrapped in a Series before rounding, so it prints in Series form.
print(pd.Series(movies['error'].mean()).round(1))

# 1 when the labelled / predicted sentiment is positive, 0 otherwise.
movies['sentiment_ispositive'] = movies['sentiment'].gt(0).astype(int)
movies['predicted_ispositive'] = movies['predicted_sentiment'].gt(0).astype(int)

# Fraction of reviews whose predicted polarity matches the label.
# NOTE: measured on the same rows the classifier was fitted on (training accuracy).
print((movies['predicted_ispositive'] == movies['sentiment_ispositive']).sum() / len(movies))
movies[['sentiment', 'predicted_sentiment', 'sentiment_ispositive', 'predicted_ispositive']].head(8)

0    2.4
dtype: float64
0.9344648750589345


Unnamed: 0_level_0,sentiment,predicted_sentiment,sentiment_ispositive,predicted_ispositive
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.266667,4,1,1
2,3.533333,4,1,1
3,-0.6,-4,0,0
4,1.466667,4,1,1
5,1.733333,4,1,1
6,2.533333,4,1,1
7,2.466667,4,1,1
8,1.266667,-4,1,0


In [5]:
# p.68: run the classifier that was trained on the movie reviews against the
# product reviews.
products = pd.read_csv('hutto_products.csv')

# Bag-of-words for the product reviews, built the same way as for the movies.
bags_of_words = [Counter(casual_tokenize(text)) for text in products.text]
df_product_bows = pd.DataFrame.from_records(bags_of_words)
df_product_bows = df_product_bows.fillna(0).astype(int)

# Stack the product rows underneath the movie rows so both share one column set.
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same frame
# (same row order, same column order, original index labels kept).
df_all_bows = pd.concat([df_bows, df_product_bows])
# Tokens seen in only one of the two corpora come out as NaN after stacking.
df_all_bows = df_all_bows.fillna(0).astype(int)
print(df_all_bows.columns)

# Keep only the product rows (everything after the movie rows), restricted to the
# movie vocabulary columns, in the order the classifier was fitted on.
df_product_bows = df_all_bows.iloc[len(movies):][df_bows.columns]
print(df_product_bows.shape)
print(df_bows.shape)

# 1 when the labelled / predicted sentiment is positive, 0 otherwise.
products['ispos'] = (products.sentiment > 0).astype(int)
products['predicted_ispositive'] = nb.predict(df_product_bows).astype(int)

# Fraction of product reviews whose predicted polarity matches the label.
print((products.predicted_ispositive == products.ispos).sum() / len(products))
products.head()

Index(['The', 'Rock', 'is', 'destined', 'to', 'be', 'the', '21st',
       'Century's', 'new',
       ...
       'sligtly', 'owner', '81', 'defectively', 'warrranty', 'expire',
       'expired', 'voids', 'baghdad', 'harddisk'],
      dtype='object', length=23302)
(3546, 20756)
(10605, 20756)
0.5572476029328821


Unnamed: 0,id,sentiment,text,ispos,predicted_ispositive
0,1_1,-0.9,troubleshooting ad-2500 and ad-2600 no picture...,0,0
1,1_2,-0.15,"repost from january 13, 2004 with a better fit...",0,0
2,1_3,-0.2,does your apex dvd player only play dvd audio ...,0,0
3,1_4,-0.1,or does it play audio and video but scrolling ...,0,0
4,1_5,-0.5,before you try to return the player or waste h...,0,0
