# Sentiment Analysis

## Corpus

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split

### Import files

In [19]:
# Load the training and evaluation CSVs (paths are relative to the working directory).
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("sa.csv", "test.csv"))

In [20]:
# Display the loaded training DataFrame as the cell output.
df_train

Unnamed: 0,Title,Tone
0,# SpeakYourMind : YEP hosts free mental health...,1
1,Ed Sheeran and Ella Eyre lead the way as stars...,1
2,Day of play to say You Matter : youths host fi...,1
3,Bryony Gordon helps Meet Cambridge raise over ...,1
4,Patients to benefit from mental health unit in...,1
5,Share a Cuppa this World Mental Health Day at ...,1
6,Dallas - Fort Worth Hospital Council Foundatio...,1
7,Thinking about mental health month,1
8,Nine in 10 united kingdom Workers Touched by M...,1
9,Longford locals to light up night for mental h...,1


### Convert to a list of records

In [21]:
# Turn every row into a {'text', 'sentiment'} record; the Tone value is
# stringified so the class labels are strings.
corpus_train = [
    {'text': record['Title'], 'sentiment': str(record['Tone'])}
    for _, record in df_train.iterrows()
]
corpus_test = [
    {'text': record['Title'], 'sentiment': str(record['Tone'])}
    for _, record in df_test.iterrows()
]

### Create train and test data

In [22]:
# Rebuild one DataFrame per split from the record lists.
train_corpus = pd.DataFrame.from_dict(corpus_train)
test_corpus = pd.DataFrame.from_dict(corpus_test)

# Features are the headline text; targets are the string sentiment labels.
X_train, y_train = train_corpus.text, train_corpus.sentiment
X_test, y_test = test_corpus.text, test_corpus.sentiment

## Testing and predicting using TF-IDF, hashing, or count vectorizers

In [23]:
# Hand-written headlines used as ad-hoc prediction inputs.
testing = [
    'Teenager being offensive towards patient with autism',
    'New mental institution in U.S',
    'A lot of people get benefits from the new mental health institution',
    'Man suffered from depression',
    'Free mental health aid',
]

## TF-IDF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with English stop words removed. The vectorizer is fitted
# on the training titles only and then applied unchanged to the test titles.
tfidf_vect = TfidfVectorizer(stop_words="english")
X_train_dtm, X_test_dtm = (
    tfidf_vect.fit_transform(X_train),
    tfidf_vect.transform(X_test),
)

In [25]:
# Print the term -> column-index mapping learned by the TF-IDF vectorizer.
print(tfidf_vect.vocabulary_)

{'speakyourmind': 133, 'yep': 166, 'hosts': 71, 'free': 59, 'mental': 95, 'health': 65, 'aid': 8, 'course': 33, 'leeds': 84, 'ed': 45, 'sheeran': 129, 'ella': 46, 'eyre': 50, 'lead': 83, 'way': 157, 'stars': 136, 'throw': 147, 'weight': 159, 'music': 100, 'gog': 60, 'day': 37, 'play': 109, 'say': 124, 'matter': 92, 'youths': 169, 'host': 70, 'time': 149, 'event': 47, 'bryony': 17, 'gordon': 62, 'helps': 67, 'meet': 94, 'cambridge': 19, 'raise': 116, '2k': 3, 'charity': 22, 'business': 18, 'weekly': 158, 'technology': 145, 'news': 102, 'patients': 107, 'benefit': 16, 'unit': 153, 'investment': 76, 'share': 128, 'cuppa': 34, 'world': 161, 'thyme': 148, 'lake': 82, 'dallas': 36, 'fort': 57, 'worth': 163, 'hospital': 69, 'council': 32, 'foundation': 58, 'receive': 117, 'hhsc': 68, 'grant': 63, 'state': 137, 'reform': 118, 'thinking': 146, 'month': 97, '10': 0, 'united': 154, 'kingdom': 81, 'workers': 160, 'touched': 152, 'challenges': 21, 'accenture': 4, 'research': 120, 'finds': 54, 'long

In [26]:
# Show the first rows of the TF-IDF matrix as a dense table, one column per
# vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
tfidf_feature_names = (
    tfidf_vect.get_feature_names_out()
    if hasattr(tfidf_vect, "get_feature_names_out")
    else tfidf_vect.get_feature_names()
)
pd.DataFrame(X_train_dtm.toarray(), columns=tfidf_feature_names).head()

Unnamed: 0,10,102,1240,2k,accenture,adults,affects,african,aid,alcoholism,...,workers,world,worsens,worth,yates,year,yep,york,youth,youths
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.371174,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.371174,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.358173
3,0.0,0.0,0.0,0.239468,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Naive Bayes

In [27]:
from sklearn.naive_bayes import MultinomialNB
# Single classifier instance; the later HASHING and COUNT sections call
# nb.fit() on this same object.
nb = MultinomialNB()

In [28]:
from sklearn import metrics

%time nb.fit(X_train_dtm, y_train)
#Prediksi
y_pred_class = nb.predict(X_test_dtm)
print("Prediksi: ", y_pred_class)
print("Benar: ", y_test.values)
#Nilai akurasi, presisi
akurasi = metrics.accuracy_score(y_test, y_pred_class)
presisi = metrics.precision_score(y_test, y_pred_class, average='weighted')
recall = metrics.recall_score(y_test, y_pred_class, average='weighted')
conf_matrix = metrics.confusion_matrix(y_test, y_pred_class)
print("Akurasi: ", akurasi, "\nPresisi: ", presisi, "\nRecall: ", recall, "\nConfussion Matrix:\n", conf_matrix)

Wall time: 2 ms
Prediksi:  ['1' '1' '1' '0' '-1' '0' '-1' '-1' '-1']
Benar:  ['1' '1' '1' '0' '0' '0' '-1' '-1' '-1']
Akurasi:  0.8888888888888888 
Presisi:  0.9166666666666666 
Recall:  0.8888888888888888 
Confussion Matrix:
 [[3 0 0]
 [1 2 0]
 [0 0 3]]


In [30]:
# Ad-hoc headlines to classify with the TF-IDF features.
testing = [
    'Teenager being offensive towards patient with autism',
    'New mental institution in U.S',
    'A lot of people get benefits from the new mental health institution',
    'Man suffered from depression',
    'Free mental health aid',
]

# Vectorize with the already-fitted TF-IDF vocabulary, then predict.
new_article_vect = tfidf_vect.transform(raw_documents=testing)
nb.predict(X=new_article_vect)

array(['-1', '0', '-1', '-1', '1'], dtype='<U2')

## HASHING

In [47]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hashed bag-of-words features with English stop words removed.
# alternate_sign=False is passed so the hashed values are not sign-flipped.
hash_vect = HashingVectorizer(stop_words="english", alternate_sign=False)
X_train_dtm_hash, X_test_dtm_hash = (
    hash_vect.fit_transform(X_train),
    hash_vect.transform(X_test),
)

In [48]:
# Report the dimensions of the hashed training matrix.
print(X_train_dtm_hash.shape)
# NOTE(review): toarray() builds a full dense copy of the matrix only to print
# it. With the (30, 1048576) shape reported above that is a large allocation
# (roughly 250 MB if the dtype is float64 - confirm); consider printing the
# sparse matrix or a small slice instead.
print(X_train_dtm_hash.toarray())

(30, 1048576)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [49]:
%time nb.fit(X_train_dtm_hash, y_train)
#Prediksi
y_pred_class = nb.predict(X_test_dtm_hash)
print("Prediksi: ", y_pred_class)
print("Benar: ", y_test.values)
#Nilai akurasi, presisi
akurasi = metrics.accuracy_score(y_test, y_pred_class)
presisi = metrics.precision_score(y_test, y_pred_class, average='weighted')
recall = metrics.recall_score(y_test, y_pred_class, average='weighted')
conf_matrix = metrics.confusion_matrix(y_test, y_pred_class)
print("Akurasi: ", akurasi, "\nPresisi: ", presisi, "\nRecall: ", recall, "\nConfussion Matrix:\n", conf_matrix)

Wall time: 982 ms
Prediksi:  ['1' '1' '1' '1' '-1' '0' '-1' '-1' '0']
Benar:  ['1' '1' '1' '0' '0' '0' '-1' '-1' '-1']
Akurasi:  0.6666666666666666 
Presisi:  0.6388888888888888 
Recall:  0.6666666666666666 
Confussion Matrix:
 [[2 1 0]
 [1 1 1]
 [0 0 3]]


In [50]:
# PREDIKSI LIST TITLE (VAR TESTING)
new_article_vect = hash_vect.transform(testing)
nb.predict(new_article_vect)

array(['-1', '0', '-1', '-1', '1'], dtype='<U2')

## COUNT

In [56]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts with English stop words removed. The vocabulary is learned
# from the training titles only and reused for the test titles.
count_vect = CountVectorizer(stop_words="english")
X_train_dtm_count, X_test_dtm_count = (
    count_vect.fit_transform(X_train),
    count_vect.transform(X_test),
)

In [57]:
# Print the term -> column-index mapping learned by the count vectorizer.
print(count_vect.vocabulary_)

{'speakyourmind': 133, 'yep': 166, 'hosts': 71, 'free': 59, 'mental': 95, 'health': 65, 'aid': 8, 'course': 33, 'leeds': 84, 'ed': 45, 'sheeran': 129, 'ella': 46, 'eyre': 50, 'lead': 83, 'way': 157, 'stars': 136, 'throw': 147, 'weight': 159, 'music': 100, 'gog': 60, 'day': 37, 'play': 109, 'say': 124, 'matter': 92, 'youths': 169, 'host': 70, 'time': 149, 'event': 47, 'bryony': 17, 'gordon': 62, 'helps': 67, 'meet': 94, 'cambridge': 19, 'raise': 116, '2k': 3, 'charity': 22, 'business': 18, 'weekly': 158, 'technology': 145, 'news': 102, 'patients': 107, 'benefit': 16, 'unit': 153, 'investment': 76, 'share': 128, 'cuppa': 34, 'world': 161, 'thyme': 148, 'lake': 82, 'dallas': 36, 'fort': 57, 'worth': 163, 'hospital': 69, 'council': 32, 'foundation': 58, 'receive': 117, 'hhsc': 68, 'grant': 63, 'state': 137, 'reform': 118, 'thinking': 146, 'month': 97, '10': 0, 'united': 154, 'kingdom': 81, 'workers': 160, 'touched': 152, 'challenges': 21, 'accenture': 4, 'research': 120, 'finds': 54, 'long

In [58]:
# Show the first rows of the term-count matrix as a dense table, one column
# per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
count_feature_names = (
    count_vect.get_feature_names_out()
    if hasattr(count_vect, "get_feature_names_out")
    else count_vect.get_feature_names()
)
pd.DataFrame(X_train_dtm_count.toarray(), columns=count_feature_names).head()

Unnamed: 0,10,102,1240,2k,accenture,adults,affects,african,aid,alcoholism,...,workers,world,worsens,worth,yates,year,yep,york,youth,youths
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
%time nb.fit(X_train_dtm_count, y_train)
#Prediksi
y_pred_class = nb.predict(X_test_dtm_count)
print("Prediksi: ", y_pred_class)
print("Benar: ", y_test.values)
#Nilai akurasi, presisi
akurasi = metrics.accuracy_score(y_test, y_pred_class)
presisi = metrics.precision_score(y_test, y_pred_class, average='weighted')
recall = metrics.recall_score(y_test, y_pred_class, average='weighted')
conf_matrix = metrics.confusion_matrix(y_test, y_pred_class)
print("Akurasi: ", akurasi, "\nPresisi: ", presisi, "\nRecall: ", recall, "\nConfussion Matrix:\n", conf_matrix)

Wall time: 2 ms
Prediksi:  ['1' '1' '1' '1' '-1' '0' '-1' '-1' '-1']
Benar:  ['1' '1' '1' '0' '0' '0' '-1' '-1' '-1']
Akurasi:  0.7777777777777778 
Presisi:  0.8333333333333334 
Recall:  0.7777777777777778 
Confussion Matrix:
 [[3 0 0]
 [1 1 1]
 [0 0 3]]


In [60]:
# PREDIKSI LIST TITLE (VAR TESTING)
new_article_vect = count_vect.transform(testing)
nb.predict(new_article_vect)

array(['-1', '0', '-1', '-1', '1'], dtype='<U2')