In [1]:
# Load the cleaned dataset from CSV into a DataFrame
import pandas as pd

DATA_PATH = 'data_cleaned.csv'
df = pd.read_csv(DATA_PATH)

In [2]:
# Construct bag-of-words
from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary limits: skip terms that appear in more than 90% of the documents
# or in fewer than 2 documents, drop English stop words, and keep at most the
# 1000 most frequent remaining terms.
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# The vectorizer expects strings, so coerce the text column.
# NOTE(review): astype(str) may turn missing values into the literal text
# 'nan', which would then be tokenised like any other word - confirm whether
# empty documents should be mapped to '' instead.
df['text_cleaned'] = df['text_cleaned'].astype(str)

# bag-of-words feature matrix (SciPy sparse matrix, one row per document)
bow = bow_vectorizer.fit_transform(df['text_cleaned'])

# Wrap the sparse matrix in a sparse-backed DataFrame for inspection rather
# than calling bow.todense(): a dense copy stores every zero explicitly
# (rows x up to 1000 integer columns) and can exhaust memory on a large corpus.
# Column labels stay the default 0..n-1 integers, as with the dense frame.
df_bow = pd.DataFrame.sparse.from_spmatrix(bow)

df_bow


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1599996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1599997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1599998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Train/test split of the bag-of-words features
from sklearn.model_selection import train_test_split

# bow: hold out 10% of the rows for testing; the fixed seed keeps the split repeatable
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    bow,
    df['sentiment'],
    test_size=0.1,
    random_state=42,
)


In [4]:
# Decision tree trained on the bag-of-words features
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

# Entropy split criterion with a fixed seed; fit() returns the fitted estimator
dct = DecisionTreeClassifier(random_state=42, criterion='entropy').fit(X_train_bow, y_train_bow)

# Per-class probabilities for every test row (one column per class)
dct_bow = dct.predict_proba(X_test_bow)

dct_bow


array([[0.        , 1.        ],
       [0.83333333, 0.16666667],
       [1.        , 0.        ],
       ...,
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ]])

In [5]:
# (Decision tree) Convert the class probabilities into hard labels for evaluation:
# label 0 when the first probability column exceeds 0.5, otherwise label 1.
dct_bow_result = [0 if probs[0] > 0.5 else 1 for probs in dct_bow]


In [6]:
# %% Logistic regression - BOW
from sklearn.linear_model import LogisticRegression

# L-BFGS solver with a fixed seed; fit() returns the fitted estimator
Log_Reg = LogisticRegression(solver='lbfgs', random_state=0).fit(X_train_bow, y_train_bow)

# Per-class probabilities for every test row (one column per class)
log_bow = Log_Reg.predict_proba(X_test_bow)

log_bow

array([[0.56351477, 0.43648523],
       [0.82117088, 0.17882912],
       [0.60303591, 0.39696409],
       ...,
       [0.98688848, 0.01311152],
       [0.29723905, 0.70276095],
       [0.04347508, 0.95652492]])

In [7]:
# (Logistic regression) Convert the class probabilities into hard labels for evaluation:
# label 0 when the first probability column exceeds 0.5, otherwise label 1.
log_bow_result = [0 if probs[0] > 0.5 else 1 for probs in log_bow]


In [21]:
# Evaluations
from sklearn.metrics import precision_recall_fscore_support

# NOTE: with average='micro' on single-label predictions, precision, recall and
# F-measure are all computed from the same global counts and reduce to plain
# accuracy, so the three values printed per model coincide.
dct_evaluation = precision_recall_fscore_support(y_test_bow, dct_bow_result, average='micro')
log_evaluation = precision_recall_fscore_support(y_test_bow, log_bow_result, average='micro')

# Report both models through one loop so the metric labels cannot drift apart.
for model_name, evaluation in (('dct', dct_evaluation), ('log', log_evaluation)):
    # The result tuple is (precision, recall, f-score, support); support is unused here.
    precision, recall, f_measure = evaluation[:3]
    print(model_name + '_precision = ' + str(precision))
    print(model_name + '_recall = ' + str(recall))
    print(model_name + '_f_measure = ' + str(f_measure))
    print()


dct_presicion = 0.69443125
dct_recall = 0.69443125
dct_f_measure = 0.69443125

log_presicion = 0.74360625
log_recall = 0.74360625
log_f_measure = 0.7436062500000001



In [22]:
# baseline - mark all as the sentiment that occurred more often in training:
from collections import Counter

# Counter.most_common orders entries by frequency, so the first entry is the
# majority label no matter in which order the labels appear in y_train_bow.
# (Indexing list(Counter(...).values()) follows first-seen order rather than
# label order, so comparing positions 0 and 1 could select the minority class.)
majority_sentiment = Counter(y_train_bow).most_common(1)[0][0]

# One constant prediction per test row; sized from y_test_bow so this cell does
# not depend on the decision-tree predictions having been computed.
baseline = [majority_sentiment] * len(y_test_bow)

# Same micro-averaged metrics as the model evaluations, for a like-for-like comparison.
baseline_evaluation = precision_recall_fscore_support(y_test_bow, baseline, average='micro')
print('baseline_precision = ' + str(baseline_evaluation[0]))
print('baseline_recall = ' + str(baseline_evaluation[1]))
print('baseline_f_measure = ' + str(baseline_evaluation[2]))


baseline_presicion = 0.50109375
baseline_recall = 0.50109375
baseline_f_measure = 0.50109375
