In [1]:
import numpy as np                   
import pandas as pd                  
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
model = MultinomialNB()

In [2]:
data = pd.read_csv("Apple-Twitter-Sentiment-DFE.csv", sep=';', encoding='utf-8', header = 0)

In [3]:
headers = data.iloc[0]
data.columns = headers
data = data.iloc[1:]

In [20]:
filtered_data = pd.DataFrame()
filtered_data['sentiment'] = data['sentiment']
filtered_data['sentiment confidence'] = data['sentiment:confidence']
filtered_data['text'] = data['text']
text_data = pd.DataFrame({'text': filtered_data['text']})
text_data.shape

(3886, 1)

In [5]:
def create_vocabulary_list(data):
    vocabulary = set()
    for text in data['text']:
        words = str(text).lower().split()
        vocabulary.update(words)

    vocabulary = {word for word in vocabulary if "http://" not in word}
    vocabulary = {word for word in vocabulary if "https" not in word}
    vocabulary = {word for word in vocabulary if "@" not in word}
    return vocabulary

In [6]:
def create_document_term_matrix(data, vocabulary_list):

    document_word_counts = []
    
    for _, row in data.iterrows():
        document = str(row['text']).lower()
        word_counts = {word: document.count(word) for word in vocabulary_list}
        document_word_counts.append(word_counts)

    # Create a DataFrame from the list of dictionaries
    document_term_matrix = pd.DataFrame(document_word_counts)
    document_term_matrix.index = range(1, len(document_term_matrix) + 1)
    document_term_matrix = document_term_matrix.fillna(0)  # Fill NaN values with 0
    return document_term_matrix

In [7]:
def combine_two_data_frames(df1, df2):
    df2.reset_index(drop=True, inplace=True)
    df1.reset_index(drop=True, inplace=True)

    # Combine the two DataFrames horizontally
    combined_df = pd.concat([df1, df2], axis=1)
    combined_df = combined_df[combined_df['sentiment'] != "not_relevant"]

    # Reset the index of the DataFrame
    combined_df.reset_index(drop=True, inplace=True)
    return combined_df

In [8]:
def check_text_with_document_term_matrix(row_index):

    # Check if the specified row contains '1.0' and print the corresponding column names
    value_to_check = 1.0
    columns_with_1 = combined_df.columns[combined_df.iloc[row_index] == value_to_check].tolist()

    print(combined_df.loc[row_index, 'text'])
    print(combined_df.loc[row_index, 'sentiment'])
    print(columns_with_1)

In [9]:
vocabulary_list = create_vocabulary_list(text_data)

In [10]:
document_term_matrix = create_document_term_matrix(text_data, vocabulary_list)

In [11]:
combined_df = combine_two_data_frames(filtered_data, document_term_matrix)

In [12]:
check_text_with_document_term_matrix(1)

text    RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
text                                                    0
Name: 1, dtype: object
3
['today', 'ad', 'ck', 'ay', '.', '#aapl', 'fl', ':/', 'crash', 'day', '9', 'why', 'n', 'stock', 'flash', '#a', 'ha', 's:', 'log', 'min', 'in', 'st', 'ya', 'w', 'de', 'm', '$aapl', 'had', 'rt', 'da', '#', '-', '$', 'la', '//', 'mini-flash', '0', 'mini']


In [13]:
X = combined_df.iloc[:, 3:]
y = combined_df['sentiment']

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.5)

In [15]:
model.fit(X_train,Y_train)

In [16]:
y_pred = model.predict(X_test)
from sklearn.metrics import classification_report
classification = classification_report(Y_test,y_pred)
print(classification)

              precision    recall  f1-score   support

           1       0.65      0.77      0.71       617
           3       0.75      0.82      0.78      1066
           5       0.81      0.06      0.11       219

    accuracy                           0.71      1902
   macro avg       0.74      0.55      0.53      1902
weighted avg       0.73      0.71      0.68      1902

