In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
# Load the training and test splits from their CSV files
train_data = pd.read_csv(filepath_or_buffer="train_file.csv")
test_data = pd.read_csv(filepath_or_buffer="test_file.csv")

In [3]:
import string
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus (reported as up-to-date when already present)
nltk.download('stopwords')
# Hold the English stop words in a set for fast membership checks
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/u189689/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def preprocess_text(text):
    """Normalize a piece of text for bag-of-words vectorization.

    Lower-cases the text, strips ASCII punctuation, splits on whitespace,
    drops every token found in the module-level ``stop_words`` set and
    re-joins the remaining tokens with single spaces.

    Args:
        text: The raw string to clean. Non-string values (for example the
            float NaN that pandas uses for empty CSV cells, or None) are
            treated as empty text.

    Returns:
        The cleaned, space-separated string; '' when nothing is left.
    """
    # Missing cells arrive as float NaN / None, which have no .lower();
    # map them to empty text instead of raising AttributeError.
    if not isinstance(text, str):
        return ''
    stripped = text.lower().translate(str.maketrans('', '', string.punctuation))
    return ' '.join(token for token in stripped.split() if token not in stop_words)

In [5]:
# Clean the 'Title' column of both splits with preprocess_text
train_data['processed_title'] = train_data['Title'].map(preprocess_text)
test_data['processed_title'] = test_data['Title'].map(preprocess_text)

In [6]:
# Clean the 'Headline' column of both splits with preprocess_text
train_data['processed_headline'] = train_data['Headline'].map(preprocess_text)
test_data['processed_headline'] = test_data['Headline'].map(preprocess_text)

In [7]:
# Build the model input: cleaned title and cleaned headline joined by one space
train_data['text'] = train_data['processed_title'].str.cat(train_data['processed_headline'], sep=' ')
test_data['text'] = test_data['processed_title'].str.cat(test_data['processed_headline'], sep=' ')

In [8]:
# Bag-of-words features: the vocabulary is learned from the training text only,
# and the test text is then encoded with that same fitted vectorizer.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['text'])
X_test = vectorizer.transform(test_data['text'])

In [9]:
# Binarize the sentiment scores: 1 when the score is >= 0, otherwise 0
train_data['sentiment_title_label'] = train_data['SentimentTitle'].ge(0).astype('int64')
train_data['sentiment_headline_label'] = train_data['SentimentHeadline'].ge(0).astype('int64')

In [10]:
# Combine the title and headline labels into a single label: keep the title
# label when it is non-zero, otherwise fall back to the headline label.
# Series.where does this column-wise, avoiding a slow row-by-row apply(axis=1).
train_data['sentiment_label'] = train_data['sentiment_title_label'].where(
    train_data['sentiment_title_label'] != 0,
    train_data['sentiment_headline_label'],
)

In [11]:
# Fit a multinomial Naive Bayes classifier on the vectorized training text
# (all rows of train_data) against the combined sentiment label.
model = MultinomialNB()
model.fit(X_train, train_data['sentiment_label'])

MultinomialNB()

In [12]:
# Store the fitted model's predicted label for every test row
test_data['predicted_sentiment_label'] = model.predict(X_test)

In [16]:
# Split the training data into a training set and a validation set.
# The full matrix is rebuilt from the fitted vectorizer rather than read from
# X_train, because this cell rebinds X_train to the 80% split: re-running it
# would otherwise pair the already-shrunk X_train with the full label column
# and fail with an inconsistent-sample-count error.
from sklearn.model_selection import train_test_split
X_full = vectorizer.transform(train_data['text'])
X_train, X_val, y_train, y_val = train_test_split(X_full, train_data['sentiment_label'], test_size=0.2, random_state=42)

In [17]:
# Train a fresh MultinomialNB on the training split only.
# NOTE(review): this rebinds `model`, replacing the classifier fitted earlier
# that was used to produce test_data['predicted_sentiment_label'].
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [18]:
# Predict the sentiment labels for the held-out validation rows
y_pred = model.predict(X_val)

In [19]:
# Score the validation predictions against the true validation labels
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.7454187896665773
