# Import required libraries

In [None]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# Load the data and preprocess it

In [None]:
# Read the raw CSV. The file has no header row (header=None), so pandas
# labels the columns with integers 0, 1, 2, ...
data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    header=None,
)

# Show the first 10 rows as the cell output.
data.head(10)

In [None]:
# Remove columns 1-4 in place; the later cells only use columns 0 and 5.
data.drop(columns=[1, 2, 3, 4], inplace=True)

# Show the first 10 rows of the trimmed frame.
data.head(10)

In [None]:
# Translate the numeric codes in column 0 into readable class names.
label_names = {0: 'Negative', 2: 'Neutral', 4: 'Positive'}
data[0] = data[0].replace(label_names)

# Keep only rows labelled Positive or Negative, then renumber the index
# from 0 so it is contiguous again.
is_polar = data[0].isin(['Positive', 'Negative'])
data = data[is_polar]
data.reset_index(drop=True, inplace=True)

# English stop words as a set for fast membership tests in preprocess().
# NOTE(review): this needs the NLTK 'stopwords' corpus to be installed
# locally -- confirm it has been downloaded in the target environment.
english_stop_list = stopwords.words('english')
stop_words = set(english_stop_list)



In [None]:
stemmer = PorterStemmer()


def preprocess(text):
    """Normalize one text string so it is ready for vectorization.

    The text is lowercased and tokenized with ``word_tokenize``; tokens
    that are not purely alphabetic or that appear in ``stop_words`` are
    discarded; the remaining tokens are Porter-stemmed and joined back
    together with single spaces.

    Stop words are removed *before* stemming. The stop-word set holds
    unstemmed words, and stemming first turns several of them into forms
    that are no longer in the set (for example "this" -> "thi" and
    "was" -> "wa"), so they would otherwise slip through the filter.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text as a single space-separated string. It is empty
        when no token survives the filters.
    """
    tokens = word_tokenize(text.lower())
    kept = [
        token for token in tokens
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(stemmer.stem(token) for token in kept)


# Replace the raw text in column 5 with its preprocessed form.
data[5] = data[5].apply(preprocess)

In [None]:
# Show the first 10 rows of `data` as the cell output.
data.head(10)

# Split data into training and test sets

In [None]:
# Features are the preprocessed text (column 5); targets are the
# class labels (column 0).
X, y = data[5], data[0]

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Vectorize the data

In [None]:
# Learn the vocabulary and IDF weights from the training split only.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)

# Project the test split onto the vocabulary fitted above. Calling
# fit_transform here would refit on the test data, giving a different
# vocabulary whose columns do not match the features the model is
# trained on (and would leak test-set statistics into the features).
X_test_vec = vectorizer.transform(X_test)

# Train and Evaluate the model

In [None]:
# Fit a multinomial Naive Bayes classifier on the vectorized training set.
# fit() returns the estimator itself, so it can be chained.
trainer = MultinomialNB().fit(X_train_vec, y_train)

# Predict labels for the held-out test set and print the per-class
# metrics report.
y_pred = trainer.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print(report)