In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import os

# Import functions for data preprocessing & data preparation
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import string
import nltk
import re

# Read the raw comment export from the relative input directory.
data = pd.read_csv('../input/blackadam-trailer-comments/comments.csv')

# Discard the columns that the rest of this script never reads; `drop`
# returns a new frame, so `data` itself is left untouched.
data1 = data.drop(columns=['Unnamed: 0', 'Likes', 'Time', 'user', 'UserLink'])

# Fetch the VADER lexicon that SentimentIntensityAnalyzer loads on construction.
nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()

# Score every comment exactly once. polarity_scores() returns a dict holding
# the "pos", "neg", "neu" and "compound" values together, so calling it
# separately for each output column would repeat the same analysis four times.
# str(x) keeps non-string cells (e.g. missing values) from raising.
polarity = data1["Comment"].apply(lambda x: sentiments.polarity_scores(str(x)))

# Fan the per-comment score dicts out into one numeric column per component.
data1["Positive"] = polarity.apply(lambda scores: scores["pos"])
data1["Negative"] = polarity.apply(lambda scores: scores["neg"])
data1["Neutral"] = polarity.apply(lambda scores: scores["neu"])
data1['Compound'] = polarity.apply(lambda scores: scores["compound"])

# Turn each compound score into a categorical label:
#   score >= 0.05   -> 'Positive'
#   score <= -0.05  -> 'Negative'
#   anything else   -> 'Neutral'
sentiment = [
    'Positive' if value >= 0.05 else ('Negative' if value <= -0.05 else 'Neutral')
    for value in data1["Compound"]
]
data1["Sentiment"] = sentiment

# The numeric score columns were only needed to derive the label; data2 keeps
# the remaining columns of data1 without them.
data2 = data1.drop(columns=['Positive', 'Negative', 'Neutral', 'Compound'])

# Download every NLTK resource the text preprocessing in this script relies on.
# The stop-word list and the Open Multilingual WordNet data were already
# requested; the WordNet corpus (used by WordNetLemmatizer) and the Punkt
# tokenizer models (used by word_tokenize) were not, so the script only worked
# on machines where they happened to be pre-installed.
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('punkt')
# Newer NLTK releases look up 'punkt_tab' instead of 'punkt'. On releases that
# do not know this id, download() just reports the failure and returns False.
nltk.download('punkt_tab')

# Initialize text processing tools
stop_words = set(stopwords.words('english'))  # set => O(1) membership tests
lzr = WordNetLemmatizer()

def text_processing(text):
    """Normalize one comment string for bag-of-words vectorization.

    The text is lower-cased, then cleaned by a fixed sequence of regex
    substitutions (newlines -> space, ASCII punctuation removed, remaining
    non-word characters -> space, whitespace runs collapsed) and stripped.
    The result is tokenized with ``word_tokenize``; tokens found in the
    module-level ``stop_words`` set are dropped and the rest are lemmatized
    with the module-level ``lzr`` lemmatizer.

    Args:
        text: The comment as a ``str``.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    cleaned = text.lower()

    # Order matters: punctuation is deleted outright (joining the characters
    # around it) before the remaining non-word characters become spaces.
    substitutions = (
        (r'\n', ' '),
        (f'[{re.escape(string.punctuation)}]', ''),
        (r'\W', ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    kept_tokens = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        kept_tokens.append(lzr.lemmatize(token))
    return ' '.join(kept_tokens)

# Clean the comment text on a copy so data2 keeps the raw comments.
# Cells are cast to str first because text_processing calls str methods.
data_copy = data2.copy()
data_copy['Comment'] = data_copy['Comment'].astype(str).map(text_processing)

# Replace the string labels with integer codes. LabelEncoder numbers the
# classes in sorted order of the label values it is fitted on.
le = LabelEncoder()
le.fit(data_copy['Sentiment'])
data_copy['Sentiment'] = le.transform(data_copy['Sentiment'])

# Final modelling table: cleaned text under 'Sentence' plus the encoded label.
processed_data = (
    data_copy[['Comment', 'Sentiment']]
    .rename(columns={'Comment': 'Sentence'})
)

# Hold out the evaluation rows BEFORE any resampling or vocabulary fitting.
# Upsampling with replacement duplicates rows; if that happens ahead of the
# split, copies of the same comment land in both the training and the test
# set and the reported accuracy is inflated. Fitting the vectorizer on all
# rows leaks test vocabulary in the same way.
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(processed_data, test_size=0.3, random_state=0)

# Balance the TRAINING rows only. Codes 0 / 1 / 2 are treated as the
# negative / neutral / positive classes, as the variable names indicate.
df_negative = train_data[train_data['Sentiment'] == 0]
df_neutral = train_data[train_data['Sentiment'] == 1]
df_positive = train_data[train_data['Sentiment'] == 2]

# The positive class is kept as-is and the other two are resampled to its
# size. The size is taken from the data instead of a hard-coded count
# (previously 205 -- presumably the positive-class size of the full dataset;
# that number no longer applies to the training subset).
n_target = len(df_positive)
df_negative_upsampled = resample(df_negative, replace=True, n_samples=n_target, random_state=42)
df_neutral_upsampled = resample(df_neutral, replace=True, n_samples=n_target, random_state=42)

# Shuffle after concatenating so the training rows are not grouped by class,
# which matters for any order-sensitive estimator fitted on them later.
final_data = pd.concat(
    [df_negative_upsampled, df_neutral_upsampled, df_positive]
).sample(frac=1, random_state=0)

# Convert text data to numerical features: the vocabulary is learned from the
# training corpus only and then applied unchanged to the held-out comments.
corpus = final_data['Sentence'].tolist()
cv = CountVectorizer(max_features=1500)
X_train = cv.fit_transform(corpus).toarray()
y_train = final_data['Sentiment'].values
X_test = cv.transform(test_data['Sentence'].tolist()).toarray()
y_test = test_data['Sentiment'].values

# `X` / `y` are kept as names for any later code that refers to them; they now
# alias the balanced training features and labels.
X, y = X_train, y_train

# Fit a Gaussian Naive Bayes model on the training features and score it on
# the held-out features.
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB().fit(X_train, y_train)  # fit() returns the estimator itself

# Predict once and derive both evaluation artefacts from the same predictions.
y_pred = classifier.predict(X_test)
nb_score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)  # computed for inspection; not printed here
print('Accuracy:', nb_score)
