In [3]:
import warnings
# Install a blanket "ignore" filter: from this point on no Python warning of any
# category is shown. NOTE(review): this also hides library diagnostics such as
# deprecation or convergence warnings, so problems in later cells can pass silently.
warnings.filterwarnings("ignore")

In [4]:
import os
import re
import string
from pathlib import Path

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Read data
# Directory holding train.txt / val.txt. Defaults to the original location but can
# be redirected with the SENTIMENT_DATA_DIR environment variable, so the notebook
# is not tied to one machine's absolute path.
DATA_DIR = Path(
    os.environ.get(
        "SENTIMENT_DATA_DIR",
        r"C:\Users\manik\OneDrive\Documents\Data_sets\sentimental_analysis",
    )
)


def _read_split(filename):
    """Read one ';'-separated, header-less file from DATA_DIR into a DataFrame
    with the columns 'text' and 'label'."""
    return pd.read_csv(DATA_DIR / filename, delimiter=";", names=['text', 'label'])


df1 = _read_split("train.txt")
df2 = _read_split("val.txt")
# ignore_index=True renumbers the rows 0..n-1; without it both inputs contribute
# their own 0-based labels and the combined frame has duplicate index values.
df = pd.concat([df1, df2], ignore_index=True)

# Tokenization
def tokenize(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Lowercasing
def to_lowercase(tokens):
    """Return a new list containing the lower-cased form of every token.

    The input sequence is not modified.
    """
    lowered = []
    for word in tokens:
        lowered.append(word.lower())
    return lowered

# Remove punctuation
# Translation table that deletes every character in string.punctuation. It never
# changes, so it is built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(tokens):
    """Strip ASCII punctuation characters from each token.

    Tokens that consist only of punctuation (for example "!" or "...") become
    empty after stripping and are dropped, so the result never contains empty
    strings.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list of the non-empty stripped tokens, in their original order.
    """
    stripped = (token.translate(_PUNCTUATION_TABLE) for token in tokens)
    return [token for token in stripped if token]

# Remove stopwords
# Lazily filled cache of the English stopword set; None until first use.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stopword set, loading it from the NLTK corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(tokens):
    """Return the tokens whose lower-cased form is not an English stopword.

    The stopword list is loaded on the first call and reused afterwards; the
    function is applied once per row, so reloading it each time would repeat
    the same corpus read for every document.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with the stopwords removed, original order preserved.
    """
    stop_words = _english_stop_words()
    return [token for token in tokens if token.lower() not in stop_words]

# Lemmatization
# One shared lemmatizer instance, reused for every call below.
lemmatizer = WordNetLemmatizer()
def lemmatize(tokens):
    """Return a new list with each token replaced by the result of
    ``lemmatizer.lemmatize`` (called with its default arguments)."""
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Apply preprocessing steps
def preprocess_text(text):
    """Run the whole cleaning pipeline on one raw string.

    Steps, in order: tokenize, lowercase, strip punctuation, remove English
    stopwords, lemmatize. The resulting tokens are joined back into a single
    space-separated string.

    Args:
        text: raw input string.

    Returns:
        The cleaned text as one string.
    """
    tokens = tokenize(text)
    tokens = to_lowercase(tokens)
    tokens = remove_punctuation(tokens)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize(tokens)
    # Convert the token list back to a string
    return ' '.join(tokens)


# A single pass over the column; the per-row result is the same as applying the
# steps one column-wide pass at a time, without the intermediate list-valued columns.
# NOTE: this overwrites df['text'] in place, so re-running the cell cleans the
# already-cleaned text again.
df['text'] = df['text'].apply(preprocess_text)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Upper bound on the TF-IDF vocabulary size.
TFIDF_MAX_FEATURES = 1000
tfidf_vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)

# Fit the vectorizer on the cleaned text and encode every document.
# NOTE(review): the fit uses the whole frame, i.e. before the train/test split made
# in a later cell, so the held-out rows influence the vocabulary and IDF weights.
x = tfidf_vectorizer.fit_transform(df['text'])

# Dense array version of the feature matrix; later cells split and train on X.
X = x.toarray()

In [6]:
from sklearn.preprocessing import LabelEncoder
# Learn the label vocabulary, then map each label to its integer class id.
# The fitted encoder is kept so predictions can be mapped back to label names.
encoder = LabelEncoder()
encoder.fit(df['label'])
y = encoder.transform(df['label'])

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split (and every accuracy printed below) is
# the same on each run; stratify=y keeps the class proportions equal in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [8]:
from sklearn.linear_model import LogisticRegression
# Warnings are silenced at the top of the notebook, so a solver that stops at its
# iteration cap would do so without any visible message. Give it a generous cap
# instead of relying on the default.
model = LogisticRegression(max_iter=1000)
model.fit(x_train,y_train)

In [9]:
from sklearn.metrics import accuracy_score
# Predict the held-out rows and report the fraction classified correctly.
y_pred = model.predict(x_test)
logreg_accuracy = accuracy_score(y_test, y_pred)
print(logreg_accuracy)

0.8497222222222223


In [10]:
example_texts = [
    "I absolutely loved this movie!",
    "The customer service was terrible. I will never go back to that restaurant again.",
    "Today's weather is beautiful.",
    "The traffic was unbearable this morning.",
    "The book was difficult to understand, but it was worth the effort.",
    "The new smartphone is amazing. I'm so impressed with its features.",
    "I'm feeling really happy and excited today!",
    "The internet connection is so slow. It's frustrating.",
    "The concert last night was fantastic. The band played all my favorite songs.",
    "I'm feeling really anxious and stressed about the upcoming exam."
]

# Cleaning steps in the order they are applied; the first takes the raw string,
# each following step takes the token list produced by the previous one.
preprocessing_steps = (tokenize, to_lowercase, remove_punctuation, remove_stopwords, lemmatize)

for text in example_texts:
    # Preprocess the new text
    new_text_tokens = text
    for step in preprocessing_steps:
        new_text_tokens = step(new_text_tokens)

    # Vectorize it with the already-fitted TF-IDF vectorizer
    cleaned_text = ' '.join(new_text_tokens)
    new_text_vectorized = tfidf_vectorizer.transform([cleaned_text]).toarray()

    # Predict the class id, then convert it back to the label category
    new_text_prediction = model.predict(new_text_vectorized)
    predicted_label_category = encoder.inverse_transform(new_text_prediction)

    print(f"Text: {text} - Predicted Label: {predicted_label_category[0]}")


Text: I absolutely loved this movie! - Predicted Label: joy
Text: The customer service was terrible. I will never go back to that restaurant again. - Predicted Label: sadness
Text: Today's weather is beautiful. - Predicted Label: joy
Text: The traffic was unbearable this morning. - Predicted Label: joy
Text: The book was difficult to understand, but it was worth the effort. - Predicted Label: sadness
Text: The new smartphone is amazing. I'm so impressed with its features. - Predicted Label: surprise
Text: I'm feeling really happy and excited today! - Predicted Label: joy
Text: The internet connection is so slow. It's frustrating. - Predicted Label: joy
Text: The concert last night was fantastic. The band played all my favorite songs. - Predicted Label: joy
Text: I'm feeling really anxious and stressed about the upcoming exam. - Predicted Label: sadness


In [11]:
from sklearn.svm import SVC
# Linear-kernel SVM fitted on the same training split used for the logistic regression.
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(x_train, y_train)

# Accuracy of the SVM on the held-out split.
y_svm_pred = svm_model.predict(x_test)
svm_accuracy = accuracy_score(y_test, y_svm_pred)
print(svm_accuracy)

0.8591666666666666


In [23]:
import pickle
# Persist the classifier together with its label encoder. The (model, encoder)
# tuple layout is unchanged, so code that unpacks two items from this file still works.
with open("sentiment_analysis_encoder.pkl","wb") as f:
    pickle.dump((svm_model,encoder),f)

# Also persist the fitted TF-IDF vectorizer. The model can only score text that was
# encoded with this exact vocabulary and IDF weights, so without it the saved model
# cannot be used on new text in a fresh session.
with open("sentiment_analysis_vectorizer.pkl","wb") as f:
    pickle.dump(tfidf_vectorizer,f)

In [24]:
# Clean one sentence with the same sequence of steps used on the training text,
# then encode it with the already-fitted TF-IDF vectorizer.
new_text_tokens = "hello i am very pleased today ."
for cleaning_step in (tokenize, to_lowercase, remove_punctuation, remove_stopwords, lemmatize):
    new_text_tokens = cleaning_step(new_text_tokens)
cleaned_sentence = ' '.join(new_text_tokens)
new_text_vectorized = tfidf_vectorizer.transform([cleaned_sentence]).toarray()

In [27]:
# Reload the saved (model, encoder) pair. pickle.load executes whatever the file
# contains, so only use it on files this notebook wrote itself.
with open("sentiment_analysis_encoder.pkl", "rb") as artifact_file:
    loaded_bundle = pickle.load(artifact_file)
loaded_model, loaded_encoder = loaded_bundle

# Classify the vector prepared in the previous cell and map the id back to its label.
predicted_ans = loaded_model.predict(new_text_vectorized)
predicted_label_category = loaded_encoder.inverse_transform(predicted_ans)
predicted_label_category

array(['joy'], dtype=object)

Naive Bayes accuracy - 80%
Random forest accuracy - 81%
Logistic regression accuracy - 84%