In [84]:
import math
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [85]:
# Load the comments dataset from a CSV file resolved relative to the current
# working directory.
youtube = pd.read_csv('YoutubeCommentsDataSet.csv')

In [86]:
# (rows, columns) of the freshly loaded frame.
youtube.shape

(18408, 2)

In [87]:
# Count rows that repeat an earlier row across ALL columns; .item() unwraps
# the NumPy integer into a plain Python int for display.
youtube.duplicated().sum().item()

531

In [88]:
# Keep only the first row for each distinct Comment value.
# NOTE(review): this de-duplicates on Comment alone, whereas the duplicate
# count computed earlier compares every column. A comment that appears more
# than once with different Sentiment values keeps only its first occurrence.
youtube = youtube.drop_duplicates(subset=['Comment'],keep='first')

In [89]:
# Number of missing values in each column.
youtube.isnull().sum()

Comment      1
Sentiment    0
dtype: int64

In [90]:
# Drop rows whose Comment is missing; only the Comment column is checked.
youtube  = youtube.dropna(subset=['Comment'])

In [91]:
# Re-check the per-column missing counts after dropping rows.
youtube.isnull().sum()

Comment      0
Sentiment    0
dtype: int64

In [92]:
# (rows, columns) after de-duplication and removal of missing comments.
youtube.shape

(17871, 2)

In [93]:
import re
import string

In [94]:
import re
import string
from nltk.corpus import stopwords

# Translation table that deletes every character in string.punctuation.
# Built once because it never changes between calls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _cleaning_resources():
    """Return the (stop-word set, stemmer) pair shared by every text_clean call.

    Loaded lazily on first use and then cached, so the stop-word list is not
    re-read for every row when text_clean is applied to a whole column.
    """
    stop_words = set(stopwords.words('english'))
    stop_words.discard('not')  # keep "not" so negations survive cleaning
    return frozenset(stop_words), PorterStemmer()


def text_clean(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; remove URLs (``http`` up to the next
    whitespace); remove digit runs; delete ``string.punctuation`` characters;
    drop NLTK English stop words except "not"; Porter-stem what remains.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (e.g. a missing value in a pandas
        column) are treated as empty. Any other non-string value is converted
        with ``str()`` instead of raising ``AttributeError``.

    Returns
    -------
    str
        The cleaned text; ``""`` when the input is missing or nothing
        survives cleaning.
    """
    # Missing values: None, or NaN (which is a float, so `is None` misses it).
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()

    # Remove URLs and numbers
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\d+", "", text)

    # Remove punctuation.
    # NOTE(review): string.punctuation is ASCII-only, so typographic
    # apostrophes/quotes (e.g. ’) pass through untouched - confirm whether
    # that is intended.
    text = text.translate(_PUNCTUATION_TABLE)

    # Filter stop words and stem in a single pass over the tokens.
    stop_words, stemmer = _cleaning_resources()
    tokens = [stemmer.stem(word) for word in text.split() if word not in stop_words]

    return ' '.join(tokens).strip()


In [107]:
# Quick sanity check of text_clean on a short sentence.
text_clean("I never like this place")

'never like place'

In [96]:
# Replace every raw comment with its cleaned form.
# NOTE: this overwrites the Comment column in place, so the raw text is no
# longer available afterwards and re-running this cell cleans already-cleaned
# text again.
youtube['Comment'] = youtube['Comment'].apply(text_clean)

In [97]:
# Inspect the cleaned comment stored under index label 11 (a label lookup,
# not a position, since rows were dropped earlier without re-indexing).
youtube.loc[11, 'Comment']

'lab excit thing ive seen reallli go shake qualiiti even basic compon manufactur'

In [98]:
def sentiment_to_label(sentiment):
    """Translate a sentiment name into its integer class id.

    'negative' -> 0, 'neutral' -> 1, 'positive' -> 2. The match is exact
    (case-sensitive); any other value yields None.
    """
    # Compared in a fixed order with plain equality against each known name.
    known_labels = (('positive', 2), ('neutral', 1), ('negative', 0))
    for name, class_id in known_labels:
        if sentiment == name:
            return class_id
    return None

In [99]:
# Add an integer Label column derived from Sentiment. Any sentiment string
# other than 'positive' / 'neutral' / 'negative' maps to None here.
youtube['Label'] = youtube['Sentiment'].apply(sentiment_to_label)

In [100]:
# Inspect the cleaned comment stored under index label 5.
youtube.loc[5, 'Comment']

'we’v hound bank adopt appl pay understand don’t want extra fee easi quick checkout'

In [101]:
# Features are the cleaned comment text; the target is the integer label.
X, y = youtube['Comment'], youtube['Label']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [102]:
# Turn the text into TF-IDF features, keeping at most 4000 terms.
# The vectorizer is fitted on the training text only; the test text is
# transformed with that same fitted vectorizer so no test data leaks into it.
vectorizer = TfidfVectorizer(max_features=4000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [103]:
# Train a logistic-regression classifier on the TF-IDF training features,
# allowing the solver up to 1000 iterations.
model = LogisticRegression(max_iter = 1000)
model.fit(X_train_tfidf, y_train)

In [104]:
# Predict labels for the held-out test features.
y_pred = model.predict(X_test_tfidf)

In [105]:
# Evaluate: fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')



Accuracy: 0.7541258741258742


In [106]:
import pickle
# Persist the fitted classifier and the fitted vectorizer with pickle. Both
# are written because the model was trained on features produced by this
# vectorizer.
for artifact_path, artifact in (
    ('sentiment_model.pkl', model),
    ('vectorizer.pkl', vectorizer),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)
