In [65]:
import pandas as pd 
import requests
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import nltk
import re


In [66]:
# Fetch data from API
url = "https://datasets-server.huggingface.co/rows?dataset=breadlicker45%2Fyoutube-comments&config=default&split=train&offset=0&length=10"
response = requests.get(url)
data_json = response.json()


In [67]:
# Extract comments dynamically
comments_key = list(data_json["rows"][0]["row"].keys())[0]
data = pd.DataFrame([row["row"] for row in data_json["rows"]])
data.rename(columns={comments_key: "Comment"}, inplace=True)
print(data.head())

KeyError: 'rows'

In [58]:
data1 = data.dropna(subset=["Comment"])

nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/naitik/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [59]:
data1["Positive"] = data1["Comment"].apply(lambda x: sentiments.polarity_scores(str(x))["pos"])
data1["Negative"] = data1["Comment"].apply(lambda x: sentiments.polarity_scores(str(x))["neg"])
data1["Neutral"] = data1["Comment"].apply(lambda x: sentiments.polarity_scores(str(x))["neu"])
data1['Compound'] = data1["Comment"].apply(lambda x: sentiments.polarity_scores(str(x))["compound"])
print(data1.head())

                                             Comment  Positive  Negative  \
0  I&#39;ve read many books on Apple/Steve and ha...     0.208     0.000   
1  Just watched this again after having watched i...     0.421     0.000   
2                                I had one of those!     0.000     0.000   
3      Please leave out the needless intrusive music     0.271     0.141   
4  What about the T-mobile AMEO? Was that not bef...     0.048     0.000   

   Neutral  Compound  
0    0.792    0.7163  
1    0.579    0.8802  
2    1.000    0.0000  
3    0.588    0.2732  
4    0.952    0.5661  


In [60]:
sentiment = []
for score in data1["Compound"]:
    if score >= 0.05:
        sentiment.append('Positive')
    elif score <= -0.05:
        sentiment.append('Negative')
    else:
        sentiment.append('Neutral')

data1["Sentiment"] = sentiment
print(data1)

                                             Comment  Positive  Negative  \
0  I&#39;ve read many books on Apple/Steve and ha...     0.208     0.000   
1  Just watched this again after having watched i...     0.421     0.000   
2                                I had one of those!     0.000     0.000   
3      Please leave out the needless intrusive music     0.271     0.141   
4  What about the T-mobile AMEO? Was that not bef...     0.048     0.000   
5  i can imagine all the caos all those people th...     0.000     0.104   
6                                               🥰🥰🥰🥰     0.000     0.000   
7                                 Jobs is inspiring.     0.583     0.000   
8  iPhone, that product changed my view of Steve ...     0.271     0.000   
9  Was steve jobs gay or a macho like moi who lov...     0.408     0.000   

   Neutral  Compound Sentiment  
0    0.792    0.7163  Positive  
1    0.579    0.8802  Positive  
2    1.000    0.0000   Neutral  
3    0.588    0.2732  Positive 

In [61]:
data2 = data1.drop(['Positive', 'Negative', 'Neutral', 'Compound'], axis=1)



In [62]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lzr = WordNetLemmatizer()

def text_processing(text):
    text = text.lower()  
    text = re.sub(r'\n', ' ', text)  
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()  
    return text  # Keep text as simple as possible


[nltk_data] Downloading package stopwords to /home/naitik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/naitik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/naitik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [63]:
data_copy = data2.copy()
print(data_copy.head())
data_copy['Comment'] = data_copy['Comment'].astype(str).apply(text_processing)

le = LabelEncoder()
data_copy['Sentiment'] = le.fit_transform(data_copy['Sentiment'])

processed_data = pd.DataFrame({
    'Sentence': data_copy['Comment'],
    'Sentiment': data_copy['Sentiment']
})

negative_count = (processed_data['Sentiment'] == le.transform(['Negative'])[0]).sum()
print("Number of Negative Sentiments:", negative_count)

print(processed_data.head())

                                             Comment Sentiment
0  I&#39;ve read many books on Apple/Steve and ha...  Positive
1  Just watched this again after having watched i...  Positive
2                                I had one of those!   Neutral
3      Please leave out the needless intrusive music  Positive
4  What about the T-mobile AMEO? Was that not bef...  Positive
Number of Negative Sentiments: 1
                                            Sentence  Sentiment
0  i39ve read many books on applesteve and have n...          2
1  just watched this again after having watched i...          2
2                                 i had one of those          1
3      please leave out the needless intrusive music          2
4  what about the tmobile ameo was that not befor...          2


In [64]:
df_negative = processed_data[processed_data['Sentiment'] == 0]
df_neutral = processed_data[processed_data['Sentiment'] == 1]
df_positive = processed_data[processed_data['Sentiment'] == 2]

df_negative_upsampled = resample(df_negative, replace=True, n_samples=205, random_state=42)
df_neutral_upsampled = resample(df_neutral, replace=True, n_samples=205, random_state=42)
final_data = pd.concat([df_negative_upsampled, df_neutral_upsampled, df_positive])

corpus = final_data['Sentence'].tolist()
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = final_data['Sentiment'].values
print(X.shape, y.shape)


(417, 116) (417,)
