In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import string # for milestone 1
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/2025-sep-dl-gen-ai-project/sample_submission.csv
/kaggle/input/2025-sep-dl-gen-ai-project/train.csv
/kaggle/input/2025-sep-dl-gen-ai-project/test.csv


# making a baseline dummy submssion for now

In [2]:
train_df = pd.read_csv('/kaggle/input/2025-sep-dl-gen-ai-project/train.csv')
test_df = pd.read_csv('/kaggle/input/2025-sep-dl-gen-ai-project/test.csv')

In [3]:
emotion_labels = ['anger', 'fear', 'joy', 'sadness', 'surprise']

In [4]:
X_train = train_df['text']
Y_train = train_df[emotion_labels]
X_test = test_df['text']
test_ids = test_df['id']

In [5]:
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))

In [6]:
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"feature matrix (train): {X_train_tfidf.shape}")
print(f"feature matrix shape (test): {X_test_tfidf.shape}")

feature matrix (train): (6827, 20000)
feature matrix shape (test): (1707, 20000)


In [7]:
log_reg = LogisticRegression(solver='liblinear', random_state=42, C=1.0)

In [8]:
multi_target_classifier = MultiOutputClassifier(log_reg, n_jobs=-1)
multi_target_classifier.fit(X_train_tfidf, Y_train)

In [9]:
Y_pred_train = multi_target_classifier.predict(X_train_tfidf)

In [10]:
f1_train = f1_score(Y_train, Y_pred_train, average='macro')
f1_train

0.5852257618207916

In [11]:
Y_pred_test = multi_target_classifier.predict(X_test_tfidf)

submission_df = pd.DataFrame(Y_pred_test, columns=emotion_labels)

submission_df.insert(0, 'id', test_ids)

for col in emotion_labels:
    submission_df[col] = submission_df[col].astype(int)

submission_df.to_csv('submission.csv', index=False)

In [12]:
submission_df.head()

Unnamed: 0,id,anger,fear,joy,sadness,surprise
0,0,0,1,0,0,0
1,1,0,0,0,0,0
2,2,0,1,0,0,0
3,3,0,1,0,0,0
4,4,0,1,0,0,1


# milestones


### milestone 1


In [13]:
train_df = pd.read_csv('/kaggle/input/2025-sep-dl-gen-ai-project/train.csv')
train_df

Unnamed: 0,id,text,anger,fear,joy,sadness,surprise,emotions
0,0,the dentist that did the work apparently did a...,1,0,0,1,0,['anger' 'sadness']
1,1,i'm gonna absolutely ~~suck~~ be terrible duri...,0,1,0,1,0,['fear' 'sadness']
2,2,"bridge: so leave me drowning calling houston, ...",0,1,0,1,0,['fear' 'sadness']
3,3,after that mess i went to see my now ex-girlfr...,1,1,0,1,0,['anger' 'fear' 'sadness']
4,4,"as he stumbled i ran off, afraid it might some...",0,1,0,0,0,['fear']
...,...,...,...,...,...,...,...,...
6822,6822,there is not a cloud in the sky and the sun is...,0,0,1,0,0,['joy']
6823,6823,&gt; the grave stomper,0,0,0,0,1,['surprise']
6824,6824,my ear was still freaking stuck.,1,1,0,0,0,['anger' 'fear']
6825,6825,i felt like there was an electric current flow...,0,1,0,1,0,['fear' 'sadness']


In [14]:
test_df = pd.read_csv('/kaggle/input/2025-sep-dl-gen-ai-project/test.csv')
test_df

Unnamed: 0,id,text
0,0,she wanted to fight over every single little t...
1,1,"anyway, back to tuesday."
2,2,she shrieked at the dog to go back.
3,3,yelling for everyone to get back or get inside...
4,4,still kind of freaky.
...,...,...
1702,1702,"back to earth as i opened my eye, i was lying ..."
1703,1703,"english class, grade 8."
1704,1704,"i went at it twice, until the muscles around m..."
1705,1705,my heart.


In [15]:
# Q1 : Which emotion is the most common in the given dataset?

angered = train_df['anger'].sum()
feared = train_df['fear'].sum()
joyed = train_df['joy'].sum()
saddened = train_df['sadness'].sum()
surprised = train_df['surprise'].sum()

print(f"anger : {angered}")
print(f"fear : {feared}")
print(f"joy : {joyed}")
print(f"sadness : {saddened}")
print(f"surprise : {surprised}\n")
print(f"total : {angered + feared + joyed + saddened + surprised}")

anger : 808
fear : 3860
joy : 1660
sadness : 2171
surprise : 1999

total : 10498


In [16]:
train_df.emotions.value_counts()

emotions
['joy']                                        1083
['fear' 'sadness']                             1070
['fear']                                        910
['fear' 'surprise']                             799
[]                                              676
['sadness']                                     327
['fear' 'sadness' 'surprise']                   292
['surprise']                                    266
['joy' 'surprise']                              260
['anger' 'fear' 'sadness']                      178
['anger' 'fear']                                159
['anger']                                       157
['fear' 'joy']                                  132
['anger' 'fear' 'sadness' 'surprise']           104
['anger' 'fear' 'surprise']                      97
['fear' 'joy' 'surprise']                        76
['joy' 'sadness']                                48
['anger' 'sadness']                              48
['sadness' 'surprise']                           36
['f

In [17]:
# Q2 : What is the total count of such instances in the training set that have exactly 2 labels?

(train_df['emotions'].map(len) == 2).sum()

676

In [18]:
train_df[(train_df['joy'] == 1) & (train_df['sadness'] == 1)].shape[0]

96

In [19]:
(train_df['surprise'].sum() / len(train_df)) * 100

29.280796836091987

In [20]:
labels = ['anger', 'fear', 'joy', 'sadness', 'surprise']
print(train_df[labels].sum(),"\n")
train_df[labels].sum().max() - train_df[labels].sum().min()

anger        808
fear        3860
joy         1660
sadness     2171
surprise    1999
dtype: int64 



3052

In [21]:
train_df['word_count'] = train_df['text'].str.split().map(len)

median_word_length = train_df['word_count'].median()

median_word_length

13.0

In [22]:
round(train_df['anger'].corr(train_df['fear']), 2)

0.08

In [23]:
before_count = train_df['text'].str.len().sum()

normalized_train_df = train_df.copy()

normalized_train_df['text'] = normalized_train_df['text'].str.lower()

normalized_train_df['text'] = normalized_train_df['text'].str.replace(f"[{string.punctuation}]", "", regex=True)

after_count = normalized_train_df['text'].str.len().sum()

((before_count - after_count)/before_count)*100

3.262557304978664

In [24]:
import nltk
from nltk.corpus import stopwords

stopwords.words('english')[:5]

['a', 'about', 'above', 'after', 'again']

In [25]:
stop_words = set(stopwords.words('english')) # not needed but i still did it, best practice ig

all_words = " ".join(train_df['text']).split()

unique_words = set(all_words)

common_stopwords = unique_words.intersection(stop_words)

percentage_stopwords = (len(common_stopwords) / len(unique_words)) * 100

print(round(percentage_stopwords, 2))

1.39


In [26]:
filtered_words = [word for word in all_words if word not in stop_words]

word_counts = Counter(filtered_words)

fifth_most_common = word_counts.most_common(5)[4]

fifth_most_common

('back', 296)