In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder

quotes_1k = pd.read_csv('datasets/insparationalQuotes1K.csv')
quotes_1k = quotes_1k[['Quote', 'Category']]
quotes_1k.columns = ['quote', 'category']

#claning
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = str(text).strip()             
    text = ' '.join(text.split())         #extra spaces
    text = text.lower()                  #lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  #non-alphabetical characters
    return text
quotes_1k['quote'] = quotes_1k['quote'].apply(clean_text)
quotes_1k['category'] = quotes_1k['category'].str.strip().str.lower()

#shuffle dataset
quotes_1k = quotes_1k.sample(frac=1, random_state=10).reset_index(drop=True)

#sampling (1000)
sampled_quotes = quotes_1k.sample(n=1000, random_state=10).reset_index(drop=True)

#label encoding
le = LabelEncoder()
sampled_quotes['category_encoded'] = le.fit_transform(sampled_quotes['category'])

import pickle
with open('./models/label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

#save dataset
sampled_quotes.to_csv('datasets/cleanedDataSecondModel.csv', index=False)
sampled_quotes.head()


A


Unnamed: 0,quote,category,category_encoded
0,a generous heart kind speech and a life of ser...,stewardship,104
1,education breeds confidence,confidence,18
2,a smile is a curve that sets everything straight,smile,100
3,it is not joy that makes us grateful it is gra...,gratitude,45
4,we say things will turn out but no its what we...,stewardship,104


In [2]:
# bersigin negative.txt
import pandas as pd
import re

with open('datasets/negative.txt', 'r', encoding='utf-8') as f:
    raw_negative = f.readlines()

def clean_text(text):
    text = str(text).strip()
    text = ' '.join(text.split())
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  #punctuation
    return text

#buang baris kosong
cleaned_negative = [clean_text(line) for line in raw_negative if line.strip() != '']

negative_df = pd.DataFrame(cleaned_negative, columns=['quote'])
negative_df.insert(0, 'label', 0)  #label 0 non-motivational
negative_df = negative_df.sample(n=1000, random_state=42).reset_index(drop=True)

positive_df = pd.read_csv('datasets/cleanedDataSecondModel.csv')
positive_df = positive_df[['quote']]
positive_df['quote'] = positive_df['quote'].astype(str).apply(clean_text)
positive_df.insert(0, 'label', 1)  #label 1 motivational



#gabungan negative&positive
binary_df = pd.concat([positive_df, negative_df], ignore_index=True)
binary_df = binary_df.sample(frac=1, random_state=42).reset_index(drop=True)
#save dataset
binary_df.to_csv('datasets/binaryDataFirstModel.csv', index=False)
print(binary_df.head())


   label                                              quote
0      0  children  christian or otherwise  deserve to h...
1      1  what lies behind you and what lies in front of...
2      0  none of this socalled satire has any sting to ...
3      1  compassion is the sometimes fatal capacity for...
4      0  like all abstract art  the film does not make ...


In [7]:
import numpy as np

In [1]:
# Re-import necessary packages due to code state reset
import pandas as pd

# Load the dataset again
df = pd.read_csv("datasets/cleanedDataSecondModel.csv")

# Set the frequency threshold
threshold = 5

# Identify rare categories
category_counts = df['category'].value_counts()
rare_categories = category_counts[category_counts < threshold].index

# Replace rare categories with 'other'
df['category'] = df['category'].apply(lambda x: 'other' if x in rare_categories else x)

# Save the modified dataset
merged_path = "datasets/cleanedDataSecondModel_merged.csv"
df.to_csv(merged_path, index=False)

merged_path


'datasets/cleanedDataSecondModel_merged.csv'

In [1]:
label_map = {
    "confidence": "Self & Inner Growth",
    "self-care": "Self & Inner Growth",
    "purpose": "Self & Inner Growth",
    "ambition": "Self & Inner Growth",
    "perseverance": "Self & Inner Growth",
    "determination": "Self & Inner Growth",
    "grit": "Self & Inner Growth",
    "patience": "Self & Inner Growth",
    "motivation": "Self & Inner Growth",
    "hope": "Self & Inner Growth",
    "believe in yourself": "Self & Inner Growth",
    "live your dreams": "Self & Inner Growth",
    "live life": "Self & Inner Growth",
    "soul": "Self & Inner Growth",
    "humility": "Self & Inner Growth",
    "courage": "Self & Inner Growth",
    "optimism": "Self & Inner Growth",
    "wisdom": "Self & Inner Growth",
    "practice": "Self & Inner Growth",
    "preparation": "Self & Inner Growth",
    "drive": "Self & Inner Growth",
    "health": "Self & Inner Growth",
    "fitness": "Self & Inner Growth",
    "hard work": "Self & Inner Growth",
    "achievement": "Self & Inner Growth",
    "overcoming": "Self & Inner Growth",
    "spread your wings": "Self & Inner Growth",

    "friendship": "Relationships & Empathy",
    "love": "Relationships & Empathy",
    "empathy": "Relationships & Empathy",
    "unity": "Relationships & Empathy",
    "kindness": "Relationships & Empathy",
    "helping others": "Relationships & Empathy",
    "including others": "Relationships & Empathy",
    "inclusion": "Relationships & Empathy",
    "respect": "Relationships & Empathy",
    "listening": "Relationships & Empathy",
    "caring": "Relationships & Empathy",
    "compassion": "Relationships & Empathy",
    "loyalty": "Relationships & Empathy",
    "equality": "Relationships & Empathy",
    "courtesy": "Relationships & Empathy",
    "good manners": "Relationships & Empathy",
    "sharing": "Relationships & Empathy",
    "family": "Relationships & Empathy",
    "common ground": "Relationships & Empathy",

    "joy": "Positivity & Emotional Well-being",
    "smile": "Positivity & Emotional Well-being",
    "cheer": "Positivity & Emotional Well-being",
    "inspiration": "Positivity & Emotional Well-being",
    "encouragement": "Positivity & Emotional Well-being",
    "gratitude": "Positivity & Emotional Well-being",
    "appreciation": "Positivity & Emotional Well-being",
    "peace": "Positivity & Emotional Well-being",
    "mindfulness": "Positivity & Emotional Well-being",
    "laughter": "Positivity & Emotional Well-being",
    "true beauty": "Positivity & Emotional Well-being",
    "imagine": "Positivity & Emotional Well-being",

    "volunteering": "Altruism & Service",
    "giving back": "Altruism & Service",
    "stewardship": "Altruism & Service",
    "service": "Altruism & Service",
    "making a difference": "Altruism & Service",
    "leadership": "Altruism & Service",
    "responsibility": "Altruism & Service",
    "role model": "Altruism & Service",
    "innovation": "Altruism & Service",

    "learning": "Learning & Knowledge",
    "education": "Learning & Knowledge",
    "literacy": "Learning & Knowledge",
    "creativity": "Learning & Knowledge",
    "wisdom": "Learning & Knowledge",
    "imagination": "Learning & Knowledge",

    "other": "Other"
}

DEFAULT_LABEL = "Other"

In [5]:
import pandas as pd

def generalize_labels(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    
    if "category" not in df.columns:
        raise ValueError("Input CSV must contain a 'label' column.")

    df["general_label"] = df["category"].map(lambda x: label_map.get(x.strip().lower(), DEFAULT_LABEL))
    
    df.to_csv(output_csv, index=False)
    print(f"Generalized labels written to '{output_csv}'")

generalize_labels("datasets/cleanedDataSecondModel_merged.csv", "datasets/cleanedDataSecondModel_merged.csv")

Generalized labels written to 'datasets/cleanedDataSecondModel_merged.csv'


In [9]:
from sklearn.preprocessing import LabelEncoder
quotes = pd.read_csv('datasets/cleanedDataSecondModel_merged.csv')

le = LabelEncoder()
quotes['general_label_encoded'] = le.fit_transform(quotes['general_label'])
quotes.to_csv('datasets/cleanedDataSecondModel_merged.csv', index=False)

In [13]:
# Count and display how many quotes are in each category
import pandas as pd
df = pd.read_csv("datasets/cleanedDataSecondModel_merged.csv")
pd.set_option('display.max_rows', None)
category_counts = df['general_label'].value_counts()
print(category_counts)

general_label
Self & Inner Growth                  300
Relationships & Empathy              228
Other                                204
Positivity & Emotional Well-being    148
Altruism & Service                    73
Learning & Knowledge                  47
Name: count, dtype: int64
