In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Adjust display options
# pd.set_option('display.max_rows', None)  # Show all rows
# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_colwidth', None)  # Show full column width

# Read the trait spreadsheet into a DataFrame
traits_path = '../Data/legendary_lord_defeat_traits.xlsx'
df = pd.read_excel(traits_path)

In [24]:
# Define the labeling criteria
def label_trait(effect):
    """Classify a trait's effect text as 'strong', 'weak' or 'balanced'.

    The text is lower-cased and searched for keyword substrings. Strong
    keywords are checked before weak ones, so a text containing both kinds
    is labeled 'strong'. Text matching neither list is labeled 'balanced'.

    Matching is by plain substring, so a keyword such as 'minor' also
    matches when it appears inside a longer word.

    Args:
        effect: The effect description. Non-string values (for example the
            NaN pandas uses for an empty cell) are treated as containing no
            indicators instead of raising AttributeError.

    Returns:
        One of the strings 'strong', 'weak' or 'balanced'.
    """
    # Keywords that mark an effect as 'strong'
    strong_keywords = ('+30%', '+40%', '+50%', 'immense', 'huge', 'enormous', 'greatly', '+20%', '+25%', '+15%')
    # Keywords that mark an effect as 'weak'
    weak_keywords = ('+2%', '+5%', '+1%', 'slight', 'minor', 'small', '+10%', '+8%', '+6%')
    # No explicit 'balanced' keyword list is needed: 'balanced' is the
    # fallback label, so checking for '+12%', 'moderate', 'both',
    # 'combination', etc. could never change the result. Note that '+10%'
    # and '+8%' are claimed by the weak list and '+15%' by the strong list.

    # Missing or non-text cells carry no indicators -> fallback label
    if not isinstance(effect, str):
        return 'balanced'

    # Lower-case so that word keywords match case-insensitively
    effect_lower = effect.lower()

    if any(keyword in effect_lower for keyword in strong_keywords):
        return 'strong'
    if any(keyword in effect_lower for keyword in weak_keywords):
        return 'weak'
    # Default to balanced if no strong/weak indicators are found
    return 'balanced'

# Label every row from the text in its 'Effect' column
df['Label'] = df['Effect'].map(label_trait)

In [27]:
# Display the DataFrame's column names
df.columns

Index(['Trait', 'Lord that must be defeated', 'Effect', 'Description',
       'Label'],
      dtype='object')

In [28]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download the NLTK packages this notebook asks for, in order
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

def preprocess_text(text):
    """Normalize free text into a space-separated string of stemmed tokens.

    Steps: lower-case the text, tokenize it with ``word_tokenize``, drop
    tokens found in NLTK's English stopword list, stem the remaining tokens
    with ``PorterStemmer``, and join them with single spaces.

    Args:
        text: The text to process. Non-string values (for example the NaN
            pandas uses for an empty cell) yield an empty string instead of
            raising AttributeError.

    Returns:
        The processed text as a single string; '' for non-string input.
    """
    # Missing or non-text cells have nothing to tokenize
    if not isinstance(text, str):
        return ''

    # Convert to lowercase and tokenize
    tokens = word_tokenize(text.lower())

    # Build the stopword lookup once per call; the original re-read the
    # stopword list for every token. A set gives the same membership answers.
    english_stopwords = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in english_stopwords]

    # Stem the words
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in tokens)

# Apply preprocessing to the 'Effect' and 'Description' columns,
# storing each result under its 'Processed_*' column name
processed_columns = {
    'Processed_Effect': 'Effect',
    'Processed_Description': 'Description',
}
for target, source in processed_columns.items():
    df[target] = df[source].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lardex\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lardex\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
