In [2]:
import pandas as pd

print("Starting engineering of 'text_length' and 'word_count' features...")

# --- Load the dataset ---
# Every later step needs `data`, so a missing file is fatal: report it and stop
# with a non-zero exit status.
try:
    # Keep the try body to the single call that can raise FileNotFoundError.
    data = pd.read_csv('stock(updated).csv')
except FileNotFoundError:
    print("Error: 'stock(updated).csv' not found. Make sure it's in the same directory.")
    # `exit()` is an interactive helper added by the `site` module; it is not
    # guaranteed to exist and it signals success (status 0). Raise SystemExit
    # explicitly so the failure is visible to whatever launched the script.
    raise SystemExit(1) from None
else:
    print("Dataset 'stock(updated).csv' loaded successfully.")

# --- Make sure a 'combined_text' column is available ---
# The text_length and word_count features are both derived from this column.
# When it has to be built, missing headline/short_description values are
# replaced with '' before the two columns are joined with a single space.
if 'combined_text' in data.columns:
    print("'combined_text' column already exists.")
else:
    print("Creating 'combined_text' column...")
    headline_text = data['headline'].fillna('')
    description_text = data['short_description'].fillna('')
    data['combined_text'] = headline_text + ' ' + description_text

# --- Engineer 'text_length' feature ---
# Number of characters in each row's combined_text.
print("Engineering 'text_length' feature...")
# A 'combined_text' column that was loaded from the CSV can contain NaN (an
# empty cell is read back as a float NaN), and len(NaN) raises TypeError.
# Treat missing text as '' so such rows get a length of 0, and use the
# vectorised .str accessor instead of a per-row Python call.
data['text_length'] = (
    data['combined_text'].fillna('').astype(str).str.len().astype('int64')
)
print(f"Sample 'text_length' values:\n{data['text_length'].head()}")

# --- Engineer 'word_count' feature ---
# Number of whitespace-separated tokens in each row's combined_text.
print("Engineering 'word_count' feature...")
# A 'combined_text' column that was loaded from the CSV can contain NaN (an
# empty cell is read back as a float NaN), which has no .split() method.
# Treat missing text as '' so such rows get a count of 0. With no separator,
# .str.split() splits on runs of whitespace, the same as str.split().
data['word_count'] = (
    data['combined_text']
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
    .astype('int64')
)
print(f"Sample 'word_count' values:\n{data['word_count'].head()}")

# --- Persist the enriched DataFrame ---
# The result is written to the same path the data was loaded from, so the
# original file is overwritten. index=False keeps the row index out of the CSV.
output_csv = 'stock(updated).csv'
data.to_csv(output_csv, index=False)
print("\nFeatures 'text_length' and 'word_count' (and 'combined_text' if new) successfully engineered and saved back to 'stock(updated).csv'.")

# Summarise what was written.
final_shape = data.shape
final_columns = data.columns.tolist()
print(f"Updated DataFrame shape: {final_shape}")
print(f"Updated DataFrame columns: {final_columns}")

Starting engineering of 'text_length' and 'word_count' features...
Dataset 'stock(updated).csv' loaded successfully.
'combined_text' column already exists.
Engineering 'text_length' feature...
Sample 'text_length' values:
0    231
1    249
2    234
3    233
4    187
Name: text_length, dtype: int64
Engineering 'word_count' feature...
Sample 'word_count' values:
0    40
1    41
2    36
3    39
4    31
Name: word_count, dtype: int64

Features 'text_length' and 'word_count' (and 'combined_text' if new) successfully engineered and saved back to 'stock(updated).csv'.
Updated DataFrame shape: (1759, 5015)
