In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import pickle
from src.data_utils import preprocess_text


# Read the raw export, discarding rows with any missing value and then
# any exact duplicate rows.
data_path = "../data/raw/public_maps.csv"
data = pd.read_csv(data_path).dropna().drop_duplicates()

# Run every idea title through the project's preprocess_text helper and keep
# the result in a separate column next to the original title.
data = data.assign(
    idea_title_processed=data['idea_title'].apply(preprocess_text)
)

# Collapse the per-idea rows into one row per map: the processed idea titles
# are joined with single spaces, while the map-level attributes are taken
# from the first row of each group.
aggregations = {
    'idea_title_processed': ' '.join,
    'map_title': 'first',
    'map_category_name': 'first',
    'map_rating': 'first',
}
grouped = data.groupby('map_id').agg(aggregations).reset_index()

# Text fed to the vectorizer: the map title, a space, then all of that map's
# processed idea titles.
title_prefix = grouped['map_title'] + ' '
grouped['text_processed'] = title_prefix + grouped['idea_title_processed']

# Turn the combined text into TF-IDF features and the category names into
# integer labels. Both fitted objects stay in scope so they can be persisted
# and reused later.
vectorizer = TfidfVectorizer()
encoder = LabelEncoder()

tfidf_matrix = vectorizer.fit_transform(grouped['text_processed'])
map_category_encoded = encoder.fit_transform(grouped['map_category_name'])

# Number of maps in the rarest category. SMOTE builds synthetic samples from
# a sample's nearest same-class neighbours, so this bounds k_neighbors.
class_counts = pd.Series(map_category_encoded).value_counts()
min_class_count = int(class_counts.min())

# A category with a single map has no neighbour to interpolate with; the
# expression below would then evaluate to k_neighbors=0 and SMOTE would fail
# with an opaque parameter error. Fail early and name the categories instead.
if min_class_count < 2:
    singleton_names = list(
        encoder.inverse_transform(class_counts[class_counts < 2].index.to_numpy())
    )
    raise ValueError(
        "SMOTE needs at least 2 samples in every category, but these "
        f"categories have only one: {singleton_names}. Remove or merge them "
        "before resampling."
    )

# Address class imbalance using SMOTE.
# k_neighbors is 5 (the library default) or min_class_count - 1, whichever is
# smaller. random_state is fixed so the saved dataset is reproducible.
smote = SMOTE(k_neighbors=min(min_class_count - 1, 5), random_state=42)
tfidf_matrix_resampled, map_category_encoded_resampled = smote.fit_resample(
    tfidf_matrix, map_category_encoded
)

# Densify the resampled sparse matrix into a DataFrame (one integer-named
# column per vocabulary term) and append the encoded label as the last column.
# NOTE(review): toarray() materialises the full matrix in memory; with a large
# vocabulary this can be expensive — confirm it fits for the real dataset.
data_processed = pd.DataFrame(tfidf_matrix_resampled.toarray())
data_processed['map_category_encoded'] = map_category_encoded_resampled

# Save the processed data
data_processed.to_pickle('../data/processed/data_processed.pkl')

# Persist the fitted vectorizer and label encoder. The context managers make
# sure each file is flushed and closed even if pickling raises, instead of
# leaving the handle open until garbage collection.
with open('../models/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
with open('../models/encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)
