In [12]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import pickle
from src.data_utils import preprocess_text

# Read the raw public-maps CSV into a DataFrame
data_path = "../data/raw/public_maps.csv"  # Replace with the path to your file
data = pd.read_csv(data_path)

# Report how many null cells each column contains
missing_values = data.isna().sum()
print("Missing values:\n", missing_values)

# Report how many rows are exact repeats of an earlier row
duplicate_rows = data.duplicated().sum()
print("\nNumber of duplicate rows: ", duplicate_rows)

# Run preprocess_text over both title columns; each result is stored next to
# its source column under the name "<column>_processed"
for title_column in ('map_title', 'idea_title'):
    data[title_column + '_processed'] = data[title_column].apply(preprocess_text)

# Join the two processed titles with a single space to form one document per row
data['text_processed'] = (
    data['map_title_processed'] + ' ' + data['idea_title_processed']
)

# Learn the TF-IDF vocabulary from the combined text (default vectorizer
# settings) and produce the corresponding document-term matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['text_processed'])

# Map each map_category_name value to an integer class label
encoder = LabelEncoder()
map_category_encoded = encoder.fit_transform(data['map_category_name'])

# Assemble the model-ready frame: one dense TF-IDF row vector per record,
# stored as an object column, alongside its encoded category
data_encoded = pd.DataFrame(
    {
        'tfidf_matrix': list(tfidf_matrix.toarray()),
        'map_category_encoded': map_category_encoded,
    }
)

# Save the fitted vectorizer and encoder objects.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically, even if pickling raises part-way through.
with open('../models/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
with open('../models/encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

# Save the encoded data
data_encoded.to_pickle('../data/processed/data_encoded.pkl')


Missing values:
 map_id                 0
map_title              0
map_rating             0
map_category_name      0
idea_id                0
idea_parent_id       462
idea_title             0
dtype: int64

Number of duplicate rows:  0
