In [3]:
import pandas as pd
import pickle
from src.data_utils import preprocess_text
import json

# Load the fitted model, TfidfVectorizer, and LabelEncoder from disk.
# Each artifact is opened in its own `with` block so the file handle is
# closed as soon as unpickling finishes instead of lingering until the
# interpreter happens to garbage-collect it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that come from a trusted source.
with open('../models/logistic_regression_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('../models/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
with open('../models/encoder.pkl', 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

# Load test data from the JSON file and convert it to a pandas DataFrame.
# The file is opened in a `with` block so the handle is always closed, and
# the encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('../data/raw/test.json', 'r', encoding='utf-8') as test_file:
    test_data = pd.DataFrame(json.load(test_file))

# Clean the raw rows in place: discard rows with missing values first,
# then discard rows that are exact duplicates of an earlier row.
test_data.dropna(inplace=True)
test_data.drop_duplicates(inplace=True)

# Run every idea title through the project's preprocess_text helper and
# keep the result in a new column next to the raw title.
processed_titles = test_data['idea_title'].apply(preprocess_text)
test_data['idea_title_processed'] = processed_titles

# Collapse the data to one row per map_id: the processed idea titles are
# joined with single spaces and the first map_title seen for the map is kept.
per_map_aggregations = {
    'idea_title_processed': ' '.join,
    'map_title': 'first',
}
grouped_test = test_data.groupby('map_id').agg(per_map_aggregations)
grouped_test = grouped_test.reset_index()

# Build the text to classify: map title, a space, then the joined idea titles.
map_titles = grouped_test['map_title']
joined_idea_titles = grouped_test['idea_title_processed']
grouped_test['text_processed'] = map_titles + ' ' + joined_idea_titles

# Vectorize with the loaded TfidfVectorizer (transform only, no refitting),
# then predict with the loaded model.
test_tfidf_matrix = vectorizer.transform(grouped_test['text_processed'])
test_preds = model.predict(test_tfidf_matrix)

# Map the predicted values back to their labels with the loaded
# LabelEncoder; only the first decoded label is reported.
test_preds_decoded = label_encoder.inverse_transform(test_preds)
first_prediction = test_preds_decoded.tolist()[0]
print({'predictions': first_prediction})


{'predictions': 'Business'}
