In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier

In [18]:
# Step 1: Load the Data
# The four input CSVs are all read with the same ISO-8859-1 encoding, in the
# order: left table, right table, labelled training pairs, unlabelled test pairs.
ltable, rtable, train_data, test_data = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in ('ltable.csv', 'rtable.csv', 'train.csv', 'test_HW2.csv')
)

# Step 2: Preprocess the Data
def preprocess_text(text):
    """Normalise a raw attribute value for text comparison.

    Args:
        text: The cell value to clean; may be any type.

    Returns:
        The lower-cased string with leading/trailing whitespace removed when
        ``text`` is a ``str``; otherwise (e.g. a float NaN) the empty string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    return lowered.strip()

# Clean every text attribute used for matching, in both product tables.
# Each column is overwritten in place with its normalised version.
for product_table in (ltable, rtable):
    for text_column in ('title', 'category', 'brand'):
        product_table[text_column] = product_table[text_column].apply(preprocess_text)

In [19]:
# Step 3: Feature Engineering
# Cosine similarity on 'title' using TF-IDF
# Fit a single vocabulary on the titles of BOTH tables so that left and right
# vectors live in the same feature space.
# pd.concat is used instead of Series.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
all_titles = pd.concat([ltable['title'], rtable['title']])
vectorizer = TfidfVectorizer().fit(all_titles)

# Row k of each matrix is the TF-IDF vector of row k of the corresponding table.
ltable_title_vec = vectorizer.transform(ltable['title'])
rtable_title_vec = vectorizer.transform(rtable['title'])

# Merge the training set with ltable and rtable to get features
def _join_pair_attributes(pairs):
    """Attach the left and right product attributes to a table of ID pairs.

    ``pairs`` is first merged with ``ltable`` on 'ltable_id', then with
    ``rtable`` on 'rtable_id'; columns present on both sides of the second
    merge receive the '_l' / '_r' suffixes.

    No ``how`` argument is passed, so pandas' default inner join applies and
    any pair whose ID is missing from a table is dropped from the result.
    """
    with_left = pairs.merge(ltable, on='ltable_id')
    return with_left.merge(rtable, on='rtable_id', suffixes=('_l', '_r'))


train = _join_pair_attributes(train_data)
test = _join_pair_attributes(test_data)

# Calculate cosine similarity for 'title'
# The TF-IDF matrices are row-aligned with ltable/rtable, so a record ID has to
# be translated to its row position before it is used to index a matrix.
# Indexing with the raw ID is only correct when IDs happen to be 0..n-1 in file
# order; otherwise it raises IndexError or silently compares the wrong rows.
ltable_id_to_index = {record_id: row for row, record_id in enumerate(ltable['ltable_id'])}
rtable_id_to_index = {record_id: row for row, record_id in enumerate(rtable['rtable_id'])}


def _title_cosine_similarity(pairs):
    """Return the title cosine similarity for every (ltable_id, rtable_id) pair.

    Args:
        pairs: DataFrame with 'ltable_id' and 'rtable_id' columns.

    Returns:
        A list with one similarity score per row of ``pairs``, in row order.

    Raises:
        KeyError: If a pair references an ID that is absent from its table.
    """
    return [
        cosine_similarity(
            ltable_title_vec[ltable_id_to_index[lid]],
            rtable_title_vec[rtable_id_to_index[rid]],
        )[0][0]
        for lid, rid in zip(pairs['ltable_id'], pairs['rtable_id'])
    ]


train['title_cosine_sim'] = _title_cosine_similarity(train)
test['title_cosine_sim'] = _title_cosine_similarity(test)

  vectorizer = TfidfVectorizer().fit(ltable['title'].append(rtable['title']))


In [20]:
# Step 1: Vectorize 'brand' and 'category' attributes
# Fit one TF-IDF vocabulary per attribute on the values from both tables so the
# left and right vectors of an attribute share a feature space.
# pd.concat is used instead of Series.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
brand_vectorizer = TfidfVectorizer().fit(pd.concat([ltable['brand'], rtable['brand']]))
category_vectorizer = TfidfVectorizer().fit(pd.concat([ltable['category'], rtable['category']]))

# Transform the brand and category columns in both tables
# Row k of each matrix corresponds to row k of the table it was built from.
ltable_brand_vec = brand_vectorizer.transform(ltable['brand'])
rtable_brand_vec = brand_vectorizer.transform(rtable['brand'])
ltable_category_vec = category_vectorizer.transform(ltable['category'])
rtable_category_vec = category_vectorizer.transform(rtable['category'])

# Step 2: Calculate similarity for each pair in train and test
# Lookup tables from a record ID to the row position of that record in its
# table (and therefore in the vector matrices built from that table).
# If an ID occurs more than once, the last occurrence wins.
ltable_id_to_index = dict(zip(ltable['ltable_id'], range(len(ltable))))
rtable_id_to_index = dict(zip(rtable['rtable_id'], range(len(rtable))))

# Define a function to get cosine similarity given ltable and rtable IDs
def get_cosine_similarity(ltable_id, rtable_id, ltable_vec, rtable_vec):
    """Cosine similarity between one left record and one right record.

    Args:
        ltable_id: ID of the left record; looked up in ``ltable_id_to_index``.
        rtable_id: ID of the right record; looked up in ``rtable_id_to_index``.
        ltable_vec: Row-indexable matrix of left-table vectors.
        rtable_vec: Row-indexable matrix of right-table vectors.

    Returns:
        The single similarity value for the selected pair of rows.

    Raises:
        KeyError: If either ID is not present in its lookup table.
    """
    left_row = ltable_id_to_index[ltable_id]
    right_row = rtable_id_to_index[rtable_id]
    pair_similarity = cosine_similarity(ltable_vec[left_row], rtable_vec[right_row])
    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    return pair_similarity[0][0]

# Calculate brand and category cosine similarity for train and test
# Each entry names the output column and the left/right vector matrices that
# feed it; every feature is computed for train first, then test.
attribute_vectors = (
    ('brand_cosine_sim', ltable_brand_vec, rtable_brand_vec),
    ('category_cosine_sim', ltable_category_vec, rtable_category_vec),
)
for feature_name, left_vectors, right_vectors in attribute_vectors:
    for pairs in (train, test):
        pairs[feature_name] = [
            get_cosine_similarity(lid, rid, left_vectors, right_vectors)
            for lid, rid in zip(pairs['ltable_id'], pairs['rtable_id'])
        ]

  brand_vectorizer = TfidfVectorizer().fit(ltable['brand'].append(rtable['brand']))
  category_vectorizer = TfidfVectorizer().fit(ltable['category'].append(rtable['category']))


In [21]:
# Select the similarity features that form the model inputs
feature_columns = [
    'title_cosine_sim',
    'brand_cosine_sim',
    'category_cosine_sim',
]
X_train, X_test = (pairs[feature_columns] for pairs in (train, test))
# Ground truth labels are only available in train data
y_train = train['label']

In [22]:
# Step 5: Train the Model
# random_state is fixed so that repeated runs build the same forest.
model = RandomForestClassifier(
    random_state=42,
)
# Kept as the last expression so the cell still displays the fitted estimator.
model.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=42)

In [23]:
# Step 6: Make Predictions on the Test Set
predicted_labels = model.predict(X_test)
test['label'] = predicted_labels

# Order the rows by pair id, then keep only the two columns expected in the
# submission file.
test = test.sort_values(by=['id'])
submission = test[['id', 'label']]
submission.to_csv('test_predictions.csv', index=False)
print("Predictions saved to 'test_predictions.csv'")

Predictions saved to 'test_predictions.csv'
