In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split



In [5]:
# Step 1: Load the Data
ltable = pd.read_csv('ltable.csv', encoding='ISO-8859-1')
rtable = pd.read_csv('rtable.csv', encoding='ISO-8859-1')
train_data = pd.read_csv('train.csv', encoding='ISO-8859-1')
test_data = pd.read_csv('test_HW2.csv', encoding='ISO-8859-1')



In [6]:
# Step 2: Preprocess the Data
def preprocess_text(text):
    if isinstance(text, str):
        return text.lower().strip()
    return ""

ltable['title'] = ltable['title'].apply(preprocess_text)
rtable['title'] = rtable['title'].apply(preprocess_text)
ltable['category'] = ltable['category'].apply(preprocess_text)
rtable['category'] = rtable['category'].apply(preprocess_text)
ltable['brand'] = ltable['brand'].apply(preprocess_text)
rtable['brand'] = rtable['brand'].apply(preprocess_text)



In [7]:
# Step 3: Feature Engineering
# Cosine similarity on 'title' using TF-IDF
vectorizer = TfidfVectorizer().fit(ltable['title'].append(rtable['title']))
ltable_title_vec = vectorizer.transform(ltable['title'])
rtable_title_vec = vectorizer.transform(rtable['title'])

# Merge the training and test sets with ltable and rtable
train = train_data.merge(ltable, on='ltable_id').merge(rtable, on='rtable_id', suffixes=('_l', '_r'))
test = test_data.merge(ltable, on='ltable_id').merge(rtable, on='rtable_id', suffixes=('_l', '_r'))

# Calculate cosine similarity for 'title'
train['title_cosine_sim'] = [cosine_similarity(ltable_title_vec[i], rtable_title_vec[j])[0][0]
                             for i, j in zip(train['ltable_id'], train['rtable_id'])]
test['title_cosine_sim'] = [cosine_similarity(ltable_title_vec[i], rtable_title_vec[j])[0][0]
                            for i, j in zip(test['ltable_id'], test['rtable_id'])]

# Additional features (e.g., exact match on 'brand' and 'category')
train['brand_match'] = (train['brand_l'] == train['brand_r']).astype(int)
test['brand_match'] = (test['brand_l'] == test['brand_r']).astype(int)
train['category_match'] = (train['category_l'] == train['category_r']).astype(int)
test['category_match'] = (test['category_l'] == test['category_r']).astype(int)



  vectorizer = TfidfVectorizer().fit(ltable['title'].append(rtable['title']))


In [8]:
# Define feature columns
feature_columns = ['title_cosine_sim', 'brand_match', 'category_match']
X_train = train[feature_columns]
y_train = train['label']
X_test = test[feature_columns]
y_test = test['label']


KeyError: 'label'

In [None]:
# Step 5: Train the Model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Step 6: Evaluate the Model
y_pred = model.predict(X_test)

In [None]:
# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))