In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl
import numpy as np

import nltk

nltk.download('punkt')

TRAIN_DATA_PATH = "./data/train.csv"
TEST_DATA_PATH = "./data/test.csv"

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bahk_insung/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Load Datasets

In [12]:
train_df = pl.read_csv(TRAIN_DATA_PATH, separator=",", has_header=True)
train_df

Unnamed: 0_level_0,description_x,description_y,ticker_x,ticker_y,same_security
i64,str,str,str,str,bool
0,"""first trust do…","""first trust dj…","""FDN""","""FDN""",true
1,"""schwab intl la…","""schwab strateg…","""FNDF""","""FNDF""",true
2,"""vanguard small…","""vanguard small…","""VSMAX""","""VSCIX""",false
3,"""duke energy co…","""duke energy co…","""DUK""","""DUK""",true
4,"""visa inc class…","""visa inc.""","""V""","""V""",true
5,"""ford motor co …","""ford motor co""","""F""","""F""",true
6,"""united states …","""united sts stl…","""X""","""X""",true
7,"""vanguard total…","""vanguard total…","""BNDX""","""BNDX""",true
8,"""schwab strateg…","""schwab us smal…","""SCHA""","""SCHA""",true
9,"""mfs value fd c…","""mfs value fund…","""MEIIX""","""MEIIX""",true


In [13]:
test_df = pl.read_csv(TEST_DATA_PATH, separator=",", has_header=True)
test_df

test_id,description_x,description_y,same_security
i64,str,str,str
0,"""semtech corp""","""semtech corpor…",
1,"""vanguard mid c…","""vanguard midca…",
2,"""spdr gold trus…","""spdr gold trus…",
3,"""vanguard total…","""vanguard total…",
4,"""oakmark intern…","""oakmark intern…",
5,"""pfizer inc div…","""pfizer inc com…",
6,"""spartan global…","""sptn glb xus i…",
7,"""vanguard total…","""vanguard total…",
8,"""banco latinoam…","""banco latinoam…",
9,"""baidu inc fadr…","""baidu inc spon…",


# Train & Test Datasets

In [15]:
X_train = train_df['description_x'].to_numpy()
X_train = [nltk.word_tokenize(x) for x in X_train]
X_train = np.array(X_train)

y_train = train_df['description_y'].to_numpy()
y_train = [nltk.word_tokenize(x) for x in y_train]
y_train = np.array(y_train)

print(f"""X_train: {X_train.shape}
y_train: {y_train.shape}""")

X_train: (2142,)
y_train: (2142,)


  X_train = np.array(X_train)
  y_train = np.array(y_train)


In [16]:
X_train

array([list(['first', 'trust', 'dow', 'jones', 'internet']),
       list(['schwab', 'intl', 'large', 'company', 'index', 'etf']),
       list(['vanguard', 'small', 'cap', 'index', 'adm']), ...,
       list(['pimco', 'investment', 'grade', 'corporate', 'bond', 'fund', '-', 'class', 'a']),
       list(['eli', 'lilly', '&', 'co', 'com']),
       list(['dfa', 'comm', 'strategy', 'i'])], dtype=object)

In [6]:
X_test = test_df['description_x'].to_numpy()
y_test = test_df['description_y'].to_numpy()
print(f"""X_test: {X_test.shape}
y_test: {y_test.shape}""")

X_test: (516,)
y_test: (516,)


# Data Vectorizing

In [30]:
train_vector = vectorizer.fit_transform(X_train, y_train).toarray()
print(train_vector.shape)

(2142, 1420)


In [33]:
test_vector = vectorizer.transform(X_test)
print(test_vector)

  (0, 372)	1
  (1, 289)	1
  (1, 657)	1
  (1, 842)	1
  (1, 1335)	1
  (2, 580)	2
  (2, 1129)	1
  (2, 1172)	1
  (2, 1284)	1
  (3, 112)	1
  (3, 252)	1
  (3, 657)	1
  (3, 1262)	1
  (3, 1335)	1
  (4, 326)	1
  (4, 553)	1
  (4, 690)	1
  (4, 917)	1
  (5, 31)	1
  (5, 416)	1
  (5, 649)	1
  (5, 973)	1
  (6, 118)	1
  (6, 324)	1
  (6, 495)	1
  :	:
  (511, 61)	1
  (511, 191)	1
  (511, 343)	1
  (511, 649)	1
  (511, 722)	1
  (511, 1114)	1
  (512, 252)	1
  (512, 485)	1
  (512, 657)	1
  (512, 690)	1
  (512, 1262)	1
  (512, 1335)	1
  (513, 479)	1
  (513, 619)	1
  (513, 728)	1
  (513, 1390)	1
  (514, 119)	1
  (514, 252)	1
  (514, 326)	1
  (514, 553)	1
  (514, 577)	1
  (514, 1237)	1
  (515, 617)	1
  (515, 1100)	1
  (515, 1283)	1
