### Loading the dataset

In [3]:
import pandas as pd

# Read the raw training file. Every non-blank line is expected to hold exactly
# three tab-separated fields: the first is discarded, the second becomes the
# 'Conference' column and the third becomes the 'Topic' column.
# NOTE(review): no `encoding=` is passed, so decoding uses the platform default
# — confirm the file's encoding and pass it explicitly.
with open('DBLPTrainset.txt', 'r') as file:
    lines = file.readlines()

records = []
for line_number, line in enumerate(lines, start=1):
    stripped = line.strip()
    if not stripped:
        # A blank line (e.g. a trailing newline at end of file) carries no record.
        continue
    fields = stripped.split('\t')
    if len(fields) != 3:
        # Same exception type as a failed tuple-unpack, but it reports where.
        raise ValueError(
            f"DBLPTrainset.txt line {line_number}: expected 3 tab-separated "
            f"fields, found {len(fields)}"
        )
    _, conference, topic = fields
    records.append({'Conference': conference, 'Topic': topic})

# Naming the columns keeps data['Topic'] / data['Conference'] valid even when
# no records were read.
data = pd.DataFrame(records, columns=['Conference', 'Topic'])
print(data.head())

  Conference                                              Topic
0      ISCAS  Scalable Serial-parallel Multiplier over GF(2m...
1   SIGGRAPH                                Plenoptic sampling.
2      ISCAS  Sensitivity and uniformity of a 0.18micrometer...
3        WWW      A survey of web archive search architectures.
4      ISCAS  Understanding dynamic behavior of mm-wave CML ...


In [4]:
# Model input is the 'Topic' column; the prediction target is 'Conference'.
X, y = data['Topic'], data['Conference']

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Pre-Processing

In [7]:
# 1. Lowercasing

# Lower-case every string in both splits using the pandas string accessor.
X_train, X_test = (split.str.lower() for split in (X_train, X_test))

In [8]:
# 2. Tokenization

import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on the Punkt sentence-tokenizer data. Older NLTK
# releases load it from the 'punkt' package, while NLTK 3.8.2 and later look
# for 'punkt_tab' instead; downloading only 'punkt' makes this cell fail with
# a LookupError on a fresh kernel with a current NLTK. Fetch both so the cell
# works regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Replace each string with its list of tokens.
X_train = X_train.apply(word_tokenize)
X_test = X_test.apply(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rames\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [10]:
# 3. Removing punctuation and special characters

import string

def remove_punctuation(tokens):
    """Return ``tokens`` without the entries made up solely of punctuation.

    A token is dropped when every one of its characters is in
    ``string.punctuation``. This covers single marks such as ``.`` or ``(``
    as well as multi-character tokens such as ``...``, ``--`` or the
    two-backtick / two-apostrophe quote tokens. Tokens that contain at
    least one other character (e.g. ``rail-to-rail``) are kept unchanged.

    The previous ``token not in string.punctuation`` check was a substring
    test against the punctuation *string*, so most multi-character
    punctuation tokens slipped through.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    punctuation_chars = set(string.punctuation)
    return [
        token
        for token in tokens
        # all() over an empty token is True, so empty strings are dropped too,
        # which matches what the substring check did.
        if not all(char in punctuation_chars for char in token)
    ]

# Filter punctuation tokens out of every token list in both splits.
X_train, X_test = X_train.apply(remove_punctuation), X_test.apply(remove_punctuation)

In [12]:
# 4. Removing stop words

from nltk.corpus import stopwords
# Fetch the stop-word corpus, then keep the English entries in a set so the
# membership checks in remove_stopwords are cheap.
nltk.download('stopwords')

stop_words = {*stopwords.words('english')}

def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    The order of the surviving tokens is preserved and a new list is returned.
    """
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

X_train = X_train.apply(remove_stopwords)
X_test = X_test.apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rames\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [13]:
# 5. Stemming and Lemmatization

from nltk.stem import PorterStemmer, WordNetLemmatizer
# Download the WordNet corpus used by the lemmatizer.
nltk.download('wordnet')

# One shared instance of each, reused by stem_words / lemmatize_words.
wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()

def stem_words(tokens):
    """Return a new list holding the Porter stem of each token, in order."""
    stems = []
    for word in tokens:
        stems.append(porter_stemmer.stem(word))
    return stems

def lemmatize_words(tokens):
    """Return a new list with each token passed through the WordNet lemmatizer."""
    return list(wordnet_lemmatizer.lemmatize(word) for word in tokens)

# Stem every token, then lemmatize the result — same order for both splits.
# NOTE(review): the lemmatizer receives Porter stems rather than the original
# words — confirm that applying both steps in this order is intended.
X_train = X_train.apply(stem_words).apply(lemmatize_words)
X_test = X_test.apply(stem_words).apply(lemmatize_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rames\AppData\Roaming\nltk_data...


In [14]:
# Show the first five preprocessed token lists as a sanity check.
X_train.head(5)

4307     [use, graphic, processor, high, perform, ir, q...
17179    [two-tier, load, balanc, ospf, wireless, back-...
15486    [cach, mechan, improv, internet, base, mobil, ...
4605             [rail-to-rail, tunabl, cmo, v-i, convert]
9501                [practic, skew, handl, parallel, join]
Name: Topic, dtype: object