# Doc2Vec

## Reading Data from CSV

In [2]:
import pandas as pd

# Location of the raw arXiv export, given relative to the notebook's working directory.
csv_file = "../../data/arxiv.csv"
# The file uses ';' as the field separator rather than the default ','.
data = pd.read_csv(csv_file, sep=";")

## Text cleaning
1. Lowercasing - we convert all text to lowercase for uniformity.
2. Remove punctuation - we eliminate unnecessary punctuation marks.
3. Remove numbers - we are modelling topics from the text so numbers do not have any meaning in our case.
4. Remove stopwords - stop words are very common words that carry no meaning or less meaning compared to other words. If we remove these very common, low-information words, we can focus on the important words instead.
5. Lemmatization - we extract the semantic root of a word (lemma) by considering the vocabulary and connections between the meanings of various words.

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
# Fetch the NLTK resources this notebook relies on; packages that are already
# installed are reported as up-to-date rather than downloaded again.
nltk.download('punkt')       # tokenizer models used by word_tokenize
nltk.download('stopwords')   # stop-word lists read via stopwords.words('english')
nltk.download('wordnet')     # WordNet data backing WordNetLemmatizer
nltk.download('punkt_tab')   # NOTE(review): presumably the tokenizer tables required by newer NLTK releases - confirm

[nltk_data] Downloading package punkt to /home/mstrzezon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mstrzezon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mstrzezon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/mstrzezon/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
# Lazily populated cache so the stop-word list and the lemmatizer are built once
# instead of once per processed document.
_NLTK_RESOURCES = {}


def _nltk_resources():
    """Return ``(stop_words, lemmatizer)``, creating both on first use."""
    if not _NLTK_RESOURCES:
        # Both values are constructed before the cache is touched, so a failure
        # (e.g. a missing NLTK corpus) never leaves the cache half-filled.
        _NLTK_RESOURCES.update(
            stop_words=frozenset(stopwords.words('english')),
            lemmatizer=WordNetLemmatizer(),
        )
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Turn a raw text into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase the text, tokenize it with ``word_tokenize``,
    keep only alphanumeric tokens that are neither purely numeric nor English
    stop words, and lemmatize what is left with ``WordNetLemmatizer`` (called
    without a part-of-speech argument).

    Parameters
    ----------
    text : str
        The text to clean. Any non-string value (for example the ``NaN`` that
        pandas uses for a missing cell) is treated as an empty document.

    Returns
    -------
    list of str
        The lemmatized tokens in their original order; ``[]`` for non-string
        input.
    """
    # Missing values arrive as float NaN / None and have no .lower().
    if not isinstance(text, str):
        return []

    stop_words, lemmatizer = _nltk_resources()

    # Lowercase, then tokenize
    tokens = word_tokenize(text.lower())

    # Remove punctuation, numbers and stopwords.
    # isalnum() alone is True for tokens such as "2000", so purely numeric
    # tokens are excluded explicitly; mixed tokens like "3d" are kept.
    filtered_tokens = [
        word for word in tokens
        if word.isalnum() and not word.isnumeric() and word not in stop_words
    ]

    # Lemmatization
    return [lemmatizer.lemmatize(token) for token in filtered_tokens]

In [6]:
# Clean every abstract and store the resulting token list in a new column.
# Only the 'Summary' column is needed, so apply over that Series directly
# instead of iterating over whole rows with DataFrame.apply(axis=1).
data['Processed Summary'] = data['Summary'].apply(preprocess_text)

In [7]:
# Preview the first ten rows to check the new 'Processed Summary' column.
data.head(10)

Unnamed: 0,ID,Title,Summary,Published,PDF Link,Flesch reading ease,Number of words,Processed Summary
0,http://arxiv.org/abs/cs/0002002v1,Uniform semantic treatment of default and auto...,We revisit the issue of connections between tw...,2000-02-03T21:44:57Z,http://arxiv.org/pdf/cs/0002002v1,26.3,195,"[revisit, issue, connection, two, leading, for..."
1,http://arxiv.org/abs/cs/0002003v1,On the accuracy and running time of GSAT,Randomized algorithms for deciding satisfiabil...,2000-02-04T12:53:57Z,http://arxiv.org/pdf/cs/0002003v1,35.47,188,"[randomized, algorithm, deciding, satisfiabili..."
2,http://arxiv.org/abs/cs/0002009v1,Syntactic Autonomy: Why There is no Autonomy w...,Two different types of agency are discussed ba...,2000-02-16T18:09:20Z,http://arxiv.org/pdf/cs/0002009v1,22.85,160,"[two, different, type, agency, discussed, base..."
3,http://arxiv.org/abs/cs/0003008v1,Consistency Management of Normal Logic Program...,This paper presents a method of computing a re...,2000-03-05T10:29:03Z,http://arxiv.org/pdf/cs/0003008v1,45.69,127,"[paper, present, method, computing, revision, ..."
4,http://arxiv.org/abs/cs/0003016v1,Abductive and Consistency-Based Diagnosis Revi...,Diagnostic reasoning has been characterized lo...,2000-03-07T11:39:53Z,http://arxiv.org/pdf/cs/0003016v1,17.3,110,"[diagnostic, reasoning, characterized, logical..."
5,http://arxiv.org/abs/cs/0003020v2,ACLP: Integrating Abduction and Constraint Sol...,ACLP is a system which combines abductive reas...,2000-03-07T22:47:13Z,http://arxiv.org/pdf/cs/0003020v2,21.33,171,"[aclp, system, combine, abductive, reasoning, ..."
6,http://arxiv.org/abs/cs/0003021v1,Relevance Sensitive Non-Monotonic Inference on...,We present a method for relevance sensitive no...,2000-03-08T03:03:36Z,http://arxiv.org/pdf/cs/0003021v1,0.15,143,"[present, method, relevance, sensitive, infere..."
7,http://arxiv.org/abs/cs/0003024v1,A Compiler for Ordered Logic Programs,"This paper describes a system, called PLP, for...",2000-03-08T10:15:51Z,http://arxiv.org/pdf/cs/0003024v1,38.66,120,"[paper, describes, system, called, plp, compil..."
8,http://arxiv.org/abs/cs/0003028v1,Logic Programs with Compiled Preferences,We describe an approach for compiling preferen...,2000-03-08T14:09:56Z,http://arxiv.org/pdf/cs/0003028v1,37.03,179,"[describe, approach, compiling, preference, lo..."
9,http://arxiv.org/abs/cs/0003031v1,Optimal Belief Revision,We propose a new approach to belief revision t...,2000-03-08T15:54:50Z,http://arxiv.org/pdf/cs/0003031v1,31.92,178,"[propose, new, approach, belief, revision, pro..."


## Save the processed text

In [18]:
# Write the frame, including the new 'Processed Summary' column, using the same
# ';' separator as the input file and without the row index.
# NOTE(review): 'Processed Summary' holds Python lists, which CSV stores as their
# string representation; readers of this file presumably need to parse that
# column back into lists (e.g. with ast.literal_eval) - confirm against consumers.
data.to_csv("../../data/arxiv_processed.csv", sep=";", index=False)