In [1]:
import numpy as np
import pandas as pd

import string
import re
import gensim

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
# Fetch the NLTK resources this notebook relies on, in a fixed order.
for _resource in ('stopwords', 'wordnet', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(_resource)

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# English stop-word list from NLTK; read by remove_stop_words() below.
STOP_WORDS = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mikhailzaytsev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mikhailzaytsev/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mikhailzaytsev/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mikhailzaytsev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
df = pd.read_csv('abstracts.csv', header=None) # load the data; header=None keeps the first row as data
df.head()

Unnamed: 0,0
0,"""An increasing number of high-performance netw..."
1,Precision and accuracy of portable meter Accut...
2,We consider the following nonlinear Schrödinge...
3,"""High-throughput assays for enzyme catalysis t..."
4,"""Three alternative routes, using the heterobif..."


In [9]:
list(df.iloc[0]) # example of a dataset element

['"An increasing number of high-performance networks provision dedicated channels through circuit switching or MPLS/GMPLS techniques to support large data transfer. The link bandwidths in such networks are typically shared by multiple users through advance reservation, resulting in varying bandwidth availability in future time. Developing efficient scheduling algorithms for advance bandwidth reservation has become a critical task to improve the utilization of network resources and meet the transport requirements of application users. We consider an exhaustive combination of different path and bandwidth constraints and formulate four types of advance bandwidth scheduling problems, with the same objective to minimize the data transfer end time for a given transfer request with a prespecified data size: 1) fixed path with fixed bandwidth (FPFB); 2) fixed path with variable bandwidth (FPVB); 3) variable path with fixed bandwidth (VPFB); and 4) variable path with variable bandwidth (VPVB). 

In [11]:
# Keep only texts that are at least MIN_TEXT_LEN characters long.
MIN_TEXT_LEN = 100  # texts shorter than this many characters are discarded

# .str.len() yields NaN for missing values, so such rows fail the comparison
# and are dropped too, instead of raising inside len().
# A boolean mask stays correct whatever the current index labels are (unlike
# positions from np.where handed to a label-based drop), and rebinding `df`
# instead of mutating it in place keeps the cell safe to re-run.
long_enough = df.iloc[:, 0].str.len() >= MIN_TEXT_LEN
df = df.loc[long_enough].reset_index(drop=True)

In [20]:
# Text preprocessing helpers

def remove_nonASCII(text):
    """Keep only the characters of ``text`` that occur in ``string.printable``.

    Every other character (all non-ASCII characters included) is dropped;
    the order of the remaining characters is unchanged.
    """
    kept_chars = filter(lambda ch: ch in string.printable, text)
    return ''.join(kept_chars)

def remove_URL(text):
    """Strip links from ``text``.

    Removes every occurrence of ``http`` or ``www.`` together with the run
    of non-whitespace characters (at least one) that follows it.
    """
    return re.sub(r'http\S+|www\.\S+', r'', text)

def remove_punct(text):
    """Delete every character that is neither a word character nor whitespace.

    Word characters (``\\w``) include letters, digits and the underscore, so
    underscores are kept.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

def remove_individ_letters(text):
    """Remove stand-alone single-character words from ``text``.

    A token is removed when it consists of exactly one word character
    (``\\w``: letter, digit or underscore) with a word boundary on each side.
    Surrounding whitespace is left in place.

    The previous pattern (``\\b\\w.?\\b``) also deleted two-character words
    and could swallow the character following a single letter, which did not
    match the documented intent of removing individual letters only.
    """
    return re.sub(r'\b\w\b', '', text)

def remove_numbers(text):
    """Remove all digits (anything matched by ``\\d``) from ``text``.

    Uses ``\\d+`` rather than ``\\d*``: the latter also matches the empty
    string at every position, triggering a pointless substitution there.
    The returned string is the same either way.
    """
    return re.sub(r'\d+', '', text)

def remove_stop_words(text):
    """Drop stop words from ``text``.

    The text is split on whitespace; tokens that exactly equal an entry of
    the module-level ``STOP_WORDS`` (case-sensitive, punctuation included)
    are discarded and the remaining tokens are re-joined with single spaces.
    """
    # Build a set once per call so each membership test is O(1) instead of a
    # linear scan over the STOP_WORDS list for every token.
    stop_words = set(STOP_WORDS)
    return ' '.join(token for token in text.split() if token not in stop_words)

def process_text(text):
    """Run the basic cleaning helpers over ``text`` and return the result.

    The helpers are applied in this order: remove_nonASCII, remove_URL,
    remove_numbers, remove_punct, remove_individ_letters.
    """
    # remove_URL runs before remove_punct on purpose: its pattern relies on
    # the unbroken run of non-whitespace characters that makes up a link.
    pipeline = (
        remove_nonASCII,
        remove_URL,
        remove_numbers,
        remove_punct,
        remove_individ_letters,
    )
    for step in pipeline:
        text = step(text)
    return text

def get_wordnet_pos(word):
    """Translate the nltk.pos_tag tag of ``word`` into a WordNet POS constant.

    Only the first letter of the tag is inspected: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun. The word is
    tagged on its own, i.e. without sentence context.
    """
    _, tag = nltk.pos_tag([word])[0]  # pos_tag returns [(word, tag)]
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns.
    return wordnet.NOUN

def clean_text(text, lemmatize=True, remove_stopwords=True):
    """Clean a raw text and optionally lemmatize it.

    Parameters
    ----------
    text : str
        Raw input text.
    lemmatize : bool, default True
        If true, tokenize the cleaned text with ``word_tokenize`` and
        lemmatize each token with WordNet, using the part of speech returned
        by ``get_wordnet_pos``; tokens are joined with single spaces.
    remove_stopwords : bool, default True
        If true, lower-case the text and remove stop words. Note that the
        text is lower-cased only when this flag is set.

    Returns
    -------
    str
        The cleaned (and, if requested, lemmatized) text.
    """
    if remove_stopwords:
        # First pass on the raw, lower-cased text. It is kept because entries
        # that themselves contain punctuation (presumably contractions such
        # as "don't" -- confirm against the NLTK list) can only match before
        # punctuation is stripped.
        text = remove_stop_words(text.lower())
    text = process_text(text)
    if remove_stopwords:
        # Second pass: stop words that were glued to punctuation ("and,",
        # "(the") did not match as whitespace-separated tokens above and only
        # become recognisable once process_text has removed the punctuation.
        text = remove_stop_words(text)
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = word_tokenize(text)
        return ' '.join(
            lemmatizer.lemmatize(token, pos=get_wordnet_pos(token))
            for token in tokens
        )
    return text

In [31]:
# Clean every text in the first column with clean_text's default settings
# (stop-word removal and lemmatization both enabled).
df_cleaned = df.iloc[:, 0].apply(clean_text)

In [33]:
list(df.iloc[0]) # raw first row, for comparison with its cleaned version

['"An increasing number of high-performance networks provision dedicated channels through circuit switching or MPLS/GMPLS techniques to support large data transfer. The link bandwidths in such networks are typically shared by multiple users through advance reservation, resulting in varying bandwidth availability in future time. Developing efficient scheduling algorithms for advance bandwidth reservation has become a critical task to improve the utilization of network resources and meet the transport requirements of application users. We consider an exhaustive combination of different path and bandwidth constraints and formulate four types of advance bandwidth scheduling problems, with the same objective to minimize the data transfer end time for a given transfer request with a prespecified data size: 1) fixed path with fixed bandwidth (FPFB); 2) fixed path with variable bandwidth (FPVB); 3) variable path with fixed bandwidth (VPFB); and 4) variable path with variable bandwidth (VPVB). 

In [40]:
df_cleaned.iloc[0] # first element of the cleaned series

'increase number highperformance network provision dedicate channel circuit switch mplsgmpls technique support large data transfer link bandwidth network typically share multiple user advance reservation result vary bandwidth availability future time develop efficient schedule algorithm advance bandwidth reservation become critical task improve utilization network resource meet transport requirement application user consider exhaustive combination different path bandwidth constraint formulate four type advance bandwidth schedule problem objective minimize data transfer end time give transfer request prespecified data size fix path fix bandwidth fpfb fix path variable bandwidth fpvb variable path fix bandwidth vpfb variable path variable bandwidth vpvb vpfb vpvb consider two subcases path switch delay negligible nonnegligible propose optimal algorithm schedule problem except fpvb vpvb nonnegligible path switch delay proven npcomplete nonapproximable tackle heuristic performance superior