In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [10]:
import tensorflow as tf
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')
print(tf.__version__)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
2.2.0


In [3]:
df_en_test = pd.read_csv('/content/drive/My Drive/abbyy/en_test')
df_ru_test = pd.read_csv('/content/drive/My Drive/abbyy/ru_test')
df_en_train = pd.read_csv('/content/drive/My Drive/abbyy/en_train')
df_ru_train = pd.read_csv('/content/drive/My Drive/abbyy/ru_train')

In [4]:
df_en_test.values.shape

(422, 3)

In [5]:
target_names = np.unique(df_en_train.target)
print(target_names)

['A1' 'A11' 'A12' 'A14' 'A16' 'A17' 'A22' 'A4' 'A7' 'A8' 'A9']


In [6]:
X_train_ru, X_test_ru = df_ru_train['text'].values, df_ru_test['text'].values
y_train_ru, y_test_ru = df_ru_train['target'].values, df_ru_test['target'].values
X_train_en, X_test_en = df_en_train['text'].values, df_en_test['text'].values
y_train_en, y_test_en = df_en_train['target'].values, df_en_test['target'].values

### Keyword extraction and writing them to csv files

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import string

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []
    
    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    return results

def preprocess(text):
  lowercased = text.translate(str.maketrans("", "", string.punctuation)).lower()
  return ''.join([c for c in lowercased if not c.isdigit()])

def get_keywords_df(X_train, y_train, stop_words):
  cv = CountVectorizer(max_df=0.8, stop_words=stop_words)
  word_count_vector = cv.fit_transform(X_train)
  tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
  tfidf_transformer.fit(word_count_vector)

  topics_doc = {}
  for text, label in zip(X_train, y_train):
    if label not in topics_doc:
      topics_doc[label] = text
    else:
      topics_doc[label] += " " + text

  keywords_df = pd.DataFrame()
  feature_names = cv.get_feature_names()

  for topic_label, topic_text in topics_doc.items():
    topic_text = preprocess(topic_text)
    tf_idf_vector = tfidf_transformer.transform(cv.transform([topic_text]))
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    keywords = extract_topn_from_vector(feature_names, sorted_items, 20)

    print("\n=====Doc=====")
    print(topic_text[:100])
    print("\n===Keywords===")
    for keyword in keywords:
        keywords_df = keywords_df.append({'keyword': keyword, 'topic': topic_label}, ignore_index=True)
        print(keyword, keywords[keyword])
  return keywords_df

In [57]:
keywords_ru = get_keywords_df(X_train_ru, y_train_ru, stopwords.words("russian") + ["это"])


=====Doc=====
в ответ на  европейское турне  партии регионов тимошенко обратится в суд  украина  экспремьерминистр

===Keywords===
заявил 0.235
президент 0.182
года 0.162
млрд 0.149
россии 0.143
сша 0.141
украины 0.128
году 0.124
газа 0.12
также 0.112
рф 0.106
президента 0.101
словам 0.097
сообщает 0.093
сказал 0.091
глава 0.09
изза 0.088
долларов 0.086
страны 0.085
власти 0.08

=====Doc=====
электронная книга wexler t компания wexler объявляет о старте продаж новой электронной книги wexler 

===Keywords===
мм 0.207
волос 0.162
позволяет 0.143
благодаря 0.136
компания 0.118
линзы 0.11
модель 0.105
компании 0.104
также 0.095
качества 0.086
обеспечивает 0.076
высокой 0.076
система 0.073
вес 0.071
линз 0.07
кг 0.069
продукции 0.068
является 0.066
двери 0.065
модели 0.065

=====Doc=====
комиссия по правам человека шестидесятая сессия пункт  повестки дня вопрос о нарушении прав человека

===Keywords===
объединенных 0.14
наций 0.14
организации 0.14
миростроительству 0.133
безопасности 0.132

In [58]:
keywords_ru.head()

Unnamed: 0,keyword,topic
0,заявил,A8
1,президент,A8
2,года,A8
3,млрд,A8
4,россии,A8


In [59]:
keywords_ru.to_csv('/content/drive/My Drive/abbyy/keywords_ru.csv')

In [60]:
keywords_en = get_keywords_df(X_train_en, y_train_en, stopwords.words("english"))


=====Doc=====
exchange rate disorder new york – two troubling features of the ongoing economic recovery are the de

===Keywords===
peacebuilding 0.2
children 0.168
nt 0.152
international 0.143
council 0.139
united 0.138
nations 0.136
commission 0.133
security 0.125
countries 0.111
development 0.109
conflict 0.108
would 0.097
states 0.096
state 0.095
political 0.094
people 0.092
armed 0.091
also 0.086
world 0.085

=====Doc=====
do you remember the debut of napster  say goodbye to cds and tape cassettes  for those that still ha

===Keywords===
taft 0.307
convention 0.231
committee 0.222
parties 0.2
secretariat 0.168
meeting 0.137
sce 0.116
shall 0.115
mine 0.113
conference 0.112
states 0.103
article 0.093
roosevelt 0.086
azerbaijan 0.084
stockholm 0.082
station 0.075
would 0.075
united 0.075
pops 0.072
scottish 0.07

=====Doc=====
have you heard about formbased codes  new zoning tool to be utilized with plan build live cincinnati

===Keywords===
nt 0.237
cialis 0.161
viagra 0.148
hotel 

In [61]:
keywords_en.head()

Unnamed: 0,keyword,topic
0,peacebuilding,A1
1,children,A1
2,nt,A1
3,international,A1
4,council,A1


In [62]:
keywords_en.to_csv('/content/drive/My Drive/abbyy/keywords_en.csv')

### Finding of the most frequent words and writing them to file

In [20]:
def write_most_frequent_words(X_train, stopwords, filename, count=50):
  count_dict = dict()
  for text in X_train:
    prepr_text = preprocess(text)
    for word in prepr_text.split():
      if word not in count_dict:
        count_dict[word] = 0
      if word not in stopwords:
        count_dict[word] += 1
  sorted_dict = sorted([(count, word) for word, count in count_dict.items()], reverse=True)[:count]
  
  with open(filename, 'w') as fout:
    for _, word in sorted_dict:
      fout.write(word + "\n")

In [21]:
write_most_frequent_words(X_train_ru, stopwords.words("russian") + ["это"], '/content/drive/My Drive/abbyy/frequent_words_ru')
write_most_frequent_words(X_train_en, stopwords.words("english"), '/content/drive/My Drive/abbyy/frequent_words_en')

In [22]:
!cat '/content/drive/My Drive/abbyy/frequent_words_ru'

—
«
»
также
–
которые
года
время
м
является
организации
который
очень
г
российской
федерации
могут
br
лет
россии
т
соответствии
которых
работы
других
человека
связи
этих
области
случае
однако
безопасности
году
“
сша
образом
”
своей
жизни
деятельности
которая
развития
имеет
например
должны
системы
поэтому
статьи
наций
статья
