In [0]:
!pip install -q sparsesvd numpy nltk pymystem3 

In [0]:
import json
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.semi_supervised import LabelSpreading
import pymystem3
import itertools
import nltk
from nltk.corpus import stopwords
from string import punctuation

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
SAMPLE_DATA = "/content/drive/My Drive/Colab Notebooks/cluster_final_cut_train.json"
DATA_FILE = "/content/drive/My Drive/Colab Notebooks/cosmo_content_storage_final_cut.jsonl"

nltk.download('stopwords')
russian_stopwords = stopwords.words('russian')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
mystem = pymystem3.Mystem() 
def preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens if token not in russian_stopwords and token != " "  and token.strip() not in punctuation]
    
    text = " ".join(tokens)
    
    return text

In [0]:
def validate_with_mappings(preds, target, ids, n_clusters):
    if len(preds) != len(ids):
        print("pred len not eq ids len")
    mapping = {}
    alredy_created = []
    for i in range(n_clusters):
        for pair in zip(preds,ids):
            if i == pair[0] and pair[1] not in alredy_created:
                mapping.update({i:pair[1]})
                alredy_created.append(pair[1])

    if len(mapping.keys()) != n_clusters:
        print(f"Fail {len(mapping.keys()) != n_clusters}")
        import pdb; pdb.set_trace()
    mapped_preds = [mapping[pred] for pred in preds]
    print(accuracy_score(mapped_preds, target))

    # return np.max(accuracy_history)

def get_train_texts(data_map, labels):
    texts = []
    for label in labels:
        values = data_map.get(label, None)
        if values:
            title = values.get('title', "")
            descr = values.get('description', '')
            texts.append(title + " " + descr)

    return texts

def get_train_data(data_map, labeled_id):
  texts = []
  text_labels = []
  for i in data_map:
    title = data_map[i].get('title', "")
    descr = data_map[i].get('description', '')
    texts.append(title + " " + descr)
    if labeled_id.get(i) != None:
      text_labels.append(labeled_id.get(i))
    else:
      text_labels.append(-1)

  return texts, text_labels

    

In [0]:
sample_id_to_target = json.load(open(SAMPLE_DATA))
train_text_id = sample_id_to_target.keys()

data_ = {}
print("Start textnormalization")
all_texts = []
with open(DATA_FILE) as fp:
    line = fp.readline()
    cnt = 1
    while line:
      #if cnt % 100 == 0:      
      print(f"Processed {cnt}: files")
      print(line)
        
      parsed_line = json.loads(line)
      doec_id = parsed_line["doc_id"]
      parsed_line.pop("doc_id")

      if parsed_line.get('description') != None:
        parsed_line['description'] = preprocess_text(parsed_line['description'])

      if parsed_line.get('title') != None:
        parsed_line['title'] = preprocess_text(parsed_line['title'])

      #descr = parsed_line.get('description', '')

      data_.update({str(doec_id): parsed_line})
      line = fp.readline()
      cnt += 1

train_texts = get_train_texts(data_, train_text_id)
train_text_id = [sample_id_to_target[x] for x in train_text_id]
train_x, train_y = get_train_data(data_, sample_id_to_target)

print("TfidfVectorizer ...")
vectorizer = TfidfVectorizer(max_df=500, min_df=8, stop_words=russian_stopwords)
matrix = vectorizer.fit_transform(train_x)
train_matrix = vectorizer.transform(train_texts)
print(matrix.shape)


Start textnormalization
Processed 1: files
{"url": "https://vz.ru/news/2019/12/6/1012187.html", "doc_id": 1000029981939875422, "description": "У России есть необходимые конкурентные преимущества, чтобы стать лидером на рынке водородной энергетики, заявил глава Минэнерго страны Александр Новак. В статье для журнала «Энергетическая политика» он заявил, что «в будущем водород в качестве источника энергии может сыграть одну из ключевых ролей в обеспечении мировых потребностей в чистой и доступной энергии». У России для того, чтобы занять лидирующие позиции в этой области, есть «резервы производственных мощностей, географическая близость к потенциальным потребителям, наличие инфраструктуры транспортировки», передает РИА «Новости». Ведомство Новака уже ведет работу по организации использования водородной энергетики, планируется составить дорожную карту развития. Новак также сообщил, что по итогам 2019 года потребление электроэнергии, вероятно, превысит уровень 2018 года на 0,46%, то есть выр

In [0]:
print(train_y)

[-1, -1, -1, 3059, 57, -1, -1, 425, 825, 2272, 1886, -1, 1076, -1, -1, -1, 602, 178, -1, -1, 1398, -1, 1261, 407, 549, -1, -1, 237, -1, -1, 583, -1, 54, -1, -1, -1, 26, -1, -1, -1, -1, 165, -1, 81, 1619, 2423, -1, -1, -1, 2160, -1, -1, -1, 1118, 965, -1, -1, 1497, -1, -1, -1, -1, -1, -1, 2890, -1, -1, -1, -1, 1823, 269, -1, 263, -1, -1, -1, -1, -1, -1, -1, -1, 367, -1, -1, 1210, -1, 373, -1, 1532, -1, 34, -1, -1, 389, -1, -1, 42, -1, 680, -1, -1, -1, -1, 2837, -1, -1, 391, -1, 62, -1, -1, -1, 791, -1, -1, -1, -1, -1, -1, 1933, -1, -1, -1, -1, 1070, 2918, -1, -1, -1, -1, -1, 109, -1, 374, 477, -1, -1, 863, -1, -1, -1, -1, -1, 2709, -1, -1, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 50, -1, 1417, -1, -1, -1, -1, -1, -1, -1, -1, 195, -1, -1, -1, 25, 1355, 1275, 103, -1, -1, -1, -1, 219, -1, -1, -1, -1, 1033, 1605, -1, -1, -1, -1, 2488, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1417, -1, -1, 1586, 1111, 2566, -1, -1, 1116, -1, -1, -1, 1010, -1, 1147, 91, 

In [0]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="hinge", penalty='l2', alpha=1e-3, random_state=42, max_iter=11, tol=None)
print("Train  SGDClassifier ...")
clf.fit(train_matrix, train_text_id)

SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=6,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=42, shuffle=True, tol=None, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [0]:
predictions = clf.predict(matrix)
print(predictions[3])

3059


In [0]:
result = {}
for n, i in enumerate(data_):
  if i in sample_id_to_target.keys():
    pass
  else:
    result.update({i : int(predictions[n])})

In [0]:
with open("/content/drive/My Drive/Colab Notebooks/ans-5.txt", "w") as f:
  f.write("doc_id,cat\n")
  for key in result.keys():
    f.write("%s,%s\n"%(key,result[key]))

In [0]:
label_prop_model = LabelSpreading(n_jobs=3)
svd = TruncatedSVD(n_components=1000, random_state=44)
features = svd.fit_transform(train_matrix)

In [0]:
rng = np.random.RandomState(42)
rundom_unlabeled_points = rng.rand(len(train_text_id)) < 0.3
labels = np.copy(train_text_id)
labels[rundom_unlabeled_points] = -1
label_prop_model.fit(features, labels)
preds = label_prop_model.predict(features)
print(accuracy_score(preds, train_text_id))

0.9922670690305545


In [0]:
#labels = np.copy(train_y)
label_prop_model.fit(features, train_text_id)

LabelSpreading(alpha=0.2, gamma=20, kernel='rbf', max_iter=30, n_jobs=3,
               n_neighbors=7, tol=0.001)

In [0]:
label_prop_model.fit(features[20000:45000], train_y[20000:45000])

LabelSpreading(alpha=0.2, gamma=20, kernel='rbf', max_iter=30, n_jobs=2,
               n_neighbors=7, tol=0.001)

In [0]:
target_features = svd.fit_transform(matrix)
preds_part_1 = label_prop_model.predict(target_features[0:45000])

In [0]:
preds_part_2 = label_prop_model.predict(target_features[45000:])

In [0]:
result = {}
for n, i in enumerate(data_):
  if i in train_text_id:
    #result.update({i : int(sample_id_to_target[i])})
    pass
  elif n < 45000:
    result.update({i : int(predictions[n])})
  else :
    result.update({i : int(preds_part_2[n-45000])})

In [0]:
print(train_text_id)
print(len(result))

[516, 2054, 328, 2133, 2266, 1036, 632, 950, 697, 1461, 205, 2388, 1618, 103, 527, 870, 693, 1471, 534, 1885, 395, 10, 52, 269, 673, 67, 182, 333, 392, 2219, 2524, 135, 1861, 424, 57, 67, 630, 349, 2707, 103, 150, 362, 953, 136, 47, 955, 2375, 237, 694, 90, 2625, 236, 665, 2483, 390, 2491, 891, 344, 449, 1612, 503, 433, 742, 2244, 1931, 2836, 771, 424, 1200, 1069, 1986, 166, 403, 1449, 395, 1498, 1214, 279, 2104, 54, 182, 624, 1760, 721, 490, 2437, 1605, 2442, 1922, 195, 1145, 1526, 2715, 534, 392, 103, 1022, 1837, 983, 977, 473, 2336, 71, 286, 1271, 1640, 2404, 244, 2718, 878, 657, 268, 141, 215, 1934, 2453, 2307, 251, 765, 711, 2228, 2770, 1727, 1298, 1139, 849, 90, 1954, 349, 42, 2304, 416, 1357, 1404, 359, 1151, 2523, 1825, 1083, 637, 867, 167, 1052, 87, 1314, 103, 908, 1539, 706, 1267, 637, 1448, 42, 42, 27, 1855, 2336, 2332, 1821, 1022, 1682, 301, 1527, 2247, 1476, 1836, 689, 2273, 1068, 180, 1105, 467, 3047, 740, 138, 2976, 511, 808, 637, 2260, 231, 1984, 248, 219, 3017, 147, 23

In [0]:
print(len([x for x in result if result[x] != 195]))

2215
