In [3]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score
from nltk.stem.porter import PorterStemmer
from collections import Counter
from pandarallel import pandarallel

# Set up pandarallel so the `parallel_*` DataFrame methods used further down
# (e.g. `parallel_applymap`) are available; progress bars are switched on.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
pip install gdown

Collecting gdown
  Downloading gdown-4.5.3.tar.gz (14 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: gdown
  Building wheel for gdown (pyproject.toml) ... [?25ldone
[?25h  Created wheel for gdown: filename=gdown-4.5.3-py3-none-any.whl size=14821 sha256=e360430b915614f5cadf5c243a96fb9e5eae1cc6c9da85880cbe601b5b3e2297
  Stored in directory: /root/.cache/pip/wheels/94/8d/0b/bdcd83555c3555f91a33f6c2384428d9f163c7d75ab0d272b4
Successfully built gdown
Installing collected packages: gdown
Successfully installed gdown-4.5.3
[0mNote: you may need to restart the kernel to use updated packages.


In [5]:
import gdown
# Labels file (columns `_id` and `label`, per the preview further down) hosted
# on the project's GitHub branch.
url = 'https://github.com/MADE-TEAM-PROJECT-1/clusterization_model/raw/tfidf%2Bsvd%2Bkmeans/labels.csv'
# No output path is given; in the recorded run the file was saved as
# `labels.csv` in the working directory.
gdown.download(url, fuzzy=False)

Downloading...
From: https://github.com/MADE-TEAM-PROJECT-1/clusterization_model/raw/tfidf%2Bsvd%2Bkmeans/labels.csv
To: /kaggle/working/labels.csv
30.0MB [00:00, 167MB/s]                    


'labels.csv'

In [6]:
# The download cell saves `labels.csv` into the current working directory, so
# read it with a relative path instead of a Kaggle-specific absolute one.
df = pd.read_csv('labels.csv')

In [7]:
# Google Drive download link for the article archive; in the recorded run the
# file was saved as `data.zip` in the working directory.
url = 'https://drive.google.com/u/0/uc?id=1yjeG6-kIpjoxFA75M5wUuHlsmJhpGnfw&export=download'
gdown.download(url, fuzzy=False)

Downloading...
From: https://drive.google.com/u/0/uc?id=1yjeG6-kIpjoxFA75M5wUuHlsmJhpGnfw&export=download
To: /kaggle/working/data.zip
100%|██████████| 546M/546M [00:02<00:00, 233MB/s] 


'data.zip'

In [8]:
!7z x /kaggle/working/data.zip


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=C.UTF-8,Utf16=on,HugeFiles=on,64 bits,4 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan /kaggle/workin                        1 file, 546094231 bytes (521 MiB)

Extracting archive: /kaggle/working/data.zip
--
Path = /kaggle/working/data.zip
Type = zip
Physical Size = 546094231

      1% 5 - data/part_1_clean.jso                                2% 5 - data/part_1_clean.jso                                3% 5 - data/part_1_clean.jso                                4% 5 - data/part_1_clean.jso                                6% 5 - data/part_1_clean.jso                                7% 5 - data/part_1_clean.jso                                8% 5 - data/part_1_clean.jso                                9% 5 - data/part_1_clean.jso                               11% 5 - data/part_1_clean.jso                               12% 5 - data/part_1_clean.j

In [9]:
# Preview the first rows of the labels table.
df.head()

Unnamed: 0,_id,label
0,53e99784b7602d9701f3e151,23
1,53e99784b7602d9701f3e15d,21
2,53e99784b7602d9701f3f411,13
3,53e99792b7602d9701f5af1a,34
4,53e99792b7602d9701f5b0a5,35


In [10]:
# Columns removed from every article part right after it is loaded.
COLUMNS_TO_DROP = ['year', 'n_citation', 'references', 'authors']
# Number of `data/part_<n>_clean.json` files that are read and concatenated.
NUM_PARTS = 3
# Seed handed to TruncatedSVD and to the silhouette-score subsampling.
RANDOM_STATE = 42
# Number of components kept by TruncatedSVD.
REDUCED_DIMENSION = 20
# Number of clusters; in the live code it only sets the x-axis ticks drawn by
# `plot_clusters`.
NUM_CLUSTERS = 50

def plot_clusters(cluster_ids, cluster_sizes, cluster_names):
    """Draw a bar chart of cluster sizes with one vertical caption per bar.

    Args:
        cluster_ids: x positions of the bars.
        cluster_sizes: bar heights, in the same order as ``cluster_ids``.
        cluster_names: captions, paired with the bars in drawing order.
    """
    _, ax = plt.subplots(figsize=(18, 7))
    ax.bar(cluster_ids, cluster_sizes)
    ax.set_xlabel('Cluster id')
    ax.set_xticks(range(NUM_CLUSTERS))
    ax.set_title('Cluster sizes')

    for patch, caption in zip(ax.patches, cluster_names):
        # Anchor the caption a quarter of the way into the bar, just above its top.
        text_x = patch.get_x() + patch.get_width() / 4
        text_y = patch.get_y() + patch.get_height()*1.05
        ax.text(text_x, text_y, caption, rotation='vertical', color='black', va='bottom', fontsize=10)

    plt.show()

def get_score(X, cluster_labels, sample_size=10000):
    """Return the silhouette score of a clustering, estimated on a subsample.

    Args:
        X: feature matrix the clustering was computed on.
        cluster_labels: cluster assignment for each row of ``X``.
        sample_size: number of rows ``silhouette_score`` samples for the estimate.

    Returns:
        The value returned by ``silhouette_score``.
    """
    # Score the labels supplied by the caller instead of reaching into a global
    # pipeline for a 'kmeans' step, so the function works for any clustering.
    return silhouette_score(X, cluster_labels, sample_size=sample_size, random_state=RANDOM_STATE)

def get_text_data(file_path):
    """Load one article part and build the text column used for vectorisation.

    Args:
        file_path: path to a JSON file readable by ``pd.read_json``. It must
            provide the ``COLUMNS_TO_DROP`` columns plus ``title``,
            ``abstract``, ``venue``, ``keywords`` and ``fos``; ``keywords`` and
            ``fos`` hold sequences of strings.

    Returns:
        DataFrame with the remaining columns, where ``keywords`` and ``fos``
        are space-joined strings and ``text`` is ``keywords``, ``fos`` and
        ``abstract`` joined by single spaces.
    """
    data = pd.read_json(file_path, dtype={'title': 'string', 'abstract': 'string'}).drop(COLUMNS_TO_DROP, axis=1)
    for column in ('keywords', 'fos'):
        data[column] = data[column].apply(lambda terms: ' '.join(terms)).astype('string')

    # Column-wise concatenation instead of a Python-level row loop. A missing
    # abstract contributes an empty string rather than the literal text "<NA>",
    # which would otherwise end up in the vocabulary.
    data['text'] = data['keywords'].str.cat([data['fos'], data['abstract']], sep=' ', na_rep='').astype('string')

    # `venue` is discarded without being used, so it is dropped unparsed.
    return data.drop(['title', 'abstract', 'venue'], axis=1)

In [11]:
# Read every part file and stack the frames under one fresh 0..n-1 index.
articles = pd.concat(
    [get_text_data(f'data/part_{part}_clean.json') for part in range(1, NUM_PARTS + 1)],
    ignore_index=True,
)

articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1084405 entries, 0 to 1084404
Data columns (total 4 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   _id       1084405 non-null  object
 1   keywords  1084405 non-null  string
 2   fos       1084405 non-null  string
 3   text      1084405 non-null  string
dtypes: object(1), string(3)
memory usage: 33.1+ MB


In [12]:
#articles_trunc = articles.loc[articles.index < 500000]

In [13]:
#df_articles = pd.merge(articles, df)
#df_articles.head()

In [14]:
%%time

def normalize(sentence):
    """Return `sentence` with every whitespace-separated token Porter-stemmed.

    The stemmed tokens are joined back together with single spaces.
    """
    # Imported inside the function — presumably so the pandarallel worker
    # processes that run it can resolve the name; confirm before hoisting.
    from nltk.stem.porter import PorterStemmer

    stem = PorterStemmer().stem
    return ' '.join(map(stem, sentence.split()))

# Stem only the text columns. Running the stemmer over `_id` as well wastes a
# quarter of the work (one of four columns) and can rewrite identifiers: Porter
# stemming lowercases tokens and strips suffixes such as a trailing "e" or
# "ed", which would break any later join with the labels on `_id`.
articles = articles[['_id']].join(articles.drop(columns=['_id']).parallel_applymap(normalize))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2168812), Label(value='0 / 2168812…

CPU times: user 1min 2s, sys: 10.3 s, total: 1min 13s
Wall time: 46min 51s


In [15]:
# Display the stemmed frame (pandas truncates the rendering).
articles

Unnamed: 0,_id,keywords,fos,text
0,53e99784b7602d9701f3e151,handwrit recognit prototyp imag segment comput...,intellig charact recognit pattern recognit com...,handwrit recognit prototyp imag segment comput...
1,53e99784b7602d9701f3e15d,sequenti circuit statist distribut set-up time...,delay calcul time failur mont carlo method seq...,sequenti circuit statist distribut set-up time...
2,53e99784b7602d9701f3f411,internet hypermedia markup languag inform reso...,xml base world wide web xml framework xml encr...,internet hypermedia markup languag inform reso...
3,53e99792b7602d9701f5af1a,cognit multi-ag system ubiquit comput adips-da...,syma comput scienc symbiot comput multi-ag sys...,cognit multi-ag system ubiquit comput adips-da...
4,53e99792b7602d9701f5b0a5,balanc scorecard,leas comput scienc balanc scorecard busi admin...,balanc scorecard leas comput scienc balanc sco...
...,...,...,...,...
1084400,6052d1c79e795e222b127d58,sketch inform retriev solid model task analysi...,train set 3d model task analysi inform retriev...,sketch inform retriev solid model task analysi...
1084401,6052fa249e795e222b13be15,hash function random oracl discret mathemat co...,discret mathemat time space comput scienc rand...,hash function random oracl discret mathemat co...
1084402,6054328e9e795e40330e1fa2,stereopsi comput vision daytim artifici intell...,network on comput vision comput scienc stereop...,stereopsi comput vision daytim artifici intell...
1084403,605828249e795e357b471758,search engin data as a servic autonom system (...,search engin situat awar comput scienc interne...,search engin data as a servic autonom system (...


In [16]:
# #vectorizer = CountVectorizer(max_df=0.7, min_df=15, ngram_range=(1, 2), stop_words='english')
# vectorizer = TfidfVectorizer(max_df=0.5, min_df=10, ngram_range=(1, 2), stop_words='english')
# svd = TruncatedSVD(n_components=REDUCED_DIMENSION, random_state=RANDOM_STATE)
# #kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=RANDOM_STATE)

# Unigram + bigram counts; `max_df` / `min_df` prune terms that are very common
# or very rare across the corpus.
# NOTE(review): the text is Porter-stemmed before it reaches this vectoriser,
# while stop_words='english' is a list of unstemmed words — stemmed forms of
# stop words may slip through. Confirm this is acceptable.
vectorizer = CountVectorizer(max_df=0.4, min_df=15, ngram_range=(1, 2), stop_words='english')
# Reduce the count matrix to REDUCED_DIMENSION components.
svd = TruncatedSVD(n_components=REDUCED_DIMENSION, random_state=RANDOM_STATE)
#lr= LogisticRegression(..., n_jobs=-1, random_state=RANDOM_STATE)

# Feature pipeline only: the classifier is fitted separately on its output.
pipe = Pipeline([
    ('cv', vectorizer),
    ('svd', svd)
    #('lr', lr)
])

In [17]:
%%time
# Fit the vectoriser and the SVD on the stemmed text.
pipe.fit(articles.text)

#cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)

# NOTE(review): this sends the whole corpus through the fitted pipeline a
# second time. `pipe.fit_transform` would do both steps in one pass — confirm
# its output matches fit + transform on the installed scikit-learn before
# switching.
articles_compressed = pipe.transform(articles.text)
print(articles_compressed.shape)

(1084405, 20)
CPU times: user 14min 9s, sys: 29.1 s, total: 14min 38s
Wall time: 14min 18s


In [18]:
# Display the reduced feature matrix (numpy abbreviates the rendering).
articles_compressed

array([[ 1.88034934, -1.81758231,  2.18749301, ...,  0.54535266,
        -1.46390812,  0.7414943 ],
       [ 9.46467805, -2.41052907,  2.47283802, ..., -0.19659715,
         6.4791655 ,  6.37901079],
       [ 3.52039019, -1.69706344, -3.67305629, ...,  1.75894639,
        -0.55974357,  1.07961953],
       ...,
       [ 5.284444  ,  4.19135315,  2.96967699, ..., -1.17171912,
         1.75348914, -0.66394003],
       [ 7.8384693 , -1.05813383, -4.16587054, ...,  0.01757972,
         0.62331992, -1.23747047],
       [ 2.50348714, -1.23869986, -0.39138076, ...,  0.1697864 ,
        -0.92006701,  0.20827713]])

In [19]:
# NOTE(review): the labels are taken positionally — this assumes the rows of
# `df` are in exactly the same order as the rows of `articles`; nothing here
# joins the two on `_id`. Confirm before trusting the metrics.
label = df['label'].to_numpy()

In [20]:
# Display the label array (numpy abbreviates the rendering).
label

array([23, 21, 13, ...,  8, 45, 45])

In [21]:
from sklearn.model_selection import train_test_split

# Features are the SVD-reduced vectors; targets are the labels read from
# labels.csv.
X = articles_compressed
y = label
# Hold out 30% of the rows for evaluation.
# NOTE(review): the split is seeded with 0 rather than RANDOM_STATE and is not
# stratified by label — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# Base estimator that the next cell wraps in a one-vs-rest classifier; every
# hyperparameter except `n_jobs` is left at its default.
logreg = LogisticRegression(n_jobs = -1)

In [23]:
from sklearn.multiclass import OneVsRestClassifier

# One binary logistic regression is fitted per label. Those fits are
# independent, so the parallelism is requested on the wrapper: `n_jobs` on the
# wrapped LogisticRegression does not spread a single binary fit across cores.
ovr_class = OneVsRestClassifier(logreg, n_jobs=-1)
ovr_class.fit(X_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression(n_jobs=-1))

In [24]:
from sklearn.metrics import accuracy_score, f1_score

# Predict once and derive both metrics from the same predictions;
# `ovr_class.score(X_test, y_test)` would run a second full prediction pass
# only to compute the same accuracy.
y_pred = ovr_class.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average = 'weighted')
print('Accuracy of logreg OneVsRest classifier on test set: {:.6f}'.format(accuracy))
print('Weighted F-1 score of logreg OneVsRest classifier on test set: {:.6f}'.format(f1))

Accuracy of logreg OneVsRest classifier on test set: 0.582810
Weighted F-1 score of logreg OneVsRest classifier on test set: 0.580681
