**Imports**

In [None]:
import json
from math import floor
from math import log
from random import shuffle
import pandas as pd
import numpy as np
import re
import nltk
import pickle
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from numpy import ravel
from sklearn.metrics import accuracy_score

Filters `original_dataset.json`, keeping only the records whose `categories` field contains a `cs.` category, and writes the result to a new file, `csDataset.json`.

In [None]:
# Read the line-delimited JSON dump: one record per line.
with open('original_dataset.json', 'r') as f:
    data = [json.loads(line) for line in f]

# Keep a record when its category string starts with a "cs." code or
# contains one after a space (i.e. "cs." appears as a non-first category).
filtered = [
    record for record in data
    if record['categories'][:3] == 'cs.' or ' cs.' in record['categories']
]

# Persist the filtered records as a single indented JSON array.
with open('csDataset.json', 'w') as f:
    json.dump(filtered, f, indent=2)

Shuffles the dataset and splits it into two random parts: 80% for training and 20% for testing.

In [None]:
# Load the filtered records written by the previous cell.
with open('csDataset.json', 'r') as f:
    data = json.load(f)

# NOTE: the shuffle is unseeded, so every run produces a different split and
# overwrites trainDataset.json / testDataset.json. Anything cached from an
# earlier split (pickled corpora, TF-IDF matrices) no longer lines up with the
# new files after this cell is re-run.
shuffle(data)
length = len(data)
train_data_size = floor(length * 80/100)

# First 80% -> training set, remaining 20% -> test set.
# (Previously `train_data` was bound to the *test* slice.)
train_data = data[:train_data_size]
test_data = data[train_data_size:]

with open('trainDataset.json', 'w') as f:
    json.dump(train_data, f, indent=2)

with open('testDataset.json', 'w') as f:
    json.dump(test_data, f, indent=2)

**Creating topics array and dictionary**

In [None]:
# arXiv computer-science categories, one "<code>_<human readable name>" entry
# each. The order matters: positions in this list are used as category indices.
d = [
    "cs.AI_Artificial Intelligence",
    "cs.CL_Computation and Language",
    "cs.CC_Computational Complexity",
    "cs.CE_Computational Engineering, Finance, and Science",
    "cs.CG_Computational Geometry",
    "cs.GT_Computer Science and Game Theory",
    "cs.CV_Computer Vision and Pattern Recognition",
    "cs.CY_Computers and Society",
    "cs.CR_Cryptography and Security",
    "cs.DS_Data Structures and Algorithms",
    "cs.DB_Databases",
    "cs.DL_Digital Libraries",
    "cs.DM_Discrete Mathematics",
    "cs.DC_Distributed, Parallel, and Cluster Computing",
    "cs.ET_Emerging Technologies",
    "cs.FL_Formal Languages and Automata Theory",
    "cs.GL_General Literature",
    "cs.GR_Graphics",
    "cs.AR_Hardware Architecture",
    "cs.HC_Human-Computer Interaction",
    "cs.IR_Information Retrieval",
    "cs.IT_Information Theory",
    "cs.LO_Logic in Computer Science",
    "cs.LG_Machine Learning",
    "cs.MS_Mathematical Software",
    "cs.MA_Multiagent Systems",
    "cs.MM_Multimedia",
    "cs.NI_Networking and Internet Architecture",
    "cs.NE_Neural and Evolutionary Computing",
    "cs.NA_Numerical Analysis",
    "cs.OS_Operating Systems",
    "cs.OH_Other Computer Science",
    "cs.PF_Performance",
    "cs.PL_Programming Languages",
    "cs.RO_Robotics",
    "cs.SI_Social and Information Networks",
    "cs.SE_Software Engineering",
    "cs.SD_Sound",
    "cs.SC_Symbolic Computation",
    "cs.SY_Systems and Control",
]

# code -> readable name, e.g. "cs.AI" -> "Artificial Intelligence"
categoriesDict = dict(entry.split('_')[:2] for entry in d)

# Category codes in the same order as `d`.
categories = [entry.split('_')[0] for entry in d]


**Imports `trainJson` and `testJson` files into variables**

In [None]:
# Hand the open file object to pandas instead of the file's text: passing a
# literal JSON string to read_json is deprecated in recent pandas releases.
with open('trainDataset.json', 'r') as f:
    trainJson = pd.read_json(f)

# The test split is used further down (cleaning(testJson)) and has to be
# loaded here as well, otherwise a fresh-kernel run fails with NameError.
with open('testDataset.json', 'r') as f:
    testJson = pd.read_json(f)


**Cleans the title and abstract of each article and keeps the remaining keywords**

In [None]:
def cleanText(text):
    """Normalise a piece of text into a space-separated string of keywords.

    Steps, in order:
      1. drop inline ``$...$`` spans (LaTeX math) that sit on a single line,
      2. replace every character that is not an ASCII letter or ``-`` with a space,
      3. lower-case and split on whitespace,
      4. discard English stop words,
      5. lemmatize the remaining tokens with WordNet.

    Parameters
    ----------
    text : str
        Raw text (e.g. an article title plus abstract).

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    review = re.sub(r'\$.*?\$', '', text)
    review = re.sub('[^a-zA-Z-]', ' ', review)
    tokens = review.lower().split()

    # Build the stop-word set once per call; the previous version rebuilt it
    # for every single token inside the comprehension.
    stops = set(stopwords.words("english"))
    ps = WordNetLemmatizer()

    return ' '.join(ps.lemmatize(word) for word in tokens if word not in stops)

def cleaning(trainSet):
    """Build the cleaned corpus for a DataFrame of articles.

    For every row, the ``title`` and ``abstract`` columns are joined with a
    space and passed through ``cleanText``.

    Parameters
    ----------
    trainSet : pandas.DataFrame
        Must provide ``title`` and ``abstract`` columns.

    Returns
    -------
    list of str
        One cleaned string per row, in row order.
    """
    return [
        cleanText(row['title'] + " " + row['abstract'])
        for _, row in trainSet.iterrows()
    ]


In [None]:
# Cleaned "title + abstract" strings, one per article, for each split.
corpus = cleaning(trainSet=trainJson)
corpus_test = cleaning(trainSet=testJson)

Loads the previously saved corpora from their pickle files

In [None]:
# Replace the in-memory corpora with the pickled copies on disk.
# NOTE(review): nothing visible in this notebook writes these two files;
# presumably they were dumped from an earlier run of the cleaning cell.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open("corpus.pckl", mode='rb') as train_corpus_file:
    corpus = pickle.load(train_corpus_file)

with open("corpus_test.pckl", mode='rb') as test_corpus_file:
    corpus_test = pickle.load(test_corpus_file)

Groups the cleaned documents by category, then builds a TF-IDF vectorizer and document matrix from each category's corpus

In [None]:
# cc[k] collects the cleaned text of every training article tagged with
# categories[k]. Sized from `categories` rather than a hard-coded 40 so the
# two can never drift apart.
cc = [[] for _ in categories]

# code -> position lookup; avoids a linear categories.index() scan per tag.
category_position = {code: pos for pos, code in enumerate(categories)}

for ind, i in trainJson.iterrows():
    t = cleanText(i['title'] + " " + i['abstract'])
    for j in i['categories'].split():
        # Only "cs." tags are relevant. A "cs." tag missing from `categories`
        # is skipped instead of aborting the whole build with ValueError.
        if j[:3] == "cs." and j in category_position:
            cc[category_position[j]].append(t)

In [None]:
# xx[k]  : dense TF-IDF matrix of the documents in cc[k]
# cvv[k] : the TfidfVectorizer fitted on cc[k]
# Both start out as 40 empty placeholders and are filled in by position.
xx = [[] for _ in range(40)]
cvv = [[] for _ in range(40)]

for ind, category_docs in enumerate(cc):
    # Unigrams and bigrams, at most 40 features per category; terms present in
    # more than 80% of the documents or in fewer than 2 documents are dropped.
    cv = TfidfVectorizer(
        ngram_range=(1,2),
        max_df=0.8,
        max_features=40,
        stop_words='english',
        min_df=2,
    )
    X = cv.fit_transform(category_docs).toarray()
    xx[ind], cvv[ind] = X, cv

# Show the vocabulary each category ended up with.
for vectorizer in cvv:
    print(vectorizer.get_feature_names_out())

**Loads the saved TF-IDF matrices (`xx`) and fitted vectorizers (`cvv`) from pickle files**

In [None]:
import pickle
# Overwrite xx / cvv with the pickled copies on disk.
# NOTE(review): nothing visible in this notebook writes x_upd.pckl or
# cv_upd.pckl; presumably they hold matrices/vectorizers like the ones the
# TF-IDF cell produces - confirm before relying on their row order.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open("x_upd.pckl", mode='rb') as matrices_file:
    xx = pickle.load(matrices_file)
with open("cv_upd.pckl", mode='rb') as vectorizers_file:
    cvv = pickle.load(vectorizers_file)

Creates the `Y` dataset: the class labels (category indices) of each article

In [None]:

def createY(datas, category_list=None):
    """Build the label lists for a DataFrame of articles.

    Each row's ``categories`` string is split on whitespace; every token that
    starts with ``"cs."`` and is present in the category list is replaced by
    its position in that list.

    Parameters
    ----------
    datas : pandas.DataFrame
        Must provide a ``categories`` column of whitespace-separated codes.
    category_list : sequence of str, optional
        Category codes defining the label indices. Defaults to the
        module-level ``categories`` list.

    Returns
    -------
    list of list of int
        One list per row, in row order (independent of the DataFrame index).
        Rows without any known ``cs.`` code get an empty list.
    """
    if category_list is None:
        category_list = categories

    # First occurrence wins, matching list.index() semantics.
    position = {}
    for pos, code in enumerate(category_list):
        position.setdefault(code, pos)

    # Iterate positionally: the previous version indexed the result list with
    # the DataFrame index label, which breaks for any index other than 0..n-1.
    y = []
    for cats in datas['categories']:
        y.append([
            position[code]
            for code in cats.split()
            if code[:3] == "cs." and code in position
        ])
    return y



After Update

In [None]:
from sklearn.neighbors import NearestNeighbors
from math import log

# Sample passage describing artificial intelligence. No other code visible in
# this notebook reads this variable; presumably it is a trial input for
# kneigh() - TODO confirm.
text = """
Artificial intelligence (AI) is intelligence - perceiving, synthesizing, and infering information - demonstrated by machines, as opposed to intelligence displayed by animals and humans. Example tasks in which this is done include speech recognition, computer vision, translation between (natural) languages, as well as other mappings of inputs. OED (OUP) defines artificial intelligence as:[1]

the theory and development of computer systems able to perform tasks that normally require human intelligence, such as visual perception, speech recognition, decision-making, and translation between languages.

AI applications include advanced web search engines (e.g., Google), recommendation systems (used by YouTube, Amazon and Netflix),
"""

def tfCalculator(arr, cv):
    """Compute a normalised term frequency for every token in ``arr``.

    Each token's raw count is divided by ``1 + m``, where ``m`` is the number
    of entries returned by ``cv.get_feature_names_out()`` that occur in
    ``arr``.

    Parameters
    ----------
    arr : list of str
        Tokens of the text being scored.
    cv : object
        Anything exposing ``get_feature_names_out()`` (e.g. a fitted
        vectorizer).

    Returns
    -------
    dict
        token -> count / (1 + m). Empty when ``arr`` is empty.
    """
    # Raw occurrence counts per token.
    counts = {}
    for token in arr:
        counts[token] = counts.get(token, 0) + 1

    # Denominator starts at 1, so it is never zero.
    # NOTE(review): if the vocabulary holds multi-word features they can only
    # match when `arr` itself contains that exact string - confirm intent.
    matched_features = sum(
        1 for feature in cv.get_feature_names_out() if feature in counts
    )
    denominator = 1 + matched_features

    return {token: count / denominator for token, count in counts.items()}

def newTFIDFArray(review,cv, X):
    """Build a one-row TF-IDF vector for ``review`` in the feature space of ``cv``.

    For every token of ``review`` that is a feature of ``cv``:

    * tf  comes from ``tfCalculator``,
    * idf is ``log2(len(X) / (1 + df))`` where ``df`` is the number of rows of
      ``X`` with a non-zero value in that feature's column.

    Features that do not occur in ``review`` keep the placeholder value 10.

    Parameters
    ----------
    review : str
        Already-cleaned, space-separated text.
    cv : object
        Fitted vectorizer exposing ``get_feature_names_out()``.
    X : array-like, shape (n_documents, n_features)
        Document matrix whose columns follow the order of ``cv``'s features.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, n_features).

    Raises
    ------
    ValueError
        If ``X`` has no rows and at least one token matches a feature
        (logarithm of zero).
    """
    arr = review.split()

    freq = tfCalculator(arr, cv)
    fet_arr = cv.get_feature_names_out()

    # feature -> first column holding it (same column np.where(...)[0][0] picks).
    column_of = {}
    for column, feature in enumerate(fet_arr):
        column_of.setdefault(feature, column)

    # 10 is the placeholder for "feature absent from the text".
    new_TFIDF = [10]*len(fet_arr)

    documents = np.asarray(X)
    n_documents = len(X)

    # `freq` has one key per distinct token, so each feature is scored once
    # instead of once per repetition of the token.
    for word, tf in freq.items():
        index = column_of.get(word)
        if index is None:
            continue

        # Vectorised document-frequency count (was a Python loop over all rows).
        if n_documents:
            matching_documents = int(np.count_nonzero(documents[:, index]))
        else:
            matching_documents = 0

        idf = log(n_documents/(1 + matching_documents),2)
        new_TFIDF[index] = tf * idf

    return np.array([new_TFIDF])


def _category_row_positions(trainJson, category, n_docs):
    """Map rows of one category's document matrix to positions in trainJson.

    Returns a list where element ``p`` is the ``trainJson`` position of the
    ``p``-th row tagged with ``category``, or ``None`` when the matrix rows
    should be used as ``trainJson`` positions directly (the matrix has one row
    per ``trainJson`` row, or the category membership count does not match
    ``n_docs`` so no reliable mapping can be derived).
    """
    if n_docs == len(trainJson):
        return None
    positions = [
        pos for pos, cats in enumerate(trainJson['categories'])
        if category in cats.split()
    ]
    return positions if len(positions) == n_docs else None


def kneigh(text, n, xx, cvv, categories, categoriesDict,trainJson):
    """Predict the category of ``text`` and print related training articles.

    For every category the text is embedded with ``newTFIDFArray``, appended
    to that category's document matrix, and its nearest neighbours (up to 80,
    the query itself included) are looked up. The category with the smallest
    mean distance to those neighbours wins. Up to ``n`` of the winner's
    nearest articles are then printed; articles without a license are skipped.

    Parameters
    ----------
    text : str
        Raw text to classify.
    n : int
        Number of nearest neighbours to consider for printing.
    xx : sequence of array-like
        Per-category document matrices.
    cvv : sequence
        Per-category fitted vectorizers, aligned with ``xx``.
    categories : list of str
        Category codes, aligned with ``xx`` / ``cvv``.
    categoriesDict : dict
        Category code -> readable name.
    trainJson : pandas.DataFrame
        Training articles; reads ``categories``, ``license``, ``update_date``,
        ``title``, ``abstract`` and ``id``.

    Returns
    -------
    None
        Results are printed.

    Notes
    -----
    Neighbour indices are rows of the *category* matrix. They are translated
    back to ``trainJson`` positions assuming the matrix rows follow the order
    of the ``trainJson`` rows tagged with that category - TODO confirm this
    holds for the pickled matrices in use. When that cannot be verified the
    indices are used as ``trainJson`` positions directly.
    """
    max_neighbors = 80

    # Cleaning does not depend on the category: do it once, not once per loop.
    review = cleanText(text)

    best_mean = float('inf')
    best_category = None
    best_indices = None
    best_doc_count = 0

    for i in range(min(len(cvv), len(xx), len(categories))):
        cv = cvv[i]
        X = xx[i]
        if len(X) == 0:
            continue  # no documents to compare against

        query = newTFIDFArray(review, cv, X)

        # The query is appended as the last row, so it is its own nearest
        # neighbour (distance 0) and normally occupies result position 0.
        stacked = np.append(X, query, axis=0)
        k = min(max_neighbors, len(stacked))  # small categories have < 80 rows
        finder = NearestNeighbors(n_neighbors=k).fit(stacked)
        distances, indices = finder.kneighbors(query, k, return_distance=True)

        # NOTE(review): a nearest distance of exactly 10 disqualifies the
        # category; this looks tied to the placeholder value 10 used by
        # newTFIDFArray for absent features - kept as in the original.
        if distances[0][1] == 10:
            continue

        # Mean over the neighbours actually summed (the query at position 0 is
        # excluded). The divisor used to be a fixed 19 for a sum of 79 values.
        mean_distance = sum(distances[0][1:]) / (k - 1)

        if mean_distance < best_mean:
            best_mean = mean_distance
            best_category = i
            best_indices = indices
            best_doc_count = len(X)

    if best_category is None:
        print('\n'+"Could not determine a category for the text")
        return

    predCat = categoriesDict[categories[best_category]]
    print('\n'+"The text is about " + predCat)
    print("related articles abstracts:\n")

    row_positions = _category_row_positions(
        trainJson, categories[best_category], best_doc_count
    )

    # Ranks 1..n, limited to the neighbours that were actually retrieved.
    last_rank = min(n, len(best_indices[0]) - 1)
    for rank in range(1, last_rank + 1):
        neighbor = int(best_indices[0][rank])
        if neighbor == best_doc_count:
            # The appended query row itself (can drift from position 0 when a
            # stored document is at distance 0 too); it is not an article.
            continue

        position = neighbor if row_positions is None else row_positions[neighbor]
        article = trainJson.iloc[position]

        if article['license'] is None:
            continue

        year = article['update_date'].split('-')[0]
        if (int(year) < 2015):
            print("Warning, This article may be outdated ("+year+")\n")
        print(article['title'] +'\n'+ article['abstract'])
        print("   To view full article, use following link: " + "https://arxiv.org/pdf/"+article['id'])
        print('\n')
        


Whole Product

In [None]:

def main(texts):
    """Load the saved model and classify each text in ``texts``.

    Reads the pickled document matrices (``x_upd.pckl``), the pickled
    vectorizers (``cv_upd.pckl``) and ``trainDataset.json`` from the working
    directory, then calls ``kneigh`` for every text with 3 neighbours.

    Parameters
    ----------
    texts : iterable of str
        Texts to classify; they are numbered from 1 in the printed output.

    Returns
    -------
    None
        Everything is printed.
    """
    print("Imported libraries")

    # "<code>_<readable name>" per category; list order defines the category
    # index used by the pickled model.
    d = [
        "cs.AI_Artificial Intelligence",
        "cs.CL_Computation and Language",
        "cs.CC_Computational Complexity",
        "cs.CE_Computational Engineering, Finance, and Science",
        "cs.CG_Computational Geometry",
        "cs.GT_Computer Science and Game Theory",
        "cs.CV_Computer Vision and Pattern Recognition",
        "cs.CY_Computers and Society",
        "cs.CR_Cryptography and Security",
        "cs.DS_Data Structures and Algorithms",
        "cs.DB_Databases",
        "cs.DL_Digital Libraries",
        "cs.DM_Discrete Mathematics",
        "cs.DC_Distributed, Parallel, and Cluster Computing",
        "cs.ET_Emerging Technologies",
        "cs.FL_Formal Languages and Automata Theory",
        "cs.GL_General Literature",
        "cs.GR_Graphics",
        "cs.AR_Hardware Architecture",
        "cs.HC_Human-Computer Interaction",
        "cs.IR_Information Retrieval",
        "cs.IT_Information Theory",
        "cs.LO_Logic in Computer Science",
        "cs.LG_Machine Learning",
        "cs.MS_Mathematical Software",
        "cs.MA_Multiagent Systems",
        "cs.MM_Multimedia",
        "cs.NI_Networking and Internet Architecture",
        "cs.NE_Neural and Evolutionary Computing",
        "cs.NA_Numerical Analysis",
        "cs.OS_Operating Systems",
        "cs.OH_Other Computer Science",
        "cs.PF_Performance",
        "cs.PL_Programming Languages",
        "cs.RO_Robotics",
        "cs.SI_Social and Information Networks",
        "cs.SE_Software Engineering",
        "cs.SD_Sound",
        "cs.SC_Symbolic Computation",
        "cs.SY_Systems and Control",
    ]

    categoriesDict = {x.split('_')[0] : x.split('_')[1] for x in d}
    categories = [x.split('_')[0] for x in d]

    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    with open("x_upd.pckl",'rb') as f:
        xx = pickle.load(f)
    with open("cv_upd.pckl",'rb') as f:
        cvv = pickle.load(f)
    print('Almost done importing model...')

    # Pass the file object itself: handing read_json a literal JSON string is
    # deprecated in recent pandas releases.
    with open('trainDataset.json', 'r') as f:
        trainJson = pd.read_json(f)
    print("Imported model and started the program")

    for ind, text in enumerate(texts, start=1):
        print("Text " + str(ind) +":")
        kneigh(text, 3, xx, cvv, categories, categoriesDict,trainJson)