## **Magic Functions**

In [None]:
# Notebook setup magics: (re)load the autoreload extension, ask it to reload
# modules before each execution, and render matplotlib figures inline.
# NOTE(review): none of the visible cells import matplotlib or a local module —
# confirm these magics are still needed.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## **Necessary Imports**

In [None]:
import os
import re
import pandas as pd

## **Drive Mount**

In [None]:
# Mount Google Drive at /content/drive so the project folder is reachable.
# NOTE: the `google.colab` import means this cell is intended for Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Initializing Folder**

In [None]:
# Make the project folder on Drive the working directory; the relative paths
# used later ('data/scraped_data', 'data') are resolved against it.
%cd /content/drive/MyDrive/keyword_categorization/

/content/drive/MyDrive/keyword_categorization


In [None]:
# Folder (relative to the working directory) that holds the scraped CSV files.
scraped_data_path = 'data/scraped_data'
# os.listdir returns entries in arbitrary order, so sort them: the order in
# which the files are concatenated (and therefore the row order of `df`) is
# then the same on every run and every filesystem.
files = sorted(os.listdir(scraped_data_path))
# Keep only the CSV files; anything else in the folder is ignored.
csv_files = [file for file in files if file.endswith(".csv")]
print(f"Aavailable Files: {csv_files}")

Aavailable Files: ['paper_details_ieee_access.csv', 'paper_details_more.csv', 'paper_details.csv']


In [None]:
# Empty placeholder frame; `df` is filled with the scraped CSV contents in the
# following cell.
df = pd.DataFrame()

In [None]:
# Read every scraped CSV once and concatenate them in a single call.
# Building the list first avoids re-copying the growing frame on every
# iteration, and rebinding `df` from scratch makes the cell safe to re-run
# (re-running no longer appends the same files a second time).
scraped_frames = [pd.read_csv(f"{scraped_data_path}/{file}") for file in csv_files]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# CSV file was found.
df = pd.concat(scraped_frames, ignore_index=True) if scraped_frames else pd.DataFrame()

In [None]:
# Optional checkpoint: write the merged frame to disk.
# NOTE: `data_path` is only assigned much later in this notebook (in the cell
# that writes papers_final_data.csv) — define it first before uncommenting.
# df.to_csv(f"{data_path}/all_papers_details.csv", index=False)

In [None]:
# Optional checkpoint: reload the merged frame instead of re-reading every CSV.
# NOTE: `data_path` is only assigned much later in this notebook (in the cell
# that writes papers_final_data.csv) — define it first before uncommenting.
# df = pd.read_csv(f"{data_path}/all_papers_details.csv")

In [None]:
df.shape

(41964, 3)

In [None]:
df.head()

Unnamed: 0,abstracts,ieee_keywords,author_keywords
0,The global bandwidth shortage facing wireless ...,[],[]
1,Motivated by the recent explosion of interest ...,"['Distributed processing', 'Internet of things...","['blockchain', 'distributed systems', 'Interne..."
2,At the dawn of the fourth industrial revolutio...,"['Conferences', 'Machine learning', 'Market re...","['Explainable artificial intelligence', 'inter..."
3,The Internet of Things (IoT) makes smart objec...,"['Internet of things', 'Medical services', 'Ne...","['Internet of Things', 'Health Care', 'Service..."
4,"In the near future, i.e., beyond 4G, some of t...","['5G mobile communication', 'Cloud computing',...","['5G', 'Cloud', 'D2D', 'Massive MIMO', 'mm-wav..."


In [None]:
df.columns

Index(['abstracts', 'ieee_keywords', 'author_keywords'], dtype='object')

In [None]:
df.isna().sum()

abstracts          1129
ieee_keywords         0
author_keywords       0
dtype: int64

In [None]:
# Drop every row that has a missing value in any column, then renumber the
# index 0..n-1 so that row positions and index labels coincide for the
# index-based drops performed later.
df = df.dropna().reset_index(drop=True)

In [None]:
df.isna().sum()

abstracts          0
ieee_keywords      0
author_keywords    0
dtype: int64

In [None]:
# Map each distinct abstract text to the number of rows it appears in.
available_abstracts = df['abstracts'].value_counts().to_dict()

In [None]:
# Abstract texts that occur on more than one row. Judging by the recorded
# preview these are boilerplate notices rather than real abstracts, so every
# copy is removed further down.
inoappropriate_abstracts = [
  abstract_text
  for abstract_text, occurrences in available_abstracts.items()
  if occurrences != 1
]

In [None]:
# Show how many abstract texts are repeated plus a short preview; dumping the
# whole list floods the notebook with long strings.
len(inoappropriate_abstracts), inoappropriate_abstracts[:5]

(126,
 ['Not applicable: This submission does not include human or animal research.',
  'Employers desirous of obtaining the services of Electrical Engineers, Electrical Tradesmen, and Men or Learners for electrical work, may specify their requirements by means of advertisements in this column, and',
  'Members of the Institute visiting Cape Town are cordially invited to attend general meetings of the Cape Western Local Centre which are held in the Demonstration Theatre, Electricity House, Strand Street, Cape Town, on the second Thursday of each month.',
  'IEEE Plagiarism Policy',
  'Deviations of frequencies and time signals are with respect to the weighted mean of six quartz oscillators, and time signals from WWV over 20-day intervals. A positive frequency deviation indicates that the frequency was high. A time deviation of 010 indicates that the time pulses were 0.010 second late on WWV while 990 indicates that they were 0.010 second early. A value of 46.3 milliseconds has been ado

In [None]:
# Index labels of every row whose abstract is one of the repeated texts.
# `isin` does the membership test in one vectorised pass instead of scanning
# the list once per row, and `df.index[...]` yields labels, which is what the
# subsequent `df.drop(indices_to_drop)` expects.
# `.tolist()` keeps this a plain list because a later cell calls `.clear()` on it.
is_repeated_abstract = df['abstracts'].isin(inoappropriate_abstracts)
indices_to_drop = df.index[is_repeated_abstract].tolist()

In [None]:
# Remove the rows listed in `indices_to_drop` (interpreted by `drop` as index
# labels) and renumber the index so labels and positions stay aligned.
df = df.drop(indices_to_drop).reset_index(drop=True)

In [None]:
df.shape

(40457, 3)

In [None]:
# Flag rows whose IEEE keyword field is empty or contains a blank entry.
# The list is rebuilt from scratch, so this cell no longer depends on
# `indices_to_drop` already existing from an earlier cell.
indices_to_drop = []

for row_label, raw_keywords in df['ieee_keywords'].items():
  # The field is the text of a Python list, e.g. "['A', 'B']": strip the
  # brackets, split on commas, then remove the quotes around each entry.
  pieces = str(raw_keywords).replace("[", "").replace("]", "").split(",")
  cleaned = [piece.replace(" '", "").replace("'", "").lower() for piece in pieces]
  # "[]" yields a single empty string, so rows without IEEE keywords land here.
  if "" in cleaned:
    indices_to_drop.append(row_label)

In [None]:
# Remove the rows flagged for blank IEEE keywords and renumber the index so
# labels and positions stay aligned for the loops that follow.
df = df.drop(indices_to_drop).reset_index(drop=True)

In [None]:
df.shape

(39896, 3)

In [None]:
def _split_keywords(raw):
  """Turn the text of a keyword list (e.g. "['A', 'B']") into lower-cased strings.

  Values that are already lists are returned unchanged, so the cell can be
  re-run safely.

  NOTE(review): this is plain text splitting — a keyword that itself contains
  a comma or an apostrophe will be split or altered. Confirm the scraped data
  has no such keywords.
  """
  if isinstance(raw, list):
    return raw
  pieces = raw.replace("[", "").replace("]", "").split(",")
  return [piece.replace(" '", "").replace("'", "").lower() for piece in pieces]


# Assign whole columns instead of writing through `df.iloc[i][col] = ...`:
# that chained form writes to a temporary row object and is silently lost
# whenever pandas hands back a copy (always the case under Copy-on-Write).
for keyword_column in ('ieee_keywords', 'author_keywords'):
  df[keyword_column] = df[keyword_column].map(_split_keywords)

In [None]:
# One merged keyword list per paper: the IEEE keywords first, followed by the
# author keywords that are not already among the IEEE ones.
keywords = []

for ieee_words, author_words in zip(df['ieee_keywords'], df['author_keywords']):
  # Copy first so the list stored in df['ieee_keywords'] is not modified in place.
  merged = list(ieee_words)
  merged.extend(word for word in author_words if word not in ieee_words)
  keywords.append(merged)

In [None]:
# Attach the merged keyword lists as a new column (one list per row, in the
# same row order in which `keywords` was built).
df['draft_keywords'] = keywords

In [None]:
# The two source keyword columns have been merged into `draft_keywords`, so
# drop them. Dropping columns does not change the row index, so the trailing
# reset_index only re-creates the 0..n-1 index that is already in place.
df = df.drop(columns=['ieee_keywords', 'author_keywords']).reset_index(drop=True)

In [None]:
df.shape

(39896, 2)

In [None]:
df.head()

Unnamed: 0,abstracts,draft_keywords
0,Motivated by the recent explosion of interest ...,"[distributed processing, internet of things, c..."
1,At the dawn of the fourth industrial revolutio...,"[conferences, machine learning, market researc..."
2,The Internet of Things (IoT) makes smart objec...,"[internet of things, medical services, network..."
3,"In the near future, i.e., beyond 4G, some of t...","[5g mobile communication, cloud computing, mim..."
4,The future of mobile communications looks exci...,"[wireless networks, 5g mobile communication, s..."


In [None]:
# Count the occurrences of each keyword across all papers.
available_keywords = {}

keywords_list = df['draft_keywords'].to_list()

for paper_keywords in keywords_list:
  for keyword in paper_keywords:
    # A keyword not seen before starts from 0.
    available_keywords[keyword] = available_keywords.get(keyword, 0) + 1

In [None]:
# Intended minimum share used when selecting frequent keywords.
# NOTE(review): this value is not read by the selection cell below, which
# hard-codes 0.004 instead — decide which value is intended and use one constant.
threshold = 0.003

In [None]:
# Minimum number of occurrences a keyword needs in order to be kept. It is
# computed once, as a fraction of the number of *distinct* keywords.
# NOTE(review): 0.004 is hard-coded here while `threshold` (0.003) is defined
# in the previous cell and never used — confirm which value is intended.
min_occurrences = int(len(available_keywords) * 0.004)

# Keep the frequent keywords, skipping the empty string.
chosen_keywords = [
  keyword
  for keyword, occurrences in available_keywords.items()
  if keyword != "" and occurrences >= min_occurrences
]

In [None]:
chosen_keywords

['internet of things',
 'privacy',
 'blockchain',
 'machine learning',
 'prediction algorithms',
 'biological system modeling',
 'security',
 '5g mobile communication',
 'cloud computing',
 'wireless communication',
 'wireless sensor networks',
 'antenna arrays',
 'bandwidth',
 'imaging',
 'remote sensing',
 'computational modeling',
 'neural networks',
 'task analysis',
 'predictive models',
 'deep learning',
 'training',
 'testing',
 'support vector machines',
 'databases',
 'artificial intelligence',
 'logic gates',
 'servers',
 'computer architecture',
 'feature extraction',
 'optimization',
 'data mining',
 'real-time systems',
 'convergence',
 'convolution',
 'convolutional neural networks',
 'optical fiber communication',
 'radio frequency',
 'modulation',
 'algorithm design and analysis',
 'data models',
 'monitoring',
 'time series analysis',
 'convolutional neural network',
 'classification algorithms',
 'object detection',
 'detectors',
 'strain',
 'standards',
 'solid model

In [None]:
# For every paper keep only the draft keywords that were selected as frequent,
# preserving their original order.
# A set makes each membership test O(1) instead of scanning the whole
# `chosen_keywords` list for every keyword of every paper.
chosen_lookup = set(chosen_keywords)

final_keywords = [
  [word for word in paper_keywords if word in chosen_lookup]
  for paper_keywords in df['draft_keywords']
]

In [None]:
# Attach the filtered keyword lists as a new column (one list per row).
df['final_keywords'] = final_keywords

In [None]:
# Index labels of the papers left without any selected keyword.
# The list is rebuilt from scratch, so no earlier `indices_to_drop` is needed,
# and the length check runs over the whole column at once instead of fetching
# each row through `iloc`.
has_no_final_keywords = df['final_keywords'].map(len) == 0
indices_to_drop = df.index[has_no_final_keywords].tolist()

In [None]:
# Remove the papers that ended up with no selected keyword and renumber the index.
df = df.drop(indices_to_drop).reset_index(drop=True)

In [None]:
# `draft_keywords` was only an intermediate column; drop it. Dropping a column
# does not change the row index, so the trailing reset_index only re-creates
# the 0..n-1 index that is already in place.
df = df.drop(columns=['draft_keywords']).reset_index(drop=True)

In [None]:
df.head()

Unnamed: 0,abstracts,final_keywords
0,Motivated by the recent explosion of interest ...,"[internet of things, privacy, blockchain]"
1,At the dawn of the fourth industrial revolutio...,"[machine learning, prediction algorithms, biol..."
2,The Internet of Things (IoT) makes smart objec...,"[internet of things, biological system modelin..."
3,"In the near future, i.e., beyond 4G, some of t...","[5g mobile communication, cloud computing]"
4,The future of mobile communications looks exci...,[5g mobile communication]


In [None]:
df.shape

(36398, 2)

In [None]:
# Persist the cleaned table (abstracts + final_keywords) under the data folder.
# NOTE: `final_keywords` holds Python lists, so the CSV stores each one as its
# text representation; whoever reads the file must parse it back into a list.
data_path = "data"
df.to_csv(f"{data_path}/papers_final_data.csv", index=False)