In [None]:
%pip install wordcloud
%pip install seaborn
%pip install ipywidgets
%pip install nltk

## ETL

In [None]:
#Extraction
projects = pd.read_csv('projects.csv')
tags = pd.read_csv('tags.csv')
projects.head()

In [None]:
tags.head()

In [None]:
#Transform
df = pd.merge(tags,projects,on='id')
df.head()

In [None]:
df.tag.isnull()

In [None]:
df = df.loc[df.tag.notnull(),:]

In [None]:
#Save the transformed data
df.to_csv('labeled_projects.csv',index=False)

## EDA

In [None]:
from collections import Counter
import ipywidgets as widgets
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from wordcloud import WordCloud, STOPWORDS
sns.set_theme()
warnings.filterwarnings("ignore")
import re

In [2]:
df = pd.read_csv('labeled_projects.csv')

In [3]:
df.head()

Unnamed: 0,id,tag,created_on,title,description
0,6,computer-vision,2020-02-20 06:43:18,Comparison between YOLO and RCNN on real world...,Bringing theory to experiment is cool. We can ...
1,7,computer-vision,2020-02-20 06:47:21,"Show, Infer & Tell: Contextual Inference for C...",The beauty of the work lies in the way it arch...
2,9,graph-learning,2020-02-24 16:24:45,Awesome Graph Classification,"A collection of important graph embedding, cla..."
3,15,reinforcement-learning,2020-02-28 23:55:26,Awesome Monte Carlo Tree Search,A curated list of Monte Carlo tree search pape...
4,19,graph-learning,2020-03-03 13:54:31,Diffusion to Vector,Reference implementation of Diffusion2Vec (Com...


In [None]:
#Tags distribution
df.tag.hist()

In [None]:
#Wordcloud of description feature
def get_wc(df,tag):
    tag_titles = df.loc[df.tag==tag,'title']
    cloud = WordCloud(stopwords=STOPWORDS,collocations=False,width=500, height=300).generate(' '.join(tag_titles.to_list()))
    plt.imshow(cloud)


In [None]:
get_wc(df,'natural-language-processing')

In [None]:
get_wc(df,'mlops')

In [None]:
get_wc(df,'time-series')

In [None]:
df.tag.unique()

## Preprocessing

In [4]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [20]:
df = pd.read_csv('labeled_projects.csv')

In [21]:
df.head()

Unnamed: 0,id,tag,created_on,title,description
0,6,computer-vision,2020-02-20 06:43:18,Comparison between YOLO and RCNN on real world...,Bringing theory to experiment is cool. We can ...
1,7,computer-vision,2020-02-20 06:47:21,"Show, Infer & Tell: Contextual Inference for C...",The beauty of the work lies in the way it arch...
2,9,graph-learning,2020-02-24 16:24:45,Awesome Graph Classification,"A collection of important graph embedding, cla..."
3,15,reinforcement-learning,2020-02-28 23:55:26,Awesome Monte Carlo Tree Search,A curated list of Monte Carlo tree search pape...
4,19,graph-learning,2020-03-03 13:54:31,Diffusion to Vector,Reference implementation of Diffusion2Vec (Com...


In [22]:
df['text'] = df.title +' ' + df.description
df.text 

0      Comparison between YOLO and RCNN on real world...
1      Show, Infer & Tell: Contextual Inference for C...
2      Awesome Graph Classification A collection of i...
3      Awesome Monte Carlo Tree Search A curated list...
4      Diffusion to Vector Reference implementation o...
                             ...                        
950    A Survey of the State of Explainable AI for NL...
951    Topic Modeling with BERT Leveraging 🤗 Transfor...
952    OpenMMLab Computer Vision MMCV is a python lib...
953    Machine Learning Methods Explained (+ Examples...
954    Rasoee A powerful web and mobile application t...
Name: text, Length: 955, dtype: object

In [23]:
# Clean the text
def clean_text(text,stopwords):

    text = text.lower()
    
    #Remove non-alphanumeric
    text = re.sub(r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text )  
    text = re.sub("[^A-Za-z0-9]+", " ", text) 

    #Remove links
    text = re.sub('http\S+',"",text)
    
    #tokenize
    text = text.split()

    #Remove  stopwords 
    new_text = []
    for word in text:   
        if word not in stopwords:
            new_text.append(word)
    text = new_text 
    
    # Stem
    stemmer = PorterStemmer()
    text = stemmer.stem(' '.join(text))
    
    return text

In [24]:
# 2. lower all the text tokens
df['text'] = df['text'].apply(clean_text,args=(STOPWORDS,))

In [25]:
df.head()

Unnamed: 0,id,tag,created_on,title,description,text
0,6,computer-vision,2020-02-20 06:43:18,Comparison between YOLO and RCNN on real world...,Bringing theory to experiment is cool. We can ...,comparison yolo rcnn real world videos bringin...
1,7,computer-vision,2020-02-20 06:47:21,"Show, Infer & Tell: Contextual Inference for C...",The beauty of the work lies in the way it arch...,show infer tell contextual inference creative ...
2,9,graph-learning,2020-02-24 16:24:45,Awesome Graph Classification,"A collection of important graph embedding, cla...",awesome graph classification collection import...
3,15,reinforcement-learning,2020-02-28 23:55:26,Awesome Monte Carlo Tree Search,A curated list of Monte Carlo tree search pape...,awesome monte carlo tree search curated list m...
4,19,graph-learning,2020-03-03 13:54:31,Diffusion to Vector,Reference implementation of Diffusion2Vec (Com...,diffusion vector reference implementation diff...


In [35]:
# Remove other irrelevant tags
def remove_oos_labels(df,label,default,freq):
    oos_labels = []
    for tag,tag_freq in Counter(df[label]).most_common():
        if tag_freq<freq:
            oos_labels.append(tag)
    df.loc[df[label].isin(oos_labels), label] = default
    return df



In [38]:
# df = remove_oos_labels(df,'tag','other',75)
Counter(df.tag)

Counter({'computer-vision': 356,
         'other': 132,
         'natural-language-processing': 388,
         'mlops': 79})

In [39]:
#Label Ecnoding
from sklearn.preprocessing import LabelEncoder


In [40]:
le = LabelEncoder()
df['tag'] = le.fit_transform(df['tag'])

In [41]:
df.head()

Unnamed: 0,id,tag,created_on,title,description,text
0,6,0,2020-02-20 06:43:18,Comparison between YOLO and RCNN on real world...,Bringing theory to experiment is cool. We can ...,comparison yolo rcnn real world videos bringin...
1,7,0,2020-02-20 06:47:21,"Show, Infer & Tell: Contextual Inference for C...",The beauty of the work lies in the way it arch...,show infer tell contextual inference creative ...
2,9,3,2020-02-24 16:24:45,Awesome Graph Classification,"A collection of important graph embedding, cla...",awesome graph classification collection import...
3,15,3,2020-02-28 23:55:26,Awesome Monte Carlo Tree Search,A curated list of Monte Carlo tree search pape...,awesome monte carlo tree search curated list m...
4,19,3,2020-03-03 13:54:31,Diffusion to Vector,Reference implementation of Diffusion2Vec (Com...,diffusion vector reference implementation diff...


In [42]:
le.classes_

array(['computer-vision', 'mlops', 'natural-language-processing', 'other'],
      dtype=object)

In [44]:
le.transform(['computer-vision', 'mlops',
       'natural-language-processing', 'other'])

array([0, 1, 2, 3])

In [45]:
df.head()

Unnamed: 0,id,tag,created_on,title,description,text
0,6,0,2020-02-20 06:43:18,Comparison between YOLO and RCNN on real world...,Bringing theory to experiment is cool. We can ...,comparison yolo rcnn real world videos bringin...
1,7,0,2020-02-20 06:47:21,"Show, Infer & Tell: Contextual Inference for C...",The beauty of the work lies in the way it arch...,show infer tell contextual inference creative ...
2,9,3,2020-02-24 16:24:45,Awesome Graph Classification,"A collection of important graph embedding, cla...",awesome graph classification collection import...
3,15,3,2020-02-28 23:55:26,Awesome Monte Carlo Tree Search,A curated list of Monte Carlo tree search pape...,awesome monte carlo tree search curated list m...
4,19,3,2020-03-03 13:54:31,Diffusion to Vector,Reference implementation of Diffusion2Vec (Com...,diffusion vector reference implementation diff...


In [47]:
df.drop(['created_on','title','description','id'],inplace=True,axis=1)

In [48]:
df.head()

Unnamed: 0,tag,text
0,0,comparison yolo rcnn real world videos bringin...
1,0,show infer tell contextual inference creative ...
2,3,awesome graph classification collection import...
3,3,awesome monte carlo tree search curated list m...
4,3,diffusion vector reference implementation diff...


In [49]:
df.to_csv('prep_labelled_projects.csv',index=False)


## Splitting

In [3]:
# Split using test_train split of sklearn
from sklearn.model_selection import train_test_split
import pandas as pd


In [4]:
df = pd.read_csv('prep_labelled_projects.csv')
df.head()

Unnamed: 0,tag,text
0,0,comparison yolo rcnn real world videos bringin...
1,0,show infer tell contextual inference creative ...
2,3,awesome graph classification collection import...
3,3,awesome monte carlo tree search curated list m...
4,3,diffusion vector reference implementation diff...


In [None]:
X_train,X_,y_train,y_ = train_test_split(df.text.to_numpy(),df.tag.to_numpy(),test_size=0.3,stratify=df.tag)
X_train

In [6]:
X_val,X_test,y_val,y_test = train_test_split(X_,y_,test_size=0.5,stratify=y_)

In [7]:
print("X_train, X_val, X_test: ",len(X_train),len(X_val), len(X_test))
print("y_train, y_val, y_test: ",len(y_train),len(y_val), len(y_test))


X_train, X_val, X_test:  668 143 144
y_train, y_val, y_test:  668 143 144


## Augmentation