In [20]:
# Import necessary libraries
import pandas as pd
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
import multiprocessing
from time import time  # To time our operations
from gensim.models.phrases import Phrases, Phraser
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
raw_numpy = pd.read_csv("numpy.csv")
raw_pandas = pd.read_csv("pandas.csv")
raw_csharp = pd.read_csv("csharp.csv")
raw_matlab = pd.read_csv("matlab.csv")
raw_matplotlib = pd.read_csv("matplotlib.csv")
raw_mongodb = pd.read_csv("mongodb.csv")
raw_tensorflow = pd.read_csv("tensorflow.csv")
raw_typescript = pd.read_csv("typescript.csv")

In [3]:
raw_numpy["Tag"]="numpy"
raw_pandas["Tag"]="pandas"
raw_csharp["Tag"]="csharp"
raw_matlab["Tag"]="matlab"
raw_matplotlib["Tag"]="matplotlib"
raw_mongodb["Tag"]="mongodb"
raw_tensorflow["Tag"]="tensorflow"
raw_typescript["Tag"]="typescript"

In [5]:
df = pd.concat([raw_numpy, raw_pandas, raw_csharp, raw_matlab, raw_matplotlib, raw_mongodb, raw_tensorflow, raw_typescript], axis=0)
df = df.reset_index(drop=True)
df.shape

(360718, 2)

In [6]:
def print_plot(index):
    example = df[df.index == index][['Questions', 'Tag']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Tag:', example[1])
print_plot(10)
print_plot(300000)

Sum rows of 2D array with elements of 1D array
Tag: numpy
Running automatic annotation in cvat with tensorflow results in status code 400 "No labels found for tf annotation"
Tag: tensorflow


In [7]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
    
df['Questions'] = df['Questions'].apply(clean_text)
print_plot(10)
print_plot(300000)



sum rows 2d array elements 1d array
Tag: numpy
running automatic annotation cvat tensorflow results status code 400 labels found tf annotation
Tag: tensorflow


In [9]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [11]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [13]:
sent = [row.split() for row in df['Questions']]
sent

[['numpy',
  'savez',
  'list',
  '3d',
  'arrays',
  'different',
  'shapes',
  'fails',
  'stacking',
  'broadcasting',
  'shouldnt',
  'simply',
  'zipping',
  'separate',
  'npy',
  'files'],
 ['dataframe', 'object', 'attribute', 'close', 'pandas', 'fix', 'problem'],
 ['look', 'multiple', 'values', 'one', 'column', 'python', 'duplicate'],
 ['fastest',
  'data',
  'types',
  'use',
  'alternative',
  'c++',
  'vectors',
  'python'],
 ['rgb2hed', 'giving', 'answer', 'applied', 'full', 'part', 'matrix'],
 ['reduce', 'time', 'complexity', 'dfapply'],
 ['replace', 'double', 'loop', 'numpy'],
 ['load', 'txt', 'numpy', '2d', 'array', 'dictionaries', 'objects'],
 ['using', 'to_pickle', 'store', 'large', 'dataframes'],
 ['iam',
  'studyng',
  'data',
  'science',
  'dont',
  'know',
  'make',
  'labels',
  'x',
  'dont',
  'stay',
  'together',
  'duplicate'],
 ['sum', 'rows', '2d', 'array', 'elements', '1d', 'array'],
 ['matplotlib', 'get', 'precise', 'point', 'intersection'],
 ['find', 'p

In [14]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [15]:
bigram = Phraser(phrases)

In [17]:
sentences = bigram[sent]
sentences

<gensim.interfaces.TransformedCorpus at 0x2b3c51b7be0>

In [18]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.04 mins


In [21]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 18:38:44: Word2Vec lifecycle event {'msg': 'training model with 15 workers on 7249 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2023-04-15T18:38:44.359674', 'gensim': '4.3.1', 'python': '3.8.0 (default, Nov  6 2019, 16:00:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
INFO - 18:38:45: EPOCH 0 - PROGRESS: at 19.18% examples, 181676 words/s, in_qsize 27, out_qsize 1
INFO - 18:38:46: EPOCH 0 - PROGRESS: at 49.38% examples, 243470 words/s, in_qsize 20, out_qsize 9
INFO - 18:38:47: EPOCH 0 - PROGRESS: at 83.77% examples, 276060 words/s, in_qsize 29, out_qsize 0
INFO - 18:38:47: EPOCH 0: training on 2246767 raw words (1008288 effective words) took 3.1s, 323792 effective words/s
INFO - 18:38:48: EPOCH 1 - PROGRESS: at 26.86% examples, 256663 words/s, in_qsize 13, out_qsize 9
INFO - 18:38:49: EPOCH 1 - PROGRESS: at 58.28% examples, 280122 words/s, in_qsize 29, out_qsize 0
I

Time to train the model: 1.64 mins


In [23]:
w2v_model.wv.most_similar(positive=["c#"])

[('net', 0.6174273490905762),
 ('net_6', 0.4837472140789032),
 ('net_core', 0.4776083827018738),
 ('aspnet_core', 0.40938764810562134),
 ('c#net', 0.406419962644577),
 ('application', 0.4008021056652069),
 ('aspnet', 0.40076929330825806),
 ('database', 0.3891719877719879),
 ('net_framework', 0.374855637550354),
 ('visual_studio', 0.3744681179523468)]

In [24]:
# Split your dataset into training and testing sets
train_sentences, test_sentences = train_test_split(sentences, test_size=0.2, random_state=42)


In [42]:
train_sentences_2 = train_sentences.apply(lambda x: x.join(' '))

AttributeError: 'list' object has no attribute 'apply'

In [29]:
train_X = [w2v_model.wv[key] for sentence in train_sentences for key in sentence if key in w2v_model.wv.key_to_index]
train_y = df.loc[df['Questions'].isin(train_sentences), 'Tag']
test_X = [w2v_model.wv[key] for sentence in test_sentences for key in sentence if key in w2v_model.wv.key_to_index]
test_y = df.loc[df['Questions'].isin(test_sentences), 'Tag']

In [27]:
train_X

[array([-8.05303514e-01, -1.32531488e+00, -1.60888001e-01,  2.05488253e+00,
        -4.87715244e-01,  6.53871834e-01,  8.63425910e-01, -2.81674117e-01,
         3.35726708e-01, -5.74096620e-01,  9.58508775e-02,  3.32639247e-01,
         7.59899080e-01, -9.88468468e-01, -3.01471561e-01,  6.65525377e-01,
        -5.13846159e-01,  2.43683159e-01,  3.54380280e-01, -6.03960156e-01,
         8.02381992e-01, -9.02945846e-02,  1.08268189e+00,  1.09888422e+00,
        -4.97945011e-01,  2.79424220e-01, -6.85381234e-01, -4.87583637e-01,
         9.15395245e-02,  9.52757537e-01, -3.43196243e-01,  5.07465959e-01,
        -4.11146879e-01, -4.06403989e-01,  5.45375824e-01,  2.96000719e-01,
         5.84677637e-01, -5.98665476e-01,  5.62460423e-01,  4.33628947e-01,
        -3.31736803e-01,  2.21002609e-01, -6.53031945e-01,  1.90772727e-01,
        -5.36564052e-01,  2.88818866e-01, -5.61120957e-02, -8.35755467e-02,
         4.00397042e-03, -8.64823997e-01,  2.02923492e-01,  2.52763927e-01,
         1.0

In [30]:
# Create a logistic regression model
clf = LogisticRegression(max_iter=1000)

In [39]:
train_sentences

[['reduce', 'memory', 'consumption', 'splitting', 'pandas_dataframe'],
 ['realtime', 'streaming', 'data', 'python', 'matlab'],
 ['multitask', 'classification', 'softmax', 'function'],
 ['asci',
  'characters',
  'change',
  'matlab',
  'switching',
  'windows',
  'macintosh',
  'linux'],
 ['integrals', 'functions', 'matplotlib'],
 ['remove', 'duplicate', 'values', 'tensor', 'tensorflow'],
 ['matlab', 'matrix', 'element_wise', 'multiplication', 'optimization'],
 ['getting',
  'serialization',
  'deserialization',
  'systemtype',
  'instances',
  'supported',
  'path',
  'entitymapperfortype'],
 ['matplotlib', 'legend', 'showing', 'correctly'],
 ['selecting', 'loss', 'metrics', 'tensorflow', 'model'],
 ['tensorflow',
  'library',
  'wasnt',
  'compiled',
  'use',
  'sse41',
  'sse42',
  'avx',
  'avx2',
  'instructions',
  'speed',
  'cpu',
  'computations'],
 ['overflow', 'container', 'taking', 'ages', 'render', 'list', 'scrolling'],
 ['matplotlib', 'yaxis', 'scale', 'match', 'data'],
 

In [40]:
df.head()

Unnamed: 0,Questions,Tag
0,numpy savez list 3d arrays different shapes fa...,numpy
1,dataframe object attribute close pandas fix pr...,numpy
2,look multiple values one column python duplicate,numpy
3,fastest data types use alternative c++ vectors...,numpy
4,rgb2hed giving answer applied full part matrix,numpy
