In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# This notebook covers essential feature extraction techniques used in Natural Language Processing (NLP).
টেক্সট ডেটাকে মেশিন লার্নিং মডেলের জন্য উপযোগী সংখ্যাসূচক আকারে রূপান্তর করার জন্য ফিচার এক্সট্রাকশন খুবই গুরুত্বপূর্ণ

## Bag of Words (BOW) | Feature Extraction | NPL

টেক্সটকে একটি "ব্যাগ" হিসেবে ধরা হয় যেখানে শব্দগুলোর ক্রম বা ব্যাকরণ কোনো গুরুত্ব পায় না, শুধু শব্দগুলো কতবার এসেছে সেটা গোনা হয়।

In [2]:
text_data = ['I am interested in NLP','This is a good tutorial with good topic', 'Feature extraction is very important topic']
text_data

['I am interested in NLP',
 'This is a good tutorial with good topic',
 'Feature extraction is very important topic']

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(stop_words='english')

In [4]:
# fit the data
bow.fit(text_data)

In [5]:
# get the vocabulary list
bow.get_feature_names_out()

array(['extraction', 'feature', 'good', 'important', 'interested', 'nlp',
       'topic', 'tutorial'], dtype=object)

In [6]:
bow_features = bow.transform(text_data)
print(bow_features)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9 stored elements and shape (3, 8)>
  Coords	Values
  (0, 4)	1
  (0, 5)	1
  (1, 2)	2
  (1, 6)	1
  (1, 7)	1
  (2, 0)	1
  (2, 1)	1
  (2, 3)	1
  (2, 6)	1


In [7]:
bow_feature_array = bow_features.toarray()
bow_feature_array

array([[0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 2, 0, 0, 0, 1, 1],
       [1, 1, 0, 1, 0, 0, 1, 0]])

In [9]:
print(bow.get_feature_names_out())

print("\n\n")
for sentence, feature in zip(text_data, bow_feature_array):
    print(sentence)
    print(feature)

['extraction' 'feature' 'good' 'important' 'interested' 'nlp' 'topic'
 'tutorial']



I am interested in NLP
[0 0 0 0 1 1 0 0]
This is a good tutorial with good topic
[0 0 2 0 0 0 1 1]
Feature extraction is very important topic
[1 1 0 1 0 0 1 0]


# TF-IDF (Term Frequency/Inverse Document Frequency)

TF-IDF একটি জনপ্রিয় টেক্সট ফিচার এক্সট্রাকশন পদ্ধতি, যা বোঝায় একটি শব্দ নির্দিষ্ট ডকুমেন্টে কতবার এসেছে (TF) এবং সেই শব্দটি পুরো ডেটাসেটে কতটা বিরল বা সাধারণ (IDF)। এটি Bag of Words-এর তুলনায় উন্নত, কারণ এটি সাধারণ বা কম গুরুত্বপূর্ণ শব্দের মান কমিয়ে দেয় এবং গুরুত্বপূর্ণ শব্দকে হাইলাইট করে।


In [12]:
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']
text_data

['I am interested in NLP',
 'This is a good tutorial with good topic',
 'Feature extraction is very important topic']

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

In [14]:
# fit the data
tfidf.fit(text_data)

In [15]:
# get the vocabulary list
tfidf.vocabulary_

{'interested': 4,
 'nlp': 5,
 'good': 2,
 'tutorial': 7,
 'topic': 6,
 'feature': 1,
 'extraction': 0,
 'important': 3}

In [16]:
tfidf_features = tfidf.transform(text_data)
tfidf_features

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9 stored elements and shape (3, 8)>

In [17]:
tfidf_feature_array = tfidf_features.toarray()
tfidf_feature_array

array([[0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.70710678, 0.        , 0.        ],
       [0.        , 0.        , 0.84678897, 0.        , 0.        ,
        0.        , 0.32200242, 0.42339448],
       [0.52863461, 0.52863461, 0.        , 0.52863461, 0.        ,
        0.        , 0.40204024, 0.        ]])

In [18]:
for sentence, feature in zip(text_data, tfidf_features):
    print(sentence)
    print(feature)

I am interested in NLP
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2 stored elements and shape (1, 8)>
  Coords	Values
  (0, 5)	0.7071067811865476
  (0, 4)	0.7071067811865476
This is a good tutorial with good topic
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3 stored elements and shape (1, 8)>
  Coords	Values
  (0, 7)	0.42339448341195934
  (0, 6)	0.3220024178194947
  (0, 2)	0.8467889668239187
Feature extraction is very important topic
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4 stored elements and shape (1, 8)>
  Coords	Values
  (0, 6)	0.4020402441612698
  (0, 3)	0.5286346066596935
  (0, 1)	0.5286346066596935
  (0, 0)	0.5286346066596935


# Word2vec

Word2Vec হলো একটি শব্দ এম্বেডিং টেকনিক, যা প্রতিটি শব্দকে একটি ঘন ও অর্থবোধক ভেক্টরে রূপান্তর করে। এটি শব্দের অর্থগত মিল/সম্পর্ক শেখে এবং কাজ করে নিউরাল নেটওয়ার্ক ব্যবহার করে। TF-IDF বা BOW যেখানে শুধু ফ্রিকোয়েন্সি দেখে, সেখানে Word2Vec শব্দের আসল অর্থ ও প্রসঙ্গ ধরে।

TF-IDF শুধু গুনফল (ওজন) দেয়, অথচ Word2Vec শব্দের "অর্থ" শেখে।
এর ফলে শব্দ যেমন: king - man + woman ≈ queen — এমন সম্পর্ক শেখা সম্ভব হয়।

In [19]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [20]:
# text data
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [22]:
# initialize and fit the data
model = Word2Vec(common_texts, vector_size=100, min_count=1)

In [23]:
model.wv['graph']

array([-8.6196875e-03,  3.6657380e-03,  5.1898835e-03,  5.7419385e-03,
        7.4669183e-03, -6.1676754e-03,  1.1056137e-03,  6.0472824e-03,
       -2.8400505e-03, -6.1735227e-03, -4.1022300e-04, -8.3689485e-03,
       -5.6000124e-03,  7.1045388e-03,  3.3525396e-03,  7.2256695e-03,
        6.8002474e-03,  7.5307419e-03, -3.7891543e-03, -5.6180597e-04,
        2.3483764e-03, -4.5190323e-03,  8.3887316e-03, -9.8581640e-03,
        6.7646410e-03,  2.9144168e-03, -4.9328315e-03,  4.3981876e-03,
       -1.7395747e-03,  6.7113843e-03,  9.9648498e-03, -4.3624435e-03,
       -5.9933780e-04, -5.6956373e-03,  3.8508223e-03,  2.7866268e-03,
        6.8910765e-03,  6.1010956e-03,  9.5384968e-03,  9.2734173e-03,
        7.8980681e-03, -6.9895042e-03, -9.1558648e-03, -3.5575271e-04,
       -3.0998408e-03,  7.8943167e-03,  5.9385742e-03, -1.5456629e-03,
        1.5109634e-03,  1.7900408e-03,  7.8175711e-03, -9.5101865e-03,
       -2.0553112e-04,  3.4691966e-03, -9.3897223e-04,  8.3817719e-03,
      

In [25]:
model.wv.most_similar('graph')

[('user', 0.06793874502182007),
 ('survey', 0.03364056348800659),
 ('eps', 0.009391174651682377),
 ('human', 0.008315952494740486),
 ('minors', 0.0045030140317976475),
 ('system', -0.010839183814823627),
 ('trees', -0.023671666160225868),
 ('computer', -0.09575340896844864),
 ('time', -0.11410722136497498),
 ('response', -0.11557212471961975)]

# Word Embedding using Glove