# CountVec from scratch

Based on [this tutorial](https://medium.com/@saivenkat_/implementing-countvectorizer-from-scratch-in-python-exclusive-d6d8063ace22)

## Part 1: Get the unique words and their indices

In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the dataset (path is relative to this notebook's directory) into a DataFrame.
# NOTE(review): the variable name `csv` matches the stdlib `csv` module's name. That is
# harmless in the cells shown, which never import that module, but it would shadow it
# if one ever did.
csv = pd.read_csv('../datasetall.csv')

# Echo the DataFrame so the notebook renders a preview of it
csv

Unnamed: 0,text,label,Unnamed: 2
0,Binay: Patuloy ang kahirapan dahil sa maling p...,0,
1,SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...,0,
2,wait so ur telling me Let Leni Lead mo pero NY...,1,
3,[USERNAME]wish this is just a nightmare that ...,0,
4,doc willie ong and isko sabunutan po,0,
...,...,...,...
28456,"Bisaya, Probinsyano/a, mostly Bisaya = katulong",1,
28457,Amnesia. In my whole life wala pa ako nakasala...,1,
28458,Kontrabida na ilang beses na tinalo at obvious...,1,
28459,Yung antagonist laging kailangang sobrang sama...,1,


In [3]:
# Keep only the 'text' column; the vocabulary and counting cells below
# iterate over this to get one raw text sample per row.
INPUT_TEXT = csv['text']

# Echo the column so the notebook renders a preview of it
INPUT_TEXT

0        Binay: Patuloy ang kahirapan dahil sa maling p...
1        SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...
2        wait so ur telling me Let Leni Lead mo pero NY...
3         [USERNAME]wish this is just a nightmare that ...
4                     doc willie ong and isko sabunutan po
                               ...                        
28456      Bisaya, Probinsyano/a, mostly Bisaya = katulong
28457    Amnesia. In my whole life wala pa ako nakasala...
28458    Kontrabida na ilang beses na tinalo at obvious...
28459    Yung antagonist laging kailangang sobrang sama...
28460    May nabaril or nasaksak na pero 'di pa tatawag...
Name: text, Length: 28461, dtype: object

In [4]:
# Vocabulary: every distinct token that appears anywhere in the corpus
UNIQUE_WORDS = set()

# Tokens come from splitting on a single space character only, so
# differences in case, attached punctuation, embedded newlines and the
# empty string (produced by consecutive spaces) all end up as separate
# vocabulary entries. That is a lot of noise.
# TODO: clean the text before splitting (e.g. lowercase it and strip
# punctuation) to shrink the vocabulary.
for text in INPUT_TEXT:
  UNIQUE_WORDS.update(text.split(' '))

In [5]:
UNIQUE_WORDS

{'',
 'labag',
 'leniLaban',
 'flashmob',
 'precinct,',
 'witticism.',
 'Practice',
 'disasters',
 'COLOR',
 'haha..trending',
 'umuusok',
 'Roxas??',
 'MAYBumoto',
 'ka!Wala',
 'sisimulan',
 'Noooooooo...',
 'bbm,',
 'supportersnotomarcosduterte',
 'Michelin',
 'hindImahanap',
 'jejemar',
 'dug',
 '"bobo',
 'SKIPS',
 'BEGGING',
 'Tama.',
 'kk',
 'Kandidato(2016)',
 'stvpid',
 'donations,',
 'nirerelate',
 'MayThe',
 'SASABHN',
 'Dagdag/bawas',
 'Kapikon',
 'magdobidobidapdap',
 'uuyuy',
 'binabantayan',
 'Tough',
 'Blazing',
 'BALA',
 'mamemeke',
 'time?',
 'magpakailanman',
 'Aboard',
 'MATULOG',
 'plausible',
 'avibe',
 'RIN.',
 'Truuuuuuuuue',
 'criticizes',
 'weather!BaguioChel',
 'Charet.',
 'IPAGLABA',
 'direct',
 'Duterte.Logic',
 'fastest',
 'JoemsxczEXST',
 'LeniREPORMAPara',
 'Vice...',
 'tanim-bala.',
 'hadg',
 'Lean',
 'Edrian',
 'P11B.',
 'Sr.',
 'Remind',
 'Empower',
 'KakampinkLet',
 'geisha',
 'Pambansang',
 'adspremature',
 'thinner',
 'atlest',
 'ROXASang',
 'Libel',

In [6]:
len(UNIQUE_WORDS)

76871

In [7]:
# Build the two lookup tables over the sorted vocabulary:
#   INDEX_TO_WORD maps a position in the sorted vocabulary to its token
#   WORD_TO_INDEX maps a token back to that position
# Sorting first makes the position of each token reproducible for a
# given vocabulary (set iteration order is not).
INDEX_TO_WORD = dict(enumerate(sorted(UNIQUE_WORDS)))
WORD_TO_INDEX = {token: position for position, token in INDEX_TO_WORD.items()}

In [8]:
WORD_TO_INDEX

{'': 0,
 '\n\nCNN': 1,
 '\n\nDuterte': 2,
 '\n\nKita': 3,
 '\n\nMAGLABAS': 4,
 '\n\nUlul\nDouble\nStandard': 5,
 '\n\ntangina': 6,
 '\n\nthis': 7,
 '\n\ntw': 8,
 '\n[USERNAME]\n': 9,
 '!': 10,
 '!!': 11,
 '!!!': 12,
 '!!!!': 13,
 '!!!!!': 14,
 '!!!!!!': 15,
 '!!!!!!!!': 16,
 '!!!!!!!!!!!': 17,
 '!!!!Iboboto': 18,
 '!!!!Ipanalo': 19,
 '!!!Leni': 20,
 '!!!Thank': 21,
 '!!![USERNAME]': 22,
 '!!!made': 23,
 '!!!think': 24,
 '!!...': 25,
 '!!EDSA': 26,
 '!!Gagawa': 27,
 '!!InshaAllah': 28,
 '!!Leni': 29,
 '!!ROBREDO': 30,
 '!!Thank': 31,
 '!!anak': 32,
 '!"': 33,
 '!.': 34,
 '!?': 35,
 '!????': 36,
 '!Araw': 37,
 '!BBM': 38,
 '!DI': 39,
 '!Ipanlo': 40,
 '!Kulay': 41,
 '!Layout': 42,
 '!Leni': 43,
 '!Let': 44,
 '!Neri': 45,
 '!Non': 46,
 '!PRAY': 47,
 '!Pampanga': 48,
 '!Panaloang': 49,
 '!Pero': 50,
 '!Youth': 51,
 '![USERNAME]': 52,
 '!artph': 53,
 '!kakampink': 54,
 '!love': 55,
 '!sko': 56,
 '!via[USERNAME]': 57,
 '"': 58,
 '""Andy': 59,
 '""Tito""': 60,
 '"%': 61,
 '"(Photo': 62,
 '"-':

In [9]:
INDEX_TO_WORD

{0: '',
 1: '\n\nCNN',
 2: '\n\nDuterte',
 3: '\n\nKita',
 4: '\n\nMAGLABAS',
 5: '\n\nUlul\nDouble\nStandard',
 6: '\n\ntangina',
 7: '\n\nthis',
 8: '\n\ntw',
 9: '\n[USERNAME]\n',
 10: '!',
 11: '!!',
 12: '!!!',
 13: '!!!!',
 14: '!!!!!',
 15: '!!!!!!',
 16: '!!!!!!!!',
 17: '!!!!!!!!!!!',
 18: '!!!!Iboboto',
 19: '!!!!Ipanalo',
 20: '!!!Leni',
 21: '!!!Thank',
 22: '!!![USERNAME]',
 23: '!!!made',
 24: '!!!think',
 25: '!!...',
 26: '!!EDSA',
 27: '!!Gagawa',
 28: '!!InshaAllah',
 29: '!!Leni',
 30: '!!ROBREDO',
 31: '!!Thank',
 32: '!!anak',
 33: '!"',
 34: '!.',
 35: '!?',
 36: '!????',
 37: '!Araw',
 38: '!BBM',
 39: '!DI',
 40: '!Ipanlo',
 41: '!Kulay',
 42: '!Layout',
 43: '!Leni',
 44: '!Let',
 45: '!Neri',
 46: '!Non',
 47: '!PRAY',
 48: '!Pampanga',
 49: '!Panaloang',
 50: '!Pero',
 51: '!Youth',
 52: '![USERNAME]',
 53: '!artph',
 54: '!kakampink',
 55: '!love',
 56: '!sko',
 57: '!via[USERNAME]',
 58: '"',
 59: '""Andy',
 60: '""Tito""',
 61: '"%',
 62: '"(Photo',
 63: '

## Part 2: Getting the count of each word

In [10]:
from collections import Counter
from scipy.sparse import csr_matrix

# Coordinate-format triplets describing the non-zero entries of the
# count matrix: for every position k, the entry at
# (row[k], column[k]) holds value[k].
#   row    -> index of the sample within INPUT_TEXT
#   column -> vocabulary index of the token (from WORD_TO_INDEX)
#   value  -> how many times that token occurs in that sample
row = []
column = []
value = []

for index, sample in enumerate(INPUT_TEXT):

  # Count how often each token occurs in this sample. The split must
  # match the one used to build the vocabulary (single space).
  word_count = Counter(sample.split(' '))

  for word, count in word_count.items():
    # Vocabulary index of this token; None when the token is unknown
    vocab_index = WORD_TO_INDEX.get(word)

    # Skip tokens that are not in the vocabulary. This has to be an
    # explicit None check: comparing None with `>= 0` raises TypeError
    # in Python 3 instead of evaluating to False.
    if vocab_index is None:
      continue

    row.append(index)
    column.append(vocab_index)
    value.append(count)

In [11]:
# Assemble the sparse count matrix from the (value, (row, column))
# triplets: one row per text sample, one column per vocabulary entry.
matrix_shape = (len(INPUT_TEXT), len(WORD_TO_INDEX))
countvec_matrix = csr_matrix((value, (row, column)), shape=matrix_shape)

In [12]:
print(countvec_matrix)

  (0, 6129)	1
  (0, 24525)	1
  (0, 32980)	1
  (0, 34851)	1
  (0, 40615)	1
  (0, 50068)	1
  (0, 55286)	1
  (0, 63077)	1
  (0, 67989)	1
  (1, 2718)	1
  (1, 3289)	1
  (1, 4116)	1
  (1, 4803)	1
  (1, 6864)	1
  (1, 11461)	1
  (1, 15898)	1
  (1, 16170)	1
  (1, 16284)	1
  (1, 16733)	1
  (1, 17026)	2
  (1, 17145)	2
  (1, 17357)	2
  (1, 21895)	1
  (1, 24360)	1
  (1, 26940)	1
  :	:
  (28459, 67767)	1
  (28459, 67989)	2
  (28459, 68311)	1
  (28459, 70078)	1
  (28459, 70087)	1
  (28459, 71979)	1
  (28459, 73023)	1
  (28459, 76759)	1
  (28460, 990)	1
  (28460, 19825)	1
  (28460, 28926)	1
  (28460, 34667)	1
  (28460, 47781)	1
  (28460, 50874)	1
  (28460, 52382)	1
  (28460, 57484)	1
  (28460, 57610)	1
  (28460, 57788)	1
  (28460, 59871)	1
  (28460, 60350)	1
  (28460, 61899)	1
  (28460, 62161)	1
  (28460, 64168)	1
  (28460, 71826)	1
  (28460, 72071)	1
