# KERNEL > RESTART & RUN ALL

In [1]:
#import libraries needed
import pandas as pd
import os
import re
from nltk.stem.porter import *
import math

In [2]:
#read the data set being used; the cells below read its "text" and "score" columns
data = pd.read_csv("Reviews_8000.csv")

#globals shared (and mutated in place) by the functions in the cells below
#unigramDict: token -> occurrence count, filled by getCountsinRow
unigramDict = {}
#bigramDict: bigram tuple -> occurrence count, filled by getCountsinRow
bigramDict = {}
#stemmer applied to every token before it is counted
stemmer = PorterStemmer()
#bigramVM: one dict per row (bigram tuple -> count, plus a "<label>" entry), filled by getBigramVM
bigramVM = []

In [3]:
def getNgrams(text, n):
    """Return the n-grams of a token list without modifying it.

    Args:
        text: list of tokens.
        n: size of each n-gram.

    Returns:
        For n == 1, a new list holding the tokens themselves.
        Otherwise a list of n-token lists taken as a sliding window over
        the tokens followed by one "<end>" marker. The result is empty
        when the padded sequence is shorter than n.
    """
    if n == 1:
        # Return a copy so callers never share (and accidentally mutate)
        # the same list object as the input.
        return list(text)

    # Pad a copy, not the caller's list: appending in place leaked the
    # "<end>" marker into every other user of that list and stacked an
    # extra marker on each repeated call.
    padded = list(text) + ["<end>"]
    return [padded[i:i + n] for i in range(len(padded) - (n - 1))]

def getCountsinRow(row):
    """Add one row's unigram and bigram counts to the global dictionaries.

    Args:
        row: row-like mapping whose "text" entry is the review string.

    Side effects:
        Increments unigramDict (token -> count) and bigramDict
        (bigram tuple -> count). Returns None.
    """
    # Raw string: "\w" inside a plain literal is an invalid escape sequence.
    words = re.findall(r"(\w+)", row["text"])
    tokens = [stemmer.stem(word).lower() for word in words]

    # Build the bigrams from a copy of the tokens so the unigram counts
    # below can never pick up the "<end>" padding used for bigrams.
    bigrams = getNgrams(list(tokens), 2)

    for token in tokens:
        unigramDict[token] = unigramDict.get(token, 0) + 1

    for pair in bigrams:
        # Lists are unhashable; use a tuple as the dictionary key.
        key = tuple(pair)
        bigramDict[key] = bigramDict.get(key, 0) + 1
                        
def getBigramVM(row):
    """Append one row's bigram-count vector to the global bigramVM list.

    Args:
        row: row-like mapping with a "text" string and a numeric "score".

    Side effects:
        Appends a dict that maps each bigram tuple found in the text to
        its count within this row, plus a "<label>" entry that is "good"
        when score >= 4 and "bad" otherwise. Returns None.
    """
    # Raw string: "\w" inside a plain literal is an invalid escape sequence.
    words = re.findall(r"(\w+)", row["text"])
    tokens = [stemmer.stem(word).lower() for word in words]

    rowVector = {}
    for pair in getNgrams(tokens, 2):
        # Lists are unhashable; use a tuple as the dictionary key.
        key = tuple(pair)
        rowVector[key] = rowVector.get(key, 0) + 1

    rowVector["<label>"] = "good" if row["score"] >= 4 else "bad"
    bigramVM.append(rowVector)
            

In [4]:
#run: apply each function to every row purely for its side effects on the globals
#both functions return None, which is why the cell output is an all-None Series
#NOTE(review): the globals are only ever added to, so re-running this cell without resetting them appears to double the counts - confirm before re-running
data.apply(getCountsinRow, axis = 1)
data.apply(getBigramVM, axis = 1)

0       None
1       None
2       None
3       None
4       None
5       None
6       None
7       None
8       None
9       None
10      None
11      None
12      None
13      None
14      None
15      None
16      None
17      None
18      None
19      None
20      None
21      None
22      None
23      None
24      None
25      None
26      None
27      None
28      None
29      None
        ... 
1970    None
1971    None
1972    None
1973    None
1974    None
1975    None
1976    None
1977    None
1978    None
1979    None
1980    None
1981    None
1982    None
1983    None
1984    None
1985    None
1986    None
1987    None
1988    None
1989    None
1990    None
1991    None
1992    None
1993    None
1994    None
1995    None
1996    None
1997    None
1998    None
1999    None
Length: 2000, dtype: object

In [None]:
#turn the list of per-row dicts into a frame: one row per review, one column per dict key (bigram tuples plus "<label>")
DF_BIGRAM_VM = pd.DataFrame(bigramVM)

In [None]:
#replace missing entries (bigrams that a given review does not contain) with a count of 0, then show the frame
DF_BIGRAM_VM = DF_BIGRAM_VM.fillna(0)
print(DF_BIGRAM_VM)

In [None]:
#NOTE(review): startDebug is not defined in any cell shown here, so under "Restart & Run All" this presumably raises NameError - define it or drop this cell
DF_BIGRAM_VM.apply(startDebug, axis = 1)

In [None]:
#debug helper: walk the columns of a zero-row slice of the bigram frame
DFCopy = DF_BIGRAM_VM.head(0)
#NOTE(review): the value returned by fillna is not assigned, so DFCopy itself is left unchanged by this line
DFCopy.fillna(0)
for column in DFCopy:
    #NOTE(review): this adds a string to a whole column (a Series), not to a scalar - confirm it works for the numeric columns and prints what was intended
    print(str(column) + ":" + DFCopy[column])

In [5]:
#we have fast ways to build the term frequencies, now we need a fast way to get the TFIDF
#invertedIndex is filled by makeInvertedIndex; each entry has the shape
#    bigram tuple -> [count, {row index: 1}]
#the TF-IDF cell uses the size of the inner dict as the number of rows containing the bigram
invertedIndex = {}
'''
invertedIndex
the => {index : 1}
'''

def makeInvertedIndex(row):
    """Record in the global invertedIndex which rows contain each bigram.

    Each entry has the form [document_count, {row index: 1}], where
    document_count is the number of distinct rows that contain the bigram
    (a bigram repeated inside one row is counted once).

    Args:
        row: pandas row; row.name is used as the document id and the
            "text" entry is the review string.

    Side effects:
        Updates invertedIndex in place. Returns None.
    """
    # Raw string: "\w" inside a plain literal is an invalid escape sequence.
    words = re.findall(r"(\w+)", row["text"])
    tokens = [stemmer.stem(word).lower() for word in words]

    for pair in getNgrams(tokens, 2):
        key = tuple(pair)
        # Start the count at 0: the first posting below brings it to 1.
        # (Starting at 1 and then incrementing counted the first row twice.)
        entry = invertedIndex.setdefault(key, [0, {}])
        postings = entry[1]

        # Test membership in the postings dict, not in the [count, dict]
        # list: the latter compared the row index against the count itself,
        # so repeats were double counted and a row whose index happened to
        # equal the count was never recorded.
        if row.name not in postings:
            entry[0] += 1
            postings[row.name] = 1
    

In [8]:
BigramTFIDFVM = []
def getBigramTFIDFModel(row, totalDocs=6400):
    """Append one row's bigram TF*IDF vector to the global BigramTFIDFVM list.

    Requires bigramDict and invertedIndex to be populated already for every
    bigram in this row; a missing bigram raises KeyError.

    Args:
        row: row-like mapping with a "text" string and a numeric "score".
        totalDocs: number of documents used as the IDF numerator. Defaults
            to 6400, the value that used to be hard-coded. Pass the real
            corpus size when it differs, e.g.
            data.apply(getBigramTFIDFModel, axis=1, totalDocs=len(data)).

    Side effects:
        Appends a dict mapping each bigram tuple in the text to its weight,
        plus a "<label>" entry that is "good" when score >= 4 and "bad"
        otherwise. Returns None.
    """
    # Raw string: "\w" inside a plain literal is an invalid escape sequence.
    words = re.findall(r"(\w+)", row["text"])
    tokens = [stemmer.stem(word).lower() for word in words]

    # Count each bigram within this row.
    counts = {}
    for pair in getNgrams(tokens, 2):
        key = tuple(pair)
        counts[key] = counts.get(key, 0) + 1

    rowVector = {}
    for key, count in counts.items():
        # NOTE(review): TF here is the corpus-wide bigram count multiplied
        # by the in-row count, which is not a conventional term frequency.
        # Kept as-is to preserve the existing model - confirm it is intended.
        TF = bigramDict[key] * count

        # Number of rows containing the bigram; fall back to 1 so an empty
        # postings dict cannot cause a division by zero.
        docCount = len(invertedIndex[key][1]) or 1
        IDF = math.log2(totalDocs / docCount)

        rowVector[key] = TF * IDF

    rowVector["<label>"] = "good" if row["score"] >= 4 else "bad"
    BigramTFIDFVM.append(rowVector)
        
        
    
    

In [9]:
#build the inverted index first: getBigramTFIDFModel looks every bigram up in invertedIndex (and in bigramDict from the counting cell)
#both calls are run for their side effects only; each returns None per row, hence the all-None output
#NOTE(review): the IDF uses 6400 as the document total while the output below reports 2000 rows - confirm which corpus size is intended
data.apply(makeInvertedIndex, axis = 1)
data.apply(getBigramTFIDFModel, axis = 1)

0       None
1       None
2       None
3       None
4       None
5       None
6       None
7       None
8       None
9       None
10      None
11      None
12      None
13      None
14      None
15      None
16      None
17      None
18      None
19      None
20      None
21      None
22      None
23      None
24      None
25      None
26      None
27      None
28      None
29      None
        ... 
1970    None
1971    None
1972    None
1973    None
1974    None
1975    None
1976    None
1977    None
1978    None
1979    None
1980    None
1981    None
1982    None
1983    None
1984    None
1985    None
1986    None
1987    None
1988    None
1989    None
1990    None
1991    None
1992    None
1993    None
1994    None
1995    None
1996    None
1997    None
1998    None
1999    None
Length: 2000, dtype: object

In [11]:
#turn the list of per-row dicts into a frame: one row per review, one column per dict key (bigram tuples plus "<label>")
DF_bigramTFIDF_VM = pd.DataFrame(BigramTFIDFVM)

In [12]:
#write the model to disk; unlike DF_BIGRAM_VM no fillna is applied first, so bigrams missing from a review stay as missing values in the file
DF_bigramTFIDF_VM.to_csv("BigramTFIDF_Vector_Model.csv")