# LAB 3 - Search Terms Revisited

In [1]:
import pandas as pd
import re
import numpy as np

## Opening the .CSV file

In [2]:
"""
Opening the .csv file
"""
different_files = ['searchTerms.csv', '10xFileReal.csv', '100xFile.csv']
search_terms = []
with open(different_files[0], encoding='utf-8') as csv_file:
    csv_file.readline()
    for line in csv_file:
        
        line_contents = line.split(',')
        
        search_terms.append(line_contents[0])
    print(f'Printing first 100 search terms to see what it looks like:\n {search_terms[0:100]}')
    
    

Printing first 100 search terms to see what it looks like:
 ['36969', 'CMED 500100', 'KEND 5750', 'CMED 980228', 'DYNC1815H', 'DYND70642', 'DEES KC-21400', 'LINK PC1000', '7081714', 'KEND 8507SA', 'KEND 8881-892910', 'bacon', 'pineapple', '5065265', 'enfit70550', 'cheese cheddar', '68010', '55507', '8116055', '2366607', 'buttermilk', '3009697', '4185775', '1953358', '2157315', '4782694', '6653558', '7062615', 'milk', 'chicken breast', 'DRIT 0028', '4944450', 'romain', 'banana', '16sl', 'URO51211CH', 'HUDS 00640', 'bacon', '7024755', '6056105', '6928832', 'name%20tags', '3029404', 'cut fruit', 'HUDS 003-40', 'milk', '4828182', '1448950', 'biscuit', '101460', '269110', '4549099', '5 way', '260119', '314828', '888340', '241571', 'milk', '101420', '272180', '596060', '4019139', 'GERI HCS4485', 'bacon', 'mash', 'liquid egg', '3602786', '1009711', 'wipes', 'DYND11756H', 'bacon', '4252104', '874302', '4066353', 'creamer', '3778925', '2105850', '3602976', '4908299', 'HUDS 1885', 'lettuce', 'ap

## Replacing bad tokens and fixing them using regex

In [3]:
def clean_token(token):
    """
    Replace '%20' with a " "
    Param: dirty token with '%20'
    Return: Clean token without "%20"
    """
    token = token.replace('%20', ' ')
    return token

In [4]:
bad = '%20'
for i in range(len(search_terms)):
    search_terms[i] = search_terms[i].lower()
    if bad in search_terms[i]:
        search_terms[i] = clean_token(search_terms[i])

## Removing spaces from the list

In [5]:
def remove_spaces(search_terms):
    """
    The new and improved function that doesn't care about memory,
    because it is cool
    """
    space = ' '
    new_search_terms = []
    for i in range(len(search_terms)):
        if space in search_terms[i]:
            terms = search_terms[i].split(' ')
            for term in terms:
                new_search_terms.append(term)
        else:
            new_search_terms.append(search_terms[i])
    return new_search_terms

In [6]:
search_terms = remove_spaces(search_terms)

## Making a dataframe with the semi cleaned list

In [7]:
search_terms_df = pd.DataFrame(search_terms, columns=['SEARCH_TERMS'])

In [8]:
search_terms_df.head()

Unnamed: 0,SEARCH_TERMS
0,36969
1,cmed
2,500100
3,kend
4,5750


## Removing number and punctuation functions to be used by pandas apply method

In [9]:
def remove_numbers(token):
    """
    param: dirty token
    return: clean token without numbers
    """
    token = re.sub(r'[0-9]+', '', token)
    
    return token

In [10]:
def remove_punct(token):
    """
    param: dirty token
    return: clean token with no punctuation
    """
    token = re.sub(r'[^\w\s]', '', token)
    
    return token

## Applying the previous methods to the dataframe creating new columns for each

In [11]:
search_terms_df['NO_NUMS'] = search_terms_df['SEARCH_TERMS'].apply(remove_numbers)


In [12]:
search_terms_df['NO_PUNCT'] = search_terms_df['NO_NUMS'].apply(remove_punct)

In [13]:
search_terms_df.head(10)

Unnamed: 0,SEARCH_TERMS,NO_NUMS,NO_PUNCT
0,36969,,
1,cmed,cmed,cmed
2,500100,,
3,kend,kend,kend
4,5750,,
5,cmed,cmed,cmed
6,980228,,
7,dync1815h,dynch,dynch
8,dynd70642,dynd,dynd
9,dees,dees,dees


## Making the spellcheck dictionary

In [14]:
%%time
from spellchecker import SpellChecker
import pattern.en
def spell_check_dictionary(NO_PUNCT):  
    """
    Making the spell check dictionary
    Param: a dictionary to start with misspelled keys
    return spell check dictionary  
    key(misspelled word) 
    value(correct word)
    """
    spell_check_dict = {}
    spell = SpellChecker(language='en', distance=1)
    #Distance if it is more than one than it doesn't finish in under 10 minutes
    #I get bored and cancel it
    # find those words that may be misspelled
    misspelled = spell.unknown(NO_PUNCT)
    print(len(misspelled))

    for word in misspelled:
        if word is not spell.correction(word):
            spell_check_dict[word] = spell.correction(word)
    
    
    
    return spell_check_dict


CPU times: user 2.87 s, sys: 652 ms, total: 3.52 s
Wall time: 3.57 s


In [15]:
clean_list = search_terms_df['NO_PUNCT'].tolist()
spell_check_dict = spell_check_dictionary(clean_list)


7014


## Making a spellchecker function for the pandas dataframe

In [16]:

def spell_checker(token):
    if token in spell_check_dict:
        return spell_check_dict[token]
    else:
        return token

In [17]:
search_terms_df["Spell_checked"] = search_terms_df['NO_PUNCT'].apply(spell_checker)
search_terms_df.head(30)

Unnamed: 0,SEARCH_TERMS,NO_NUMS,NO_PUNCT,Spell_checked
0,36969,,,a
1,cmed,cmed,cmed,med
2,500100,,,a
3,kend,kend,kend,end
4,5750,,,a
5,cmed,cmed,cmed,med
6,980228,,,a
7,dync1815h,dynch,dynch,lynch
8,dynd70642,dynd,dynd,dyed
9,dees,dees,dees,dees


## Spell checking the list now (yes, it takes three methods)

In [18]:
%time
def frequency_dict(search_terms):
    """
    Parameter: list of searchterms
    Return: Dictionary frequency number of words
    Return key: words
    Return values: number of times in list
    """
    seen = set()
    frequency_search_terms = {}
    for i in range(len(search_terms)):
        if search_terms[i] in seen:
            frequency_search_terms[search_terms[i]] += 1
        else:
            frequency_search_terms[search_terms[i]] = 1
            seen.add(search_terms[i])
    
    return frequency_search_terms

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 7.15 µs


In [19]:
def comparing_spellcheck_dict(sorted_dict):
    """
    Fixing the dictionary and attributing wrong spelling values 
    to correct ones
    param: sorted dictionary
    return: correct dictionary without misspellings
    """
    correct_token = ''
    bad_keys = []
    for key in sorted_dict:
        if key in spell_check_dict:
            
            correct_token = spell_check_dict[key]
            value = sorted_dict[key]
            if correct_token in sorted_dict:
                sorted_dict[correct_token] += value
                bad_keys.append(key)
            
                
    
    for key in bad_keys:
        sorted_dict.pop(key)
    return sorted_dict


In [20]:
def sorting_the_dict(frequency_dict):
    """
    Sorts dictionary by values from high to low
    Param: Dictionary with number values
    Return: Sorted Dictionary values from high to low
    """
    sorted_dict = {}
    marklist = sorted(frequency_dict.items(), key=lambda x:x[1], reverse=True)
    sort_dict = dict(marklist)
    return sort_dict


### Removing empty strings from the list

In [21]:
no_space_list = []
for string in clean_list:
    if string != '':
        no_space_list.append(string)        

# Benchmarking value_counts vs. Frequency Dict creator

### Seeing how long it takes to create a frequency count with a list

In [22]:
%time        
clean_dict = frequency_dict(no_space_list)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.1 µs


### Seeing how long it takes the value_counts method to run

In [23]:
%time
search_terms_df['Spell_checked'].value_counts()

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 2.86 µs


a            166162
chicken       19228
cream         16053
cheese        14009
beef          13564
              ...  
internet          1
swingline         1
calmazine         1
ws                1
ctqx              1
Name: Spell_checked, Length: 10565, dtype: int64

## Results
### As one can see the run times between the two methods is basically the same. My method is just as efficient as the pandas method. My frequency count method took 11 microsecond to run and the pandas library took 11.2 microseconds to run. In the grand scheme of things this is basically the same and this little difference could be attributed to many outside factors.

In [24]:
spellchecked_dict = comparing_spellcheck_dict(clean_dict)

spellchecked_and_cleaned_dict = sorting_the_dict(spellchecked_dict)

In [25]:
import sys
spellchecked_and_cleaned_list = spellchecked_and_cleaned_dict.keys()
total_memory_usage = 0
for word in search_terms:
    total_memory_usage += sys.getsizeof(word)
print(f'Total memory usage is : {total_memory_usage}')
print(f'Memory usage of list: {sys.getsizeof(spellchecked_and_cleaned_list)}')


Total memory usage is : 80588785
Memory usage of list: 48


In [26]:
print(f'memory usage of pandas dataframe: {search_terms_df.memory_usage(deep=True)}')
series = search_terms_df['Spell_checked']
total_memory_other_way = sum([sys.getsizeof(s) for s in series]) + series.memory_usage()
print(f'memory usage of pandas dataframe (Spell Checked) using sys.getsizeof: {total_memory_other_way}')

memory usage of pandas dataframe: Index                 128
SEARCH_TERMS     92340225
NO_NUMS          91021346
NO_PUNCT         91002679
Spell_checked    91156167
dtype: int64
memory usage of pandas dataframe (Spell Checked) using sys.getsizeof: 91156295


## Memory Usage
### As one can see the memory usage of the list is amore efficient than the memory usage of the pandas dataframe. Since memory is quite cheap and doesn't really matter to much nowadays this is a tradeoff that is very much acceptable. 
## Pandas dataframe memory usage for Spell_checked column: 91156167 bytes (86.9333 megabytes)
## List memory usage for spell_checked_list: 80588785  bytes (76.8555 megabytes)

## Pandas dataframe uses 1.13 times more memory.

In [27]:
91156167 / 80588785

1.1311272033695507

# Advantages and disadvantages with pandas:

Performance-wise pandas does a good job and can be as fast as lists and built in methods in python depending on what methods you use and how creative you get. Memory wise, pandas takes up more memory and is less efficient in this case, but knowing that memory is cheap and accessible this doesn't seem to be a big issue. Programming methods that use the apply method are very easy to use. It is very nice that pandas loops through the entire column for you without the programmer having to write extra code for that. If you can apply the method to one token than you are basically done and dopn't have to worry about it anymore. When doing the same process without pandas looping through lists and making sure you have the right indexes etc. can become cumbersome. 