In [1]:
from nltk import word_tokenize

def get_targets(post, brand_dict):
    
    ''' Tokenize a single post and keep only the tokens that are a model or a brand in the brand dictionary (returned lowercased) '''
    
    # Matching tokens are collected in the order they occur in the post
    targets = []
    
    for token in word_tokenize(post):
        
        # Matching is case-insensitive, so compare (and keep) the lowercased token
        lowered = token.lower()
        
        # Keys of brand_dict are models, values are brands
        if lowered in brand_dict or lowered in brand_dict.values():
            targets.append(lowered)
    
    return targets



def models_to_brands(targets, brand_dict):
    
    ''' Map every model in targets to its brand; brand names pass through unchanged and any other word is dropped '''
    
    # A word that is a key of brand_dict (a model) is swapped for its brand, a word that
    # only appears among the values (a brand) is kept as-is; order and duplicates are preserved
    return [
        brand_dict[word] if word in brand_dict else word
        for word in targets
        if word in brand_dict or word in brand_dict.values()
    ]



def get_unique_brands(brand_list):
    
    ''' Return a new list holding each brand of brand_list once, in order of first appearance '''
    
    # Brands already seen, in the order they were first encountered
    seen_in_order = []
    
    for candidate in brand_list:
        
        # Skip anything that has already been recorded
        if candidate in seen_in_order:
            continue
        
        seen_in_order.append(candidate)
    
    return seen_in_order

In [2]:
# Model -> brand pairs, written side by side so each model sits next to its own brand
model_brand_pairs = [
    ('330', 'bmw'),
    ('a4', 'audi'),
    ('tl-s', 'acura'),
    ('300m', 'chrysler'),
    ('x-type', 'jaguar'),
    ('i35', 'infiniti'),
    ('c-class', 'mercedes'),
    ('s60', 'volvo'),
    ('is300', 'lexus'),
    ('es300', 'lexus'),
    ('cts', 'cadillac'),
    ('g35', 'infiniti'),
]

# Split the pairs into the parallel model / brand lists and build the lookup dictionary
keys, values = map(list, zip(*model_brand_pairs))
brand_dict = dict(model_brand_pairs)

In [3]:
# Display the model -> brand mapping to check that every model is paired with the right brand
brand_dict

{'300m': 'chrysler',
 '330': 'bmw',
 'a4': 'audi',
 'c-class': 'mercedes',
 'cts': 'cadillac',
 'es300': 'lexus',
 'g35': 'infiniti',
 'i35': 'infiniti',
 'is300': 'lexus',
 's60': 'volvo',
 'tl-s': 'acura',
 'x-type': 'jaguar'}

In [9]:
import math

def count_brand_frequencies(posts, brand_dict):
    
    ''' Take list of posts and return a dictionary of frequency counts for each brand that was mentioned

    A brand is counted at most once per post, whether it was mentioned by its brand name or
    through one of its models, and however many times it appears in that post.

    posts      -- iterable of posts, each either a text string or UTF-8 encoded bytes
    brand_dict -- dictionary mapping model names to their brand names
    returns    -- dictionary mapping each mentioned brand to the number of posts mentioning it
    '''
    
    # Initialize frequency dictionary
    brand_counts = {}
    
    # Iterate through posts
    for post in posts:
        
        # Only byte strings need decoding. Calling .decode() unconditionally fails on text that
        # is already decoded: Python 3 str has no .decode(), and a Python 2 unicode object is
        # implicitly ASCII-encoded first, which raises on any non-ASCII character.
        if isinstance(post, bytes):
            post = post.decode("utf8")
        
        # Retrieve brand names and models from post
        targets = get_targets(post, brand_dict)
        
        # Create list of brands that were mentioned in post (models converted to respective brands)
        brands = models_to_brands(targets, brand_dict)
        
        # Retrieve only unique brand names so each brand counts once per post
        unique_brands = get_unique_brands(brands)
        
        # Add each brand that was mentioned to total brand count
        for brand in unique_brands:
            brand_counts[brand] = brand_counts.get(brand, 0) + 1
    
    return brand_counts

In [10]:
import pandas as pd

# Load the forum posts and keep only the rows that actually have post text
edmunds = pd.read_csv('Edmunds_Posts.csv')
post_column = edmunds['Post']
posts = post_column.dropna().tolist()

# First 204 posts, used as a small sample for a quick trial run
test = posts[:204]

In [11]:
# Run on sample: only the first 204 posts, as a quick check before processing every post
count_brand_frequencies(test, brand_dict)

{u'acura': 25,
 'audi': 45,
 u'bmw': 44,
 'cadillac': 12,
 u'chrysler': 8,
 'infiniti': 42,
 'jaguar': 50,
 u'lexus': 24,
 u'mercedes': 23,
 'volvo': 14}

In [13]:
# Run on entire set of posts (every row of the CSV with a non-missing 'Post' value)
count_brand_frequencies(posts, brand_dict)

{u'acura': 698,
 'audi': 681,
 u'bmw': 2315,
 'cadillac': 708,
 u'chrysler': 73,
 'infiniti': 1512,
 'jaguar': 129,
 u'lexus': 972,
 u'mercedes': 323,
 'volvo': 272}