For this assignment I will attempt to subset tweets by users who could reasonably be assumed to be fans of, or at least familiar with, the hip hop group Insane Clown Posse.

In [1]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from pprint import pprint
from collections import OrderedDict

# Ensure NLTK resources are downloaded (if not already)
import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:


df = pd.read_csv('vaping_tweets.csv')

# To check the first few rows of your dataframe
print(df.head())


                                               Tweet
0  what the heck in this "time out" bullshit on t...
1  electronic cigarettes | blu cigs | starter kit...
2  @gastro_celtic blu stu still smoking those mor...
3  @dosh22 @starscream_blu  the door. it's just v...
4  fradi abito smoking slim fit fradi blu royal i...


I am starting with the terms "ICP" a commonly used acronym for the group, and "whoop." "Whoop Whoop" is commonly used as a greeting or interjection within the community of ICP fans (often referred to as "Juggalos" or "The Family").

In [3]:
subset = df[df['Tweet'].str.contains('ICP|whoop', case=False, na=False)]

# Display the subset of rows where "Tweet" contains either "ICP" or "whoop"
print(subset.head)

<bound method NDFrame.head of                                                     Tweet
18      chef eddie huang and vaping brand blu are goin...
75      @therealjuicyj @currensy_spitta @ovo40 @chrisd...
610     the next dumbass who whines to me about how na...
1222    @mysticphonk jus come over to do laundry we ca...
2062    screwed around and got a nicotine addiction fr...
...                                                   ...
335248  @keigh_see @foulkesy1 @bamadan78 @angrygeek2 @...
335547  i added a video to a @youtube playlist https:/...
337617  we're not even supposed to rly be outside but ...
337714  vapeshlurp whoop whoop!! 😤🦁💨💨   #aspire #beard...
337787  federal government supplies dan andrews vic go...

[411 rows x 1 columns]>


I will use the "compare_corpora" function I built for our group comparison assignment, modified to remove usernames in tweets.

In [4]:
def compare_corpora(corpus_1, corpus_2, num_words, ratio_cutoff):
    """
    Compares two corpora and returns a dictionary with various statistics and comparison results.
    
    Parameters:
    corpus_1 (str): The first corpus.
    corpus_2 (str): The second corpus.
    num_words (int): The number of top most frequent words to consider for comparison.
    ratio_cutoff (float): The frequency cutoff for considering tokens in comparison.
    
    Returns:
    dict: A dictionary containing the following keys:
        - "one": Statistics for corpus_1.
        - "two": Statistics for corpus_2.
        - "one_vs_two": Comparison of top words in corpus_1 to corpus_2.
        - "two_vs_one": Comparison of top words in corpus_2 to corpus_1.
    """
    
    def preprocess_text(corpus):
        # Tokenize by splitting on white space
        tokens = word_tokenize(corpus.lower())  # Tokenize and lowercase

        # Remove words that start with "@"
        tokens = [word for word in tokens if not word.startswith('@')]

        # Remove punctuation from the remaining tokens
        tokens = [re.sub(r'[^\w\s]', '', token) for token in tokens]  # Remove punctuation

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [word for word in tokens if word not in stop_words]

        return filtered_tokens
    
    def calculate_corpus_statistics(tokens, num_words):
        # Total number of tokens
        tokens_count = len(tokens)
        
        # Number of unique tokens
        unique_tokens_count = len(set(tokens))
        
        # Average token length
        avg_token_length = sum(len(token) for token in tokens) / tokens_count if tokens_count > 0 else 0
        
        # Lexical diversity
        lexical_diversity = unique_tokens_count / tokens_count if tokens_count > 0 else 0
        
        # Top most frequent words
        top_words = Counter(tokens).most_common(num_words)
        
        return {
            "tokens": tokens_count,
            "unique_tokens": unique_tokens_count,
            "avg_token_length": avg_token_length,
            "lexical_diversity": lexical_diversity,
            "top_words": top_words
        }

    def calculate_use_ratio(token_counts, total_tokens, ratio_cutoff):
        # Calculate the use_ratio for tokens that occur at least `ratio_cutoff` times
        use_ratios = {token: count / total_tokens for token, count in token_counts.items() if count >= ratio_cutoff}
        return use_ratios

    # Preprocess both corpora
    tokens_1 = preprocess_text(corpus_1)
    tokens_2 = preprocess_text(corpus_2)
    
    # Calculate statistics for both corpora
    corpus_1_stats = calculate_corpus_statistics(tokens_1, num_words)
    corpus_2_stats = calculate_corpus_statistics(tokens_2, num_words)
    
    # Calculate frequency of tokens in both corpora
    token_counts_1 = Counter(tokens_1)
    token_counts_2 = Counter(tokens_2)
    
    # Calculate use_ratio for tokens in each corpus that occur at least `ratio_cutoff` times
    use_ratios_1 = calculate_use_ratio(token_counts_1, corpus_1_stats["tokens"], ratio_cutoff)
    use_ratios_2 = calculate_use_ratio(token_counts_2, corpus_2_stats["tokens"], ratio_cutoff)
    
    # Create dictionaries for 1v2 and 2v1
    ratio_1v2 = {}
    ratio_2v1 = {}

    # Calculate the use_ratio comparison for each corpus
    for token, use_ratio in use_ratios_1.items():
        if token in use_ratios_2:
            ratio_1v2[token] = use_ratio / use_ratios_2[token]

    for token, use_ratio in use_ratios_2.items():
        if token in use_ratios_1:
            ratio_2v1[token] = use_ratio / use_ratios_1[token]

    # Sort the ratios from highest to lowest
    sorted_ratio_1v2 = sorted(ratio_1v2.items(), key=lambda x: x[1], reverse=True)
    sorted_ratio_2v1 = sorted(ratio_2v1.items(), key=lambda x: x[1], reverse=True)


    # Store the top `num_words` words with the highest ratios in the dictionaries using OrderedDict
    dict_1v2 = OrderedDict(sorted_ratio_1v2[:num_words])
    dict_2v1 = OrderedDict(sorted_ratio_2v1[:num_words])

    # Create the final dictionary with the specified structure
    result_dict = {
        "one": corpus_1_stats,
        "two": corpus_2_stats,
        "one_vs_two": dict_1v2,
        "two_vs_one": dict_2v1
    }

    return result_dict





Here is my initial comparison and result:

In [5]:
# Separate the rows in df that are not in subset
non_subset = df[~df.index.isin(subset.index)]

# Combine the "Tweet" text into strings for each corpus
corpus_1 = " ".join(subset["Tweet"].tolist())
corpus_2 = " ".join(non_subset["Tweet"].tolist())

# Use your compare_corpora function to compare the corpora
result = compare_corpora(corpus_1, corpus_2, num_words=10, ratio_cutoff=5)

# Print the comparison result
pprint(result)


{'one': {'avg_token_length': 5.889001888711317,
         'lexical_diversity': 0.2888275461281418,
         'tokens': 13766,
         'top_words': [('', 2854),
                       ('https', 214),
                       ('juul', 168),
                       ('smoking', 136),
                       ('vape', 80),
                       ('nicotine', 73),
                       ('vaping', 71),
                       ('doo', 61),
                       ('get', 52),
                       ('amp', 44)],
         'unique_tokens': 3976},
 'one_vs_two': OrderedDict([('kayzomusic', 2614.4375635624006),
                            ('craysounds_', 2614.4375635624006),
                            ('quackhouse', 2614.4375635624006),
                            ('leespielman', 2556.338951038791),
                            ('keigh_see', 2033.4514383263113),
                            ('hi_mija', 1742.9583757082669),
                            ('fantasticradiouk', 1646.1273548355853),
             

Next, I will build a new subset incorporating three words with unusually high usage ratios in the first subset.

In [9]:
# Create the first subset with rows containing any of the specified keywords
subset_with_keywords = df[
    df['Tweet'].str.contains('ICP|whoop|kayzomusic|craysounds_|quackhouse|leespielman', case=False, na=False)
]

# Create the second subset with rows that do not contain the specified keywords
subset_without_keywords = df[
    ~df['Tweet'].str.contains('ICP|whoop|kayzomusic|craysounds_|quackhouse|leespielman', case=False, na=False)
]


And run the compare corpora function on my new subsets:

In [12]:
# Combine the "Tweet" text from each subset into a single string
corpus_3 = " ".join(subset_with_keywords["Tweet"].tolist())
corpus_4 = " ".join(subset_without_keywords["Tweet"].tolist())

# Run the compare_corpora function on the two corpora
result = compare_corpora(corpus_3, corpus_4, num_words=10, ratio_cutoff=5)

# Print the comparison result
pprint(result)

{'one': {'avg_token_length': 5.940816756870738,
         'lexical_diversity': 0.28270190482884655,
         'tokens': 14227,
         'top_words': [('', 2892),
                       ('https', 214),
                       ('juul', 177),
                       ('smoking', 137),
                       ('vape', 85),
                       ('nicotine', 79),
                       ('vaping', 73),
                       ('doo', 61),
                       ('get', 53),
                       ('amp', 49)],
         'unique_tokens': 4022},
 'one_vs_two': OrderedDict([('diplo', 2569.7276606854175),
                            ('kidcudi', 2248.51170309974),
                            ('keigh_see', 1967.4477402122723),
                            ('garethemery', 1798.8093624797923),
                            ('arminvanbuuren', 1798.8093624797923),
                            ('ashwallbridge', 1798.8093624797923),
                            ('ekalimusic', 1606.0797879283857),
                  

That was so much fun! Let's do it again!

In [13]:
# Create the first subset with rows containing any of the specified keywords
subset_with_2 = df[
    df['Tweet'].str.contains('ICP|whoop|kayzomusic|craysounds_|quackhouse|leespielman|diplo|kidcudi', case=False, na=False)
]

# Create the second subset with rows that do not contain the specified keywords
subset_without_2 = df[
    ~df['Tweet'].str.contains('ICP|whoop|kayzomusic|craysounds_|quackhouse|leespielman|diplo|kidcudi', case=False, na=False)
]


In [14]:
# Combine the "Tweet" text from each subset into a single string
corpus_5 = " ".join(subset_with_2["Tweet"].tolist())
corpus_6 = " ".join(subset_without_2["Tweet"].tolist())

# Run the compare_corpora function on the two corpora
result = compare_corpora(corpus_5, corpus_6, num_words=10, ratio_cutoff=5)

# Print the comparison result
pprint(result)

{'one': {'avg_token_length': 5.9903119550474715,
         'lexical_diversity': 0.27669056384421625,
         'tokens': 15483,
         'top_words': [('', 3061),
                       ('https', 222),
                       ('juul', 186),
                       ('smoking', 151),
                       ('vape', 85),
                       ('nicotine', 82),
                       ('vaping', 77),
                       ('doo', 61),
                       ('amp', 58),
                       ('get', 56)],
         'unique_tokens': 4284},
 'one_vs_two': OrderedDict([('illgatesmusic', 4131.5703674998385),
                            ('ovo40', 1877.9865306817446),
                            ('asvpxrocky', 1877.9865306817446),
                            ('keigh_see', 1807.5620357811795),
                            ('theslumpgod', 1589.0655259614764),
                            ('madeintyo', 1589.0655259614764),
                            ('2chainz', 1475.5608455356564),
                    

ONE MORE TIME!!!!

In [20]:
# Create the first subset with rows containing any of the specified keywords
subset_with_3 = df[
    df['Tweet'].str.contains('ICP|whoop|kayzomusic|craysounds_|quackhouse|leespielman|diplo|kidcudi|illgatesmusic|ovo40|asvpxrocky|keigh_see', case=False, na=False)
]

# Create the second subset with rows that do not contain the specified keywords
subset_without_3 = df[
    ~df['Tweet'].str.contains('ICP|whoop|kayzomusic|craysounds_|quackhouse|leespielman|diplo|kidcudi|illgatesmusic|ovo40|asvpxrocky|keigh_see', case=False, na=False)
]


In [22]:
# Combine the "Tweet" text from each subset into a single string
corpus_7 = " ".join(subset_with_3["Tweet"].tolist())
corpus_8 = " ".join(subset_without_3["Tweet"].tolist())

# Run the compare_corpora function on the two corpora
result = compare_corpora(corpus_7, corpus_8, num_words=10, ratio_cutoff=5)

# Print the comparison result
pprint(result)

{'one': {'avg_token_length': 6.143236875490848,
         'lexical_diversity': 0.2701020962967438,
         'tokens': 16553,
         'top_words': [('', 3125),
                       ('https', 224),
                       ('juul', 202),
                       ('smoking', 158),
                       ('nicotine', 90),
                       ('vape', 90),
                       ('vaping', 77),
                       ('amp', 66),
                       ('doo', 61),
                       ('get', 58)],
         'unique_tokens': 4471},
 'one_vs_two': OrderedDict([('devonsmillie', 4636.782021385851),
                            ('tonykay69', 4636.782021385851),
                            ('chadkerley', 3944.484705692825),
                            ('richbrian', 3477.5865160393887),
                            ('chrisdelia', 2958.3635292696185),
                            ('hicksfilmedit', 2511.590261584003),
                            ('ibluestone', 2221.791385247387),
                  

I appear to have unintentionally built an "Other artists you might like" function... Let's see if it works with a different group:

In [28]:
# Create the first subset with rows containing any of the specified keywords
subset_with = df[
    df['Tweet'].str.contains('Migos', case=False, na=False)
]

# Create the second subset with rows that do not contain the specified keywords
subset_without = df[
    ~df['Tweet'].str.contains('Migos', case=False, na=False)
]
# Combine the "Tweet" text from each subset into a single string
corpus_9 = " ".join(subset_with["Tweet"].tolist())
corpus_10 = " ".join(subset_without["Tweet"].tolist())

# Run the compare_corpora function on the two corpora
result = compare_corpora(corpus_9, corpus_10, num_words=10, ratio_cutoff=5)

# Print the comparison result
pprint(result)

{'one': {'avg_token_length': 4.0064,
         'lexical_diversity': 0.3792,
         'tokens': 2500,
         'top_words': [('', 684),
                       ('migos', 64),
                       ('katy', 36),
                       ('perry', 36),
                       ('savage', 36),
                       ('https', 34),
                       ('21', 31),
                       ('amp', 22),
                       ('lil', 22),
                       ('logic', 15)],
         'unique_tokens': 948},
 'one_vs_two': OrderedDict([('vert', 1348.9047578947368),
                            ('uzi', 743.7041857142857),
                            ('lamar', 431.2604153846154),
                            ('travis', 374.45245714285716),
                            ('malone', 348.9122455445545),
                            ('kendrick', 251.9723775280899),
                            ('drake', 240.27366),
                            ('cole', 213.57658666666666),
                            ('khalid',

Relatively strong in the "artist suggestion" department. "Uzi" and "Vert" refer to a member of migos. "Lamar" and "kendrick" could refer to popular hip hop artist kendrick lamar. Other possible artist mentions are Travis Scott, Post Malone, Drake, J. Cole, DJ Khalid, and Cardi B. Lets see what one more iteration returns:

In [30]:
# Create the first subset with rows containing any of the specified keywords
subset_with = df[
    df['Tweet'].str.contains('Migos|lamar|travis|malone|kendrick|drake|cole|khalid|cardi', case=False, na=False)
]

# Create the second subset with rows that do not contain the specified keywords
subset_without = df[
    ~df['Tweet'].str.contains('Migos|lamar|travis|malone|kendrick|drake|cole|khalid|cardi', case=False, na=False)
]
# Combine the "Tweet" text from each subset into a single string
corpus_9 = " ".join(subset_with["Tweet"].tolist())
corpus_10 = " ".join(subset_without["Tweet"].tolist())

# Run the compare_corpora function on the two corpora
result = compare_corpora(corpus_9, corpus_10, num_words=10, ratio_cutoff=5)

# Print the comparison result
pprint(result)

{'one': {'avg_token_length': 5.205376380861178,
         'lexical_diversity': 0.20065665720441875,
         'tokens': 58478,
         'top_words': [('', 11360),
                       ('juul', 709),
                       ('smoking', 569),
                       ('https', 566),
                       ('logic', 462),
                       ('vape', 372),
                       ('katy', 331),
                       ('perry', 321),
                       ('x', 284),
                       ('cigarettes', 283)],
         'unique_tokens': 11734},
 'one_vs_two': OrderedDict([('johnferris62', 761.6133109887479),
                            ('ruth91869132', 646.0112905708129),
                            ('pmatist', 639.2111717226992),
                            ('imbalaska', 639.2111717226992),
                            ('bartbow', 634.67775915729),
                            ('iandjbrown2', 600.6771649167209),
                            ('sander_1954', 600.6771649167209),
               

Not a very strong slate of music suggestions. This iteration appears to return all usernames of people mentioned in tweets involving the artists mentioned above. Variations on the "compare_corpora" functioned outlined here could possibly be used to suggest new artists or identify other users with similar music tastes.