# ADS 509 Assignment 2.1: Tokenization, Normalization, Descriptive Statistics 

## Halee Staggs
### Disclaimer: This assignment was aided by ChatGPT4o. All code was verified for accuracy and the code blocks are commented where this tool was used.

In [1]:
import os
import re
#!pip install emoji
import emoji
import pandas as pd
import numpy as np

from collections import Counter, defaultdict
from nltk.corpus import stopwords
from string import punctuation
import nltk

# Download the stopwords if not already downloaded
#nltk.download('stopwords')

# Load stopwords
sw = stopwords.words("english")

In [2]:
# Add any additional import statements you need here
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
import string
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Location of data on machine
data_location = r"C:\Users\Halee\Downloads\M1AssignmentData"

# Define subfolders
twitter_folder = os.path.join(data_location, "M1Results", "twitter")
lyrics_folder = os.path.join(data_location, "M1Results", "lyrics")

print("Twitter folder path:", twitter_folder)
print("Lyrics folder path:", lyrics_folder)

Twitter folder path: C:\Users\Halee\Downloads\M1AssignmentData\M1Results\twitter
Lyrics folder path: C:\Users\Halee\Downloads\M1AssignmentData\M1Results\lyrics


In [4]:
# Function to output desc stats
# CODE ASSISTED BY CHATGPT4O
def descriptive_stats(tokens, num_tokens=5, verbose=True):

    # Given a list of tokens...
    total_tokens = len(tokens)  # Number of tokens
    unique_tokens = len(set(tokens))  # Number of unique tokens
    num_characters = sum(len(token) for token in tokens)  # Numbers of characters
    lexical_diversity = unique_tokens / total_tokens if total_tokens > 0 else 0  # Lexical diversity
    
    # Calculate the most common tokens
    token_counts = Counter(tokens)
    most_common_tokens = token_counts.most_common(num_tokens)
    
    # Set up statement syntax
    if verbose:
        print(f"There are {total_tokens} tokens in the data.")
        print(f"There are {unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")
        print(f"The {num_tokens} most common tokens are:")
        for token, count in most_common_tokens:
            print(f"'{token}': {count} times")
    
    # Return list of values
    return [total_tokens, unique_tokens, lexical_diversity, num_characters]

In [5]:
text = """here is some example text with other example text here in this text""".split()
assert(descriptive_stats(text, verbose=True)[0] == 13)
assert(descriptive_stats(text, verbose=False)[1] == 9)
assert(abs(descriptive_stats(text, verbose=False)[2] - 0.69) < 0.02)
assert(descriptive_stats(text, verbose=False)[3] == 55)


There are 13 tokens in the data.
There are 9 unique tokens in the data.
There are 55 characters in the data.
The lexical diversity is 0.692 in the data.
The 5 most common tokens are:
'text': 3 times
'here': 2 times
'example': 2 times
'is': 1 times
'some': 1 times


Q: Why is it beneficial to use assertion statements in your code? 

A: It helps to flag errors in the code and enhance code quality. 

## Data Input

In [4]:
# CODE ASSISTED BY CHATGPT4O
# Create function to combine lyrics data into one dataframe
def combine_lyrics_to_dataframe(directory):
    data = []  # Create empty list to append data to

    # Iterate through all files in the directory
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".txt"):  # Specify to find text files with lyric data
                try:
                    # Get the artist name from the parent folder
                    artist = os.path.basename(root)

                    # Read the content of the file
                    file_path = os.path.join(root, file)
                    with open(file_path, 'r', encoding='utf-8') as f:
                        lines = f.readlines()

                    # Get the song name from the first line of the file
                    songname = lines[0].strip()

                    # Combine the rest of the lines into lyrics
                    lyrics = ''.join(lines[1:]).strip()

                    # Append the data to the list
                    data.append([artist, songname, lyrics])

                except Exception as e:
                    print(f"Error processing file {file}: {e}")  # Add bug readout

    # Create the DataFrame
    if data:
        df = pd.DataFrame(data, columns=['artist', 'songname', 'lyrics'])
    else:
        df = pd.DataFrame(columns=['artist', 'songname', 'lyrics'])

    return df

In [5]:
# Compile lyrics by applying function to specified file path of lyric data
lyrics_df = combine_lyrics_to_dataframe(lyrics_folder)
lyrics_df  # Confirm both artists were added

Unnamed: 0,artist,songname,lyrics
0,cher,"""88 Degrees""","Stuck in L.A., ain't got no friends \nAnd so H..."
1,cher,"""A Different Kind Of Love Song""",What if the world was crazy and I was sane\nWo...
2,cher,"""After All""","Well, here we are again\nI guess it must be fa..."
3,cher,"""Again""",Again evening finds me at your door \nHere to ...
4,cher,"""Alfie""","What's it all about, Alfie?\nIs it just for th..."
...,...,...,...
415,robyn,"""We Dance To The Beat""",We dance to the beat\nWe dance to the beat\nWe...
416,robyn,"""Where Did Our Love Go""",Thoughts about you and me \nThinkin' about wha...
417,robyn,"""Who's That Girl""",Good girls are pretty like all the time\nI'm j...
418,robyn,"""With Every Heartbeat""",Maybe we could make it all right\nWe could mak...


In [8]:
# CODE ASSISTED BY CHATGPT4O
# Creat function to read in Twitter data
def combine_twitter_to_dataframe(directory):
    data = []

    # Iterate through all files in the directory
    for root, _, files in os.walk(directory):
        for file in files:
            if "_data" in file and file.endswith(".txt"):  # Confirm files are text files that contain follower data
                try:
                    # Read the content of the file as tab-delimited, skipping bad lines
                    file_path = os.path.join(root, file)
                    
                    # Read in files as tab-delimited dataframe
                    # Skip lines that have missing data or abnormal structure
                    df_temp = pd.read_csv(file_path, delimiter='\t', on_bad_lines = 'skip')

                    # Extract the artist name from the file name since its the first keyword 
                    artist = file.split('_')[0]

                    # Check if 'description' column exists in the df_temp
                    if 'description' in df_temp.columns:
                        # Append the artist name and the description column to the data list
                        for description in df_temp['description']:
                            data.append([artist, description])
                    else:
                        print(f"Column 'description' not found in file {file}")

                except Exception as e:
                    print(f"Error processing file {file}: {e}")

    # Create the final DataFrame
    if data:
        df = pd.DataFrame(data, columns=['artist', 'description'])
    else:
        df = pd.DataFrame(columns=['artist', 'description'])

    return df

In [14]:
# Compile twitter by applying function to specified file path of twitter data
twitter_df = combine_twitter_to_dataframe(twitter_folder)
twitter_df  # Confirm both artists were added along with descriptions

Unnamed: 0,artist,description
0,cher,
1,cher,ùôøùöõùöòùöûùöç ùöúùöûùöôùöôùöòùöõùöùùöéùöõ ùöòùöè ùöñùöéùöúùöúùö¢ ùöãùöûùöóùöú & ùöïùöéùöêùöêùöíùöóùöêùöú
2,cher,163„éùÔºèÊÑõ„Åã„Å£„Å∑üíú26Ê≠≥üçí Â∑•„ÄáÂ•Ω„Åç„Å™Â•≥„ÅÆÂ≠êüíì „Éï„Ç©„É≠„Éº„Åó„Å¶„Åè„Çå„Åü„ÇâDM„Åó„Åæ„Åôüß°
3,cher,csu
4,cher,Writer @Washinformer @SpelmanCollege alumna #D...
...,...,...
4268136,robynkonichiwa,"singer of songs, type 1 diabetic, tired $jakel..."
4268137,robynkonichiwa,Dadx2/ Con-Arch/ Photographer/ DK #stemgr√∏nnes...
4268138,robynkonichiwa,A year to change a life is still a year ‚ú®üòå
4268139,robynkonichiwa,Head of Consumer - Mango. Made in Melbourne. R...


## Data Cleaning

In [6]:
# CODE ASSISTED BY CHATGPT4O
# Function to creat clean data
def clean_and_tokenize(text):
    # Explicitly remove apostrophes and quotes
    additional_punctuation = "‚Äô‚Äò‚Äú‚Äù"  

    # Replace additional punctuation with empty string
    for char in additional_punctuation:
        text = text.replace(char, "")
        
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Split on whitespace and fold to lowercase
    words = word_tokenize(text.lower())
    
    # Remove stopwords
    words = [word for word in words if word not in sw]
    
    return words

In [15]:
# Update Twitter data type to string
twitter_df['description'] = twitter_df['description'].astype('str')

# Update alternative NA descriptions to NA
twitter_df['description'] = twitter_df['description'].replace(['nan', 'NaN'], pd.NA)

# Fill in any missing values with empty strings
twitter_df['description'] = twitter_df['description'].fillna('')

In [16]:
# Apply text cleaning to twitter data
twitter_df['clean_text'] = twitter_df['description'].apply(clean_and_tokenize)

In [7]:
# Update lyrics data type to string
lyrics_df['lyrics'] = lyrics_df['lyrics'].astype('str')

# Fill in any missing values with empty strings
lyrics_df['lyrics'] = lyrics_df['lyrics'].fillna('')

# Apply text cleaning to lyrics data
lyrics_df['clean_text'] = lyrics_df['lyrics'].apply(clean_and_tokenize)

## Examine Clean Text Data

### Twitter Before/After

In [17]:
twitter_df['description'].head()

0                                                     
1             ùôøùöõùöòùöûùöç ùöúùöûùöôùöôùöòùöõùöùùöéùöõ ùöòùöè ùöñùöéùöúùöúùö¢ ùöãùöûùöóùöú & ùöïùöéùöêùöêùöíùöóùöêùöú
2            163„éùÔºèÊÑõ„Åã„Å£„Å∑üíú26Ê≠≥üçí Â∑•„ÄáÂ•Ω„Åç„Å™Â•≥„ÅÆÂ≠êüíì „Éï„Ç©„É≠„Éº„Åó„Å¶„Åè„Çå„Åü„ÇâDM„Åó„Åæ„Åôüß°
3                                                  csu
4    Writer @Washinformer @SpelmanCollege alumna #D...
Name: description, dtype: object

In [18]:
twitter_df['clean_text'].head()

0                                                   []
1        [ùôøùöõùöòùöûùöç, ùöúùöûùöôùöôùöòùöõùöùùöéùöõ, ùöòùöè, ùöñùöéùöúùöúùö¢, ùöãùöûùöóùöú, ùöïùöéùöêùöêùöíùöóùöêùöú]
2        [163„éùÔºèÊÑõ„Åã„Å£„Å∑üíú26Ê≠≥üçí, Â∑•„ÄáÂ•Ω„Åç„Å™Â•≥„ÅÆÂ≠êüíì, „Éï„Ç©„É≠„Éº„Åó„Å¶„Åè„Çå„Åü„Çâdm„Åó„Åæ„Åôüß°]
3                                                [csu]
4    [writer, washinformer, spelmancollege, alumna,...
Name: clean_text, dtype: object

### Lyrics Before/After

In [15]:
lyrics_df['lyrics'].head()

0    Stuck in L.A., ain't got no friends \nAnd so H...
1    What if the world was crazy and I was sane\nWo...
2    Well, here we are again\nI guess it must be fa...
3    Again evening finds me at your door \nHere to ...
4    What's it all about, Alfie?\nIs it just for th...
Name: lyrics, dtype: object

In [16]:
lyrics_df['clean_text'].head()

0    [stuck, la, aint, got, friends, hollywood, nut...
1    [world, crazy, sane, would, strange, cant, bel...
2    [well, guess, must, fate, weve, tried, deep, i...
3    [evening, finds, door, ask, could, try, dont, ...
4    [whats, alfie, moment, live, whats, sort, alfi...
Name: clean_text, dtype: object

## Basic Descriptive Statistics 

In [20]:
# CODE ASSISTED BY CHATGPT4O
# Create function for desc stats for dataframe of tokens
def descriptive_stats(token_lists, num_tokens=5, verbose=True):

    # Flatten the list of lists
    tokens = [token for sublist in token_lists for token in sublist]

    # Calculate the required values
    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    num_characters = sum(len(token) for token in tokens)
    lexical_diversity = unique_tokens / total_tokens if total_tokens > 0 else 0
    
    # Calculate the most common tokens
    token_counts = Counter(tokens)
    most_common_tokens = token_counts.most_common(num_tokens)
    
    if verbose:
        print(f"There are {total_tokens} tokens in the data.")
        print(f"There are {unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")
        print(f"The {num_tokens} most common tokens are:")
        for token, count in most_common_tokens:
            print(f"'{token}': {count} times")
    
    return [total_tokens, unique_tokens, lexical_diversity, num_characters]

In [21]:
# Descriptive statistics for Cher - Lyrics
cher_lyric_stats = descriptive_stats(lyrics_df['clean_text'][lyrics_df['artist'] == 'cher'])

There are 35684 tokens in the data.
There are 3684 unique tokens in the data.
There are 169160 characters in the data.
The lexical diversity is 0.103 in the data.
The 5 most common tokens are:
'love': 966 times
'im': 511 times
'know': 480 times
'dont': 430 times
'na': 348 times


In [22]:
# Descriptive statistic for Robyn - Lyrics
robyn_lyric_stats = descriptive_stats(lyrics_df['clean_text'][lyrics_df['artist'] == 'robyn'])

There are 15244 tokens in the data.
There are 2138 unique tokens in the data.
There are 72800 characters in the data.
The lexical diversity is 0.140 in the data.
The 5 most common tokens are:
'know': 305 times
'im': 299 times
'dont': 297 times
'got': 274 times
'love': 269 times


In [23]:
# Descriptive statistics for Cher - Twitter
cher_twitter_stats = descriptive_stats(twitter_df['clean_text'][twitter_df['artist'] == 'cher'])

There are 16141920 tokens in the data.
There are 1679896 unique tokens in the data.
There are 95853361 characters in the data.
The lexical diversity is 0.104 in the data.
The 5 most common tokens are:
'love': 214772 times
'im': 162148 times
'life': 123262 times
'music': 88222 times
'de': 73286 times


In [24]:
# Descriptive statistics for Robyn - Twitter
robyn_twitter_stats = descriptive_stats(twitter_df['clean_text'][twitter_df['artist'] == 'robynkonichiwa'])

There are 1537870 tokens in the data.
There are 268753 unique tokens in the data.
There are 9384426 characters in the data.
The lexical diversity is 0.175 in the data.
The 5 most common tokens are:
'music': 15172 times
'love': 11705 times
'im': 10224 times
'och': 7923 times
'life': 7413 times


Q: How do you think the "top 5 words" would be different if we left stopwords in the data? 

A: If we left stopwords in the data, then the top 5 words would consist of stop words. 

---

Q: What were your prior beliefs about the lexical diversity between the artists? Does the difference (or lack thereof) in lexical diversity between the artists conform to your prior beliefs? 

A: I expected that Cher would look different than Robyn due to her celebrity status, but the content appears to be very similar. 


## Specialty Statistics

In [22]:
assert(emoji.is_emoji("‚ù§Ô∏è"))
assert(not emoji.is_emoji(":-)"))

### Emojis üòÅ

What are the ten most common emojis by artist in the twitter descriptions? 


In [25]:
# Function to extract emojies
def extract_emojis(text):
    return [char for char in text if emoji.is_emoji(char)]

In [26]:
# Extract emojis and hashtags from clean text of descriptions
twitter_df['emojis'] = twitter_df['clean_text'].apply(extract_emojis)

In [27]:
# CODE ASSISTED BY CHATGPT4O
# Ten most common emojis by artist in the twitter descriptions
def most_common_by_artist(df, column, top_n=10):
    common_by_artist = {}
    for artist, group in df.groupby('artist'):
        all_items = [item for sublist in group[column] for item in sublist]
        common_by_artist[artist] = Counter(all_items).most_common(top_n)
    return common_by_artist

common_emojis_by_artist = most_common_by_artist(twitter_df, 'emojis')
print("Ten most common emojis by artist:")
print(common_emojis_by_artist)

Ten most common emojis by artist:
{'cher': [('‚ù§Ô∏è', 14738), ('üè≥Ô∏è\u200düåà', 14196), ('‚ô•', 10243), ('‚ù§', 9683), ('‚ú®', 8410), ('üåà', 5500), ('üá∫üá∏', 3738), ('üíô', 3709), ('üíú', 3511), ('üåä', 3303)], 'robynkonichiwa': [('üè≥Ô∏è\u200düåà', 1710), ('‚ô•', 1170), ('‚ù§Ô∏è', 991), ('‚ú®', 756), ('‚ù§', 655), ('üåà', 572), ('üé∂', 273), ('üéß', 214), ('üñ§', 212), ('üíú', 207)]}


### Hashtags

What are the ten most common hashtags by artist in the twitter descriptions? 


In [28]:
# Function to extract hashtags
def extract_hashtags(text):
    return re.findall(r'#\w+', text)

In [29]:
# Update clean text to string data type
twitter_df['description'] = twitter_df['description'].astype('str')
twitter_df['hashtags'] = twitter_df['description'].apply(extract_hashtags)

In [30]:
# CODE ASSISTED BY CHATGPT4O
# Ten most common hashtags by artist in the twitter descriptions
common_hashtags_by_artist = most_common_by_artist(twitter_df, 'hashtags')
print("Ten most common hashtags by artist:")
print(common_hashtags_by_artist)

Ten most common hashtags by artist:
{'cher': [('#BLM', 9532), ('#Resist', 6032), ('#BlackLivesMatter', 4675), ('#resist', 3793), ('#FBR', 3238), ('#TheResistance', 2992), ('#blacklivesmatter', 2645), ('#1', 2633), ('#Resistance', 1915), ('#RESIST', 1821)], 'robynkonichiwa': [('#BlackLivesMatter', 337), ('#BLM', 307), ('#blacklivesmatter', 208), ('#1', 199), ('#music', 174), ('#Music', 113), ('#EDM', 86), ('#LGBTQ', 75), ('#TeamFollowBack', 59), ('#blm', 56)]}


### Song Titles

In [31]:
# CODE ASSISTED BY CHATGPT4O
# Five most common words in song titles by artist
def most_common_words_in_titles(df, column, top_n=5):
    common_words_by_artist = {}
    for artist, group in df.groupby('artist'):
        all_titles = [word for sublist in group[column] for word in sublist]
        common_words_by_artist[artist] = Counter(all_titles).most_common(top_n)
    return common_words_by_artist

# Song titles are in 'songname' column in compiled lyrics dataframe
lyrics_df['cleaned_titles'] = lyrics_df['songname'].apply(clean_and_tokenize)
common_words_in_titles_by_artist = most_common_words_in_titles(lyrics_df, 'cleaned_titles')
print("Five most common words in song titles by artist:")
print(common_words_in_titles_by_artist)

Five most common words in song titles by artist:
{'cher': [('love', 38), ('man', 12), ('song', 11), ('dont', 10), ('come', 7)], 'robyn': [('love', 6), ('dont', 4), ('u', 4), ('thing', 3), ('girl', 3)]}


### Song Lengths

Q: What does the regular expression `'\s+'` match on? 

A: One or more spaces. 


In [8]:
collapse_whitespace = re.compile(r'\s+')

def tokenize_lyrics(lyric) : 
    """strip and split on whitespace"""
    return([item.lower() for item in collapse_whitespace.split(lyric)])

In [9]:
# Update clean text to string data type
lyrics_df['clean_text'] = lyrics_df['clean_text'].astype('str')

# Apply to lyrics clean text
lyrics_df['clean_space'] = lyrics_df['clean_text'].apply(tokenize_lyrics)

In [10]:
# Add song length variable
lyrics_df['song_lengths'] = lyrics_df['clean_space'].apply(len)

In [11]:
lyrics_df['song_lengths'].head()

0    180
1    133
2    120
3     34
4     66
Name: song_lengths, dtype: int64

In [12]:
cher_length = lyrics_df['song_lengths'][lyrics_df['artist'] == 'cher']
robyn_length = lyrics_df['song_lengths'][lyrics_df['artist'] == 'robyn']

In [13]:
cher_length.shape

(316,)

In [14]:
robyn_length.shape

(104,)

In [None]:
# Initialize the plot
plt.figure(figsize=(12, 6))
plt.hist(cher_length)
# Add title and labels
plt.title('Song Lengths for Cher')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
# Show the plot
plt.show()

In [None]:
# Initialize the plot
plt.figure(figsize=(12, 6))
plt.hist(robyn_length)
# Add title and labels
plt.title('Song Lengths for Robyn')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
# Show the plot
plt.show()