In [30]:
import os
import pandas as pd
pd.set_option('display.max_rows', None)
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer


In [31]:
with open('Lyrics_twentyonepilots.json') as file:
    #covert json to python obj
    data = json.load(file)

    #list of dict
    main_list = []

    #store each song on a temp dict and adding it to the main list
    for row in data['songs']:

        temp_dict = {}
        try:
            temp_dict['title'] = row['title']
        except:
            temp_dict['title'] = None
        
        try:
            temp_dict['release_date'] = row['release_date']
        except:
            temp_dict['release_date'] = None
        try:
            temp_dict['album'] = row['album']['name']
        except:
            temp_dict['album'] = None

        try:    
            temp_dict['lyrics'] = row['lyrics']
        except:
            temp_dict['lyrics'] = row['lyrics']

        main_list.append(temp_dict)

In [32]:
#creating the df
df = pd.DataFrame(main_list)

In [33]:
# droping stuff that aren't true albuns or don't have values for albuns
df.dropna(inplace=True)
df.drop(df.index[df['album'].str.contains('LC')], inplace = True)
df.drop(df.index[df['album'].str.contains('Version')], inplace = True)
df.drop(df.index[df['album'].str.contains('Live')], inplace = True)
df.drop(df.index[df['album'].str.contains('EP')], inplace = True)
df.drop(df.index[df['album'].str.contains('Sessions')], inplace = True)
df.drop(df.index[df['album'].str.contains('Edition')], inplace = True)
df.drop(df.index[df['album'].str.contains('MTV')], inplace = True)
df.drop(df.index[df['album'].str.contains('commentary')], inplace = True)
df.drop(df.index[df['album'].str.contains('Presents')], inplace = True)




In [34]:
#gettin only the year for release date
release_date = df.release_date.str.extract(r'(\d{4})')
df.release_date = release_date

In [35]:
#seeing how many songs where launched each year
songs_by_year = df.groupby([df.album, df.release_date]).size().reset_index(name='Songs')
songs_by_year = songs_by_year.sort_values(by='Songs', ascending=False)

In [36]:
#creating my model
vect = CountVectorizer()

In [37]:
#matrix
matrixx = vect.fit_transform(df.lyrics)

In [38]:
#creating a new df for the matrix
musicdf = pd.DataFrame(matrixx.toarray(), index = df.title, columns = vect.get_feature_names())
pd.set_option('display.max_columns', None)

#dropping all columns that contains "embed" within it
musicdf = musicdf[musicdf.columns.drop(list(musicdf.filter(regex='embed')))]



In [39]:
#sumcount

sum_count = matrixx.sum(axis=0)

In [40]:
#vocabulary dict
vocab_dict = vect.vocabulary_

In [41]:
#tuples
toup_vocab = vocab_dict.items()

In [42]:
#master list containing every word and its index
master_list = [(word,sum_count[0, index]) for word, index in toup_vocab]

In [43]:
master_list.sort(key=lambda x: x[1], reverse = True)

In [44]:
#tfid vectorization for the new model (that will help us to get the importance % of each word for the songs)
tfidf_vect = TfidfVectorizer(stop_words = 'english', max_df=4,ngram_range=(1,1))

In [45]:
#matrix/sum/tuples/master/sortin
matrixy = tfidf_vect.fit_transform(df.lyrics)
sum_count1 = matrixy.sum(axis=0)
tupley = tfidf_vect.vocabulary_.items()
#in the master list it'll sum up the importances, so numbers can get higher than 1
master_list1 = [(word,sum_count1[0, index]) for word, index in tupley]
master_list1.sort(key=lambda x: x[1], reverse=True)

In [46]:
master_list1

[('la', 3.381103322690008),
 ('mm', 1.5066597383466922),
 ('care', 1.1504154568364011),
 ('woah', 1.1088108836324027),
 ('fast', 1.1075117870758853),
 ('hey', 1.101555487415411),
 ('follow', 1.0988607899596738),
 ('hello', 1.0874840249076876),
 ('fake', 1.082215814391542),
 ('haven', 1.0100215700777713),
 ('jumpsuit', 1.0056532295679799),
 ('eh', 0.9764449023803125),
 ('na', 0.974198062147839),
 ('oa', 0.950148993669749),
 ('tonight', 0.9307243760693815),
 ('city', 0.9235799755836377),
 ('bah', 0.9138591118075192),
 ('gold', 0.8908650737319326),
 ('turns', 0.8838187578948782),
 ('street', 0.8795445508840574),
 ('reign', 0.8588765222032317),
 ('sit', 0.856794446628875),
 ('bou', 0.8457340213189966),
 ('heavydirtysoul', 0.8350398528920754),
 ('twisted', 0.7749833062718128),
 ('cheetah', 0.771447843423576),
 ('broken', 0.7702713094249969),
 ('morph', 0.7543894467907298),
 ('tear', 0.7446089411756652),
 ('guns', 0.7434176320093422),
 ('ones', 0.731229588467001),
 ('bounce', 0.7248960680450

In [64]:
#creating my trigram model
from nltk.util import ngrams
from collections import Counter
import random

all_text = ' '.join(musicdf.columns.values)
all_text = all_text.lower()

# Get all trigrams
trigram_model = list(ngrams(all_text.split(), 3))

# Generate trigram frequency
trigram_frequency = Counter(trigram_model)

# Generate a function to get the next word in the sequence
def next_word(trigram):
    possible_words = []
    for tup in trigram_frequency.items():
        if tup[0][0] == trigram[1] and tup[0][1] == trigram[2]:
            possible_words.append(tup)
    if not possible_words:
        return None
    next_word_freq = max(possible_words, key=lambda x: x[1])[0][2]
    return next_word_freq

# Generate lyrics
num_lines = 20
num_words_per_line = 4

# Choose a random starting trigram
start_trigram = random.choice(trigram_model)

for i in range(num_lines):
    line = [start_trigram[1], start_trigram[2]]
    for j in range(num_words_per_line):
        next_word_freq = next_word(start_trigram)
        if not next_word_freq:
            break
        line.append(next_word_freq)
        start_trigram = (start_trigram[1], start_trigram[2], next_word_freq)
    print(' '.join(line))
    start_trigram = random.choice(trigram_model)


wall walls wanna want wanted wants
loyalty luckily lunch lungs lyin lying
my myself na nah name named
having havе hazmat he head headache
flavor flawless flesh flies flightless flip
melt memories men mending mental mention
kitchen knees knew knockin know known
remember remind reminding remove renaissance rent
highest highs highway hills him hip
worse worst worth worthless worthwhile would
those though thought thoughts threaten three
chemical chest child chill chills chin
single singles sink sinning sippin sit
passenger passing past paste path patiently
rabbit race racehorse radio rain raindrop
teach tear tearing tears technology teeth
fantasies fantasize far farther fast faster
taught taunts taxes taxi teach tear
soaked socks soft softest soldiers solemn
sills silver simple simply since sincere
