In [None]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [None]:
from fastbook import *
from fastai.text.all import *

In [None]:
import pandas as pd

## Practice Creating the Language Model DataLoader

In [None]:
# Read the lyrics and artist tables. The lyrics table's `ALink` column is
# renamed to `Link` so that both frames share the key used for the merge.
lyrics_data = pd.read_csv('sample_data/lyrics-data.csv').rename(columns={"ALink": "Link"})
artist_data = pd.read_csv('sample_data/artists-data.csv')
print(artist_data.columns, lyrics_data.columns)

# Inner join: keep only lyrics whose `Link` has a matching artist row.
merged_dfs = lyrics_data.merge(artist_data, how='inner', on='Link')

# Keep English songs and the columns used downstream. Rows are deduplicated
# on (Artist, SName) rather than on SName alone, so that songs by different
# artists that happen to share a title are all retained.
eng_artists = (
    merged_dfs
    .loc[merged_dfs['Idiom'] == 'ENGLISH', ['Artist', 'SName', 'Lyric', 'Genre']]
    .drop_duplicates(subset=['Artist', 'SName'])
    .reset_index(drop=True)
)

eng_artists.head()

In [None]:
# Name of the artist whose lyrics the language model is trained on.
artist_name = ""

# Keep only that artist's rows and renumber them from zero.
artist_df = eng_artists.loc[eng_artists['Artist'].eq(artist_name)].reset_index(drop=True)
artist_df.head()

## Create the DataLoaders using fastai

In [None]:
# Language-model DataBlock over the lyrics DataFrame.
# `TextBlock.from_df` expects the name of the column that holds the raw text
# ('Lyric'), not the name of the DataFrame variable. fastai writes the
# tokenized result to a 'text' column by default, which `get_x` then reads
# row by row.
lang_model_block = DataBlock(
    blocks=TextBlock.from_df('Lyric', seq_len=72, is_lm=True),
    get_x=ColReader('text')
)
lang_model_block

In [None]:
# Build the DataLoaders from the selected artist's rows, then preview two
# samples from a batch.
dls_lm = lang_model_block.dataloaders(
    artist_df,
    bs=128,
    seq_len=80,
)
dls_lm.show_batch(max_n=2)

# Train a model using transfer learning

In [None]:
# Stage 1: build the AWD_LSTM language-model learner and run 5 one-cycle
# epochs with a peak learning rate of 0.004.
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    drop_mult=0.3,
    metrics=accuracy,
)
learn.fit_one_cycle(5, lr_max=0.004)

# Stage 2: unfreeze the model and run 20 more one-cycle epochs with a
# learning-rate range of 3e-6 to 3e-4.
learn.unfreeze()
learn.fit_one_cycle(20, lr_max=slice(3e-6, 3e-4))

# Predictions with Model

In [None]:
def get_most_complex(start_text, preds):
    """Return `start_text` followed by the most lexically varied prediction.

    The candidate in `preds` with the largest number of distinct
    whitespace-separated words is selected (the first one wins a tie). Its
    leading words, one per word in `start_text`, are skipped so the prompt is
    not repeated, and the remainder is cut after its last period.

    Args:
        start_text: Prompt that every prediction is expected to begin with.
        preds: Sequence of generated strings.

    Returns:
        `start_text` plus the trimmed continuation. If `preds` is empty, or
        the chosen prediction has no more words than the prompt, `start_text`
        is returned unchanged. If the continuation contains no period it is
        kept whole instead of being discarded.
    """
    if not preds:
        return start_text

    # max() returns the first of several equally varied candidates.
    best = max(preds, key=lambda pred: len(set(pred.split())))

    # Locate the space that follows the n-th word, where n is the number of
    # words in the prompt. Words are counted rather than characters, so the
    # prediction's prefix need not match `start_text` character for character.
    n_prompt_words = len(start_text.split())
    space_at = -1
    for _ in range(n_prompt_words):
        space_at = best.find(' ', space_at + 1)
        if space_at == -1:
            break

    if n_prompt_words == 0:
        # Empty prompt: the whole prediction is the continuation.
        start = 0
    elif space_at == -1:
        # The prediction is not longer than the prompt: nothing to append.
        start = len(best)
    else:
        # Keep the separating space so the result reads "prompt continuation".
        start = space_at
    continuation = best[start:]

    # End on a complete sentence when possible; text without any period
    # (common for lyrics) is kept whole.
    last_period = continuation.rfind('.')
    if last_period != -1:
        continuation = continuation[:last_period + 1]

    return start_text + continuation


In [None]:
# Prompt and sampling settings for text generation.
start_text = ""
words = 60
sentences = 5

# Sample `sentences` candidate continuations of `words` words each, then
# keep the one with the richest vocabulary.
preds = [
    learn.predict(start_text, n_words=words, temperature=0.75)
    for _ in range(sentences)
]

get_most_complex(start_text, preds)