In [None]:
import pandas as pd
import numpy as np
from numpy import asarray
from numpy import savetxt
from numpy import loadtxt
import pickle as pkl
from scipy import sparse

# Data Visualization

import matplotlib.pyplot as plt
import wordcloud
from wordcloud import WordCloud, STOPWORDS

# Text Processing
import re
import itertools
import string
import collections
from collections import Counter
from sklearn.preprocessing import LabelEncoder
import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Machine Learning packages
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import sklearn.cluster as cluster
from sklearn.manifold import TSNE
import joblib

# Model training and evaluation
from sklearn.model_selection import train_test_split

#Models
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance

#Metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, multilabel_confusion_matrix, confusion_matrix
from sklearn.metrics import classification_report

# Ignore noise warning
import warnings
warnings.filterwarnings("ignore")

#extract lyrics
import lyricsgenius

In [None]:
# EDA: load the survey export (one row per respondent).
p2_survey = pd.read_csv('../python/mbtidata.csv')
#p2 = pd.read_csv('../python/p2_lyric1000.csv')
# Remove unwanted words (section labels such as 'Chorus' or 'Verse', 'Embed', ...)
# in order to get a valid model accuracy estimation for unseen data.
remove_words = '|'.join(['Chorus', 'Lyrics', 'Intro', 'Verse','Outro','Post-Chorus:','Pre-Chorus', 'Embed','Bridge'])
# regex=True is required: since pandas 2.0 the pattern is treated as a literal
# string by default, so the '|' alternation above would never match anything.
p2_survey["lyrics"] = p2_survey["lyrics"].str.replace(remove_words, '', regex=True)
p2_survey.head()

In [None]:
#p2_survey ['all_lyrics'] = new_df['lyrics']
#p2_survey['MBTI'] = p2['MBTI_Type'].str.split('-').str[0]
# Preview the first rows of the survey frame.
p2_survey.head()

In [None]:
# Column names, non-null counts and dtypes of the survey frame.
p2_survey.info()

In [None]:
# Size of the dataset: number of rows and columns of p2_survey.
nRow, nCol = p2_survey.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# dtype of every column in the survey frame.
p2_survey.dtypes

In [None]:
# Keep only the three columns used below: the MBTI label, the song list and the lyrics.
p2_survey = p2_survey[['MBTI', 'TopSongs','lyrics']]
p2_survey.head()

In [None]:

# All values are textual, hence they have to be converted to numerical form to train the ML model.
# describe(include=['object']) summarises the object-dtype columns (count, unique, top, freq).
p2_survey.describe(include=['object'])

In [None]:
# Unique values of the 'MBTI' personality column (np.unique returns them sorted).
types = np.unique(np.array(p2_survey['MBTI']))
types


In [None]:
# Non-null row count per MBTI type for every column, multiplied by 50.
# NOTE(review): the factor 50 is presumably the number of songs per respondent — confirm.
total = p2_survey.groupby(['MBTI']).count()*50
total

In [None]:
# Bar chart of the scaled per-type counts held in `total` (uses its 'lyrics' column).
plt.figure(figsize = (12,4))
plt.bar(np.array(total.index), height = total['lyrics'],)
plt.xlabel('Personality types', size = 14)
plt.ylabel('No. of lyrics count available', size = 14)
plt.title('Total lyrics count for each personality type')

In [None]:
import seaborn as sns
# Rows per MBTI type. value_counts() already sorts in descending order, which
# makes the chart easier to read.
cnt_srs = p2_survey['MBTI'].value_counts()
plt.figure(figsize=(12,4))
# x/y must be passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors as positional arguments and raises a TypeError.
sns.barplot(x=cnt_srs.index, y=cnt_srs.values, alpha=0.8)
plt.xlabel('Personality types', fontsize=12)
plt.ylabel('No. of lyrics availables', fontsize=12)
plt.show()

# The most common user personality type is
INFP (Introversion, Intuition, Feeling, Perceiving).

# For now, we can assume that users who are willing to fill in the survey tend to be more introverted, perceptive, and emotional.

In [None]:
#p2.tail()
# True if at least one row has a missing (NaN) value in the 'lyrics' column.
p2_survey['lyrics'].isnull().values.any()

In [None]:
#p2_survey[p2_survey['lyrics'].isna()] # print row with missing values

In [None]:
#p2['lyric']=p2['lyric'].fillna("") # change nan to " "

In [None]:
# Most frequent whitespace-separated tokens across all lyrics.
# Flatten every lyric into one long token list, then count the 40 most common tokens.
words = [token for lyric in p2_survey["lyrics"] for token in lyric.split()]
Counter(words).most_common(40)

In [None]:
# Plot the most common words with a word cloud built from the flattened `words` list.
wc = wordcloud.WordCloud(width=1200, height=500, 
                         collocations=False, background_color="white", 
                         colormap="tab20b").generate(" ".join(words))

# collocations=False is set so that the word cloud does not appear to contain duplicate words.
plt.figure(figsize=(25,10))
# Render the generated cloud as an image (bilinear interpolation) and hide the axes.
plt.imshow(wc, interpolation='bilinear')
_ = plt.axis("off")

In [None]:
# One word cloud per MBTI type, laid out on a 4-column grid.
mbti_types = p2_survey['MBTI'].unique()
n_cols = 4
n_rows = max(1, -(-len(mbti_types) // n_cols))  # ceiling division
# Create the grid once and draw into its axes. The earlier version created a
# single-column figure and then drew over it with plt.subplot(4, 4, ...), so
# axis("off") was applied to axes that never held an image.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, 2.5 * n_rows), squeeze=False)
panels = ax.ravel()
for panel, mbti_type in zip(panels, mbti_types):
    type_rows = p2_survey[p2_survey['MBTI'] == mbti_type]
    # Join the raw lyrics instead of Series.to_string(): to_string() prepends the
    # index labels and is expected to cut long strings at display.max_colwidth.
    type_text = " ".join(type_rows['lyrics'].dropna().astype(str))
    # Named type_cloud (not `wordcloud`) so the imported `wordcloud` module is
    # not shadowed for the other cells.
    type_cloud = WordCloud(max_words=1628,relative_scaling=1,normalize_plurals=False).generate(type_text)
    panel.imshow(type_cloud, interpolation='bilinear')
    panel.set_title(mbti_type)
    panel.axis("off")
# Hide the grid cells that have no personality type to show.
for panel in panels[len(mbti_types):]:
    panel.axis("off")
plt.show()

#model

In [None]:
# Modelling frame: only the label (MBTI) and the text (lyrics) columns.
data = p2_survey[['MBTI','lyrics']]
data.tail()

In [None]:
# Column names, non-null counts and dtypes of the modelling frame.
data.info()

In [None]:
# add columns for personality type indicators
def get_types(row):
    """Split one row's 4-letter MBTI code into four 0/1 indicator values.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) with an 'MBTI' entry whose first
        four characters are read.

    Returns
    -------
    pandas.Series with the keys 'IE', 'NS', 'TF' and 'JP'. A value is 1 when the
    letter at that position is I, N, T or J respectively, and 0 otherwise. A
    letter that is neither of the two expected ones prints a
    '<axis> not found' message and is stored as 0.
    """
    mbti_code = row['MBTI']

    # (output key, letter stored as 1, letter stored as 0, message for anything else)
    axis_table = (
        ('IE', 'I', 'E', 'I-E not found'),
        ('NS', 'N', 'S', 'N-S not found'),
        ('TF', 'T', 'F', 'T-F not found'),
        ('JP', 'J', 'P', 'J-P not found'),
    )

    indicators = {}
    for position, (key, one_letter, zero_letter, not_found_msg) in enumerate(axis_table):
        letter = mbti_code[position]
        if letter not in (one_letter, zero_letter):
            print(not_found_msg)
        indicators[key] = 1 if letter == one_letter else 0

    return pd.Series(indicators)

# Compute the IE / NS / TF / JP indicator columns row by row and attach them to the frame.
type_flags = data.apply(get_types, axis=1)
data = data.join(type_flags)
data.head(5)

In [None]:
# Number of lyrics in each class of every MBTI axis, printed as
# "<first-named pole> / <second-named pole>".
# get_types() stores the first-named pole (I, N, T, J) as 1 and the other pole as 0.
# The earlier version printed value_counts()[0] first, i.e. the count of the
# *second* pole under the first pole's label, and raised a KeyError whenever a
# class was absent from the data.
axis_labels = (
    ("Introversion (I) /  Extroversion (E):\t", 'IE'),
    ("Intuition (N) / Sensing (S):\t\t", 'NS'),
    ("Thinking (T) / Feeling (F):\t\t", 'TF'),
    ("Judging (J) / Perceiving (P):\t\t", 'JP'),
)
for axis_label, axis_column in axis_labels:
    first_pole_count = int((data[axis_column] == 1).sum())
    second_pole_count = int((data[axis_column] == 0).sum())
    print (axis_label, first_pole_count, " / ", second_pole_count)

In [None]:
# Plot the distribution of each personality type indicator as stacked bars.
N = 4
indicator_columns = ('IE', 'NS', 'TF', 'JP')
# get_types() stores I, N, T and J as 1 and E, S, F and P as 0. The bottom
# segment is therefore the count of 1s and the top segment the count of 0s.
# The earlier version had the two swapped relative to the legend, and its
# legend listed "F" where "J" was meant.
bottom = tuple(int((data[column] == 1).sum()) for column in indicator_columns)
top = tuple(int((data[column] == 0).sum()) for column in indicator_columns)

ind = np.arange(N)    # the x locations for the groups
# the width of the bars
width = 0.7           # or len(x) can also be used here

p1 = plt.bar(ind, bottom, width, label="I, N, T, J")
p2 = plt.bar(ind, top, width, bottom=bottom, label="E, S, F, P") 

plt.title('Distribution accoss types indicators')
plt.ylabel('Count')
plt.xticks(ind, ('I / E',  'N / S', 'T / F', 'J / P',))
plt.legend()

plt.show()

In [None]:
# Word count per lyric (number of \w+ runs). The minimum-length filter below is
# currently disabled, so no rows are removed.
#min_words = 15
#print("Before : Number of posts", len(data)) 
data["no. of. words"] = data["lyrics"].apply(lambda x: len(re.findall(r'\w+', x)))
#data = data[data["no. of. words"] >= min_words]
# Show 7 random rows to check the new column.
data.sample(7)

In [None]:
# Shared lemmatizer instance used by pre_process_text.
lemmatiser = WordNetLemmatizer()

# Remove the stop words for speed 
# (English stop-word list from NLTK; pre_process_text filters tokens against it.)
useless_words = stopwords.words("english")

In [None]:
# Show 3 random rows of the modelling frame.
data.sample(3)

In [None]:
# Binary encoding of the four MBTI letters, one bit per axis.
# Note: here I/N/F/J map to 0 and E/S/T/P map to 1, which is a different
# convention from the IE/NS/TF/JP indicator columns produced by get_types().

# letter -> bit
b_Pers = {
    'I': 0, 'E': 1,
    'N': 0, 'S': 1,
    'F': 0, 'T': 1,
    'J': 0, 'P': 1,
}
# position in the 4-letter code -> {bit -> letter}
b_Pers_list = [
    {0: 'I', 1: 'E'},
    {0: 'N', 1: 'S'},
    {0: 'F', 1: 'T'},
    {0: 'J', 1: 'P'},
]

def translate_personality(personality):
    """Return the list of bits for an MBTI string, e.g. 'INFP' -> [0, 0, 0, 1].

    Every character is looked up in b_Pers, so an unknown letter raises KeyError
    and an empty string yields an empty list.
    """
    bits = []
    for letter in personality:
        bits.append(b_Pers[letter])
    return bits

# Used to display the predicted personality.
def translate_back(personality):
    """Return the MBTI string for a sequence of bits, e.g. [0, 0, 0, 1] -> 'INFP'.

    The bit at position i is decoded with b_Pers_list[i].
    """
    return "".join(b_Pers_list[axis][bit] for axis, bit in enumerate(personality))

# Binarize the MBTI label of every row (one list of bits per row) and print the resulting array.
list_personality_bin = np.array([translate_personality(p) for p in data.MBTI])
print("Binarize MBTI list: \n%s" % list_personality_bin)

In [None]:
# Show the raw lyrics of the row with index label 1.
data.lyrics[1]

In [None]:
# Cleaning of data in the lyric
def pre_process_text(data, remove_stop_words=True):
    """Clean the lyrics and binarize the MBTI label of every row in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``lyrics`` and an ``MBTI`` column.
    remove_stop_words : bool, default True
        When True, tokens found in the module-level ``useless_words`` list are
        dropped before lemmatization.

    Returns
    -------
    (list_lyrics, list_personality) : tuple of numpy.ndarray
        ``list_lyrics`` holds one cleaned, lower-cased, lemmatized string per row;
        ``list_personality`` holds the matching ``translate_personality`` output.

    Notes
    -----
    Relies on the module-level ``lemmatiser``, ``useless_words`` and
    ``translate_personality``.
    """
    # A set gives constant-time membership tests; the module-level value is a list.
    stop_words = set(useless_words) if remove_stop_words else set()

    list_personality = []
    list_lyrics = []

    for lyrics, mbti in zip(data['lyrics'], data['MBTI']):
        # A missing lyric (NaN) is treated as empty text instead of crashing re.sub.
        if not isinstance(lyrics, str):
            lyrics = ""

        # Keep letters only and lower-case the text.
        temp = re.sub("[^a-zA-Z]", " ", lyrics).lower()

        # Drop words containing a letter repeated three or more times ("ooooh").
        # The previous pattern ended in [\s|\w]*, which greedily swallowed
        # everything from the repeated letter to the END of the lyric.
        temp = re.sub(r'\b[a-z]*([a-z])\1{2,}[a-z]*\b', ' ', temp)

        # split() (no argument) collapses runs of spaces, so no empty tokens
        # reach the lemmatizer.
        tokens = [lemmatiser.lemmatize(w) for w in temp.split() if w not in stop_words]

        list_lyrics.append(" ".join(tokens))
        # transform mbti to binary vector
        list_personality.append(translate_personality(mbti))

    return np.array(list_lyrics), np.array(list_personality)

# Clean every lyric and binarize every MBTI label of the modelling frame.
list_lyrics, list_personality  = pre_process_text(data, remove_stop_words=True)

# Show the row with index label 0 before preprocessing and the first result after it.
print("Example :")
print("\nLyrics before preprocessing:\n\n", data.lyrics[0])
print("\nLyrics after preprocessing:\n\n", list_lyrics[0])
print("\nMBTI before preprocessing:\n\n", data.MBTI[0])
print("\nMBTI after preprocessing:\n\n", list_personality[0])

In [None]:
# Shape of the binarized label array: one row per lyric, one column per MBTI axis.
nRow, nCol = list_personality.shape
print(f'No. of posts = {nRow}  and No. of Personalities = {nCol} ')

# Feature Engineering
# Tf–idf

In [None]:
# Vectorize the cleaned lyrics into a matrix of token counts for the model.
# max_df=0.7 ignores terms present in more than 70% of the lyrics, min_df=0.1
# ignores terms present in fewer than 10%, and at most 770 terms are kept.
cntizer = CountVectorizer(analyzer="word", 
                             max_features=770,  
                             max_df=0.7,
                             min_df=0.1) 
# Learn the vocabulary dictionary and return the term-document matrix
print("Using CountVectorizer :")
X_cnt = cntizer.fit_transform(list_lyrics)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(cntizer, "get_feature_names_out"):
    vocabulary_terms = cntizer.get_feature_names_out()
else:
    vocabulary_terms = cntizer.get_feature_names()

# enumerate yields (column index, term) pairs, giving an indexed list of features
feature_names = list(enumerate(vocabulary_terms))
print("10 feature names can be seen below")
print(feature_names[0:10])

# For the Standardization or Feature Scaling Stage :-
# Transform the count matrix to a normalized tf or tf-idf representation
tfizer = TfidfTransformer()

# Learn the idf vector (fit) and transform a count matrix to a tf-idf representation
print("\nUsing Tf-idf :")

print("Now the dataset size is as below")
X_tfidf =  tfizer.fit_transform(X_cnt).toarray()
print(X_tfidf.shape)

# Splitting into X and Y variable
Split the data into features and targets as follows:

X: lyrics in TF-IDF representation

Y: Personality type in Binarized MBTI form

In [None]:
# Human-readable name of each of the four MBTI axes, in the column order of list_personality.
personality_type = [ "IE: Introversion (I) / Extroversion (E)", "NS: Intuition (N) / Sensing (S)", 
                   "FT: Feeling (F) / Thinking (T)", "JP: Judging (J) / Perceiving (P)"  ]

# Print one axis name per line.
for l in range(len(personality_type)):
    print(personality_type[l])

In [None]:
# Show the tf-idf feature vector of the first lyric.
print("X: 1st lyrics in tf-idf representation\n%s" % X_tfidf[0])

In [None]:
# Show the first label both decoded back to its MBTI string and in binarized form.
print("For MBTI personality type : %s" % translate_back(list_personality[0,:]))
print("Y : Binarized MBTI 1st row: %s" % list_personality[0,:])

In [None]:
#Training & Evaluating Models
# Feature matrix: the lyrics in tf-idf representation (one row per lyric).
X = X_tfidf

In [None]:
import math

# setup parameters for xgboost
# (the trailing comments appear to record earlier settings — confirm before relying on them)
param = {}

param['n_estimators'] = 200 #100
param['max_depth'] = 2 #3
param['nthread'] = 8 #1
param['learning_rate'] = 0.2 #0.1

# The same train/test split settings are used for all four classifiers.
seed = 7
test_size = 0.33

# Individually train and evaluate one binary classifier per MBTI axis
for l in range(len(personality_type)):
    Y = list_personality[:,l]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

    # fit model on training data
    model = XGBClassifier(**param)
    model.fit(X_train, y_train)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # EVALUATION METRICS
    accuracy = accuracy_score(y_test, predictions)
    # On 0/1 labels the mean absolute error equals the fraction of wrong predictions.
    error = mean_absolute_error(y_test, predictions)
    MSE = mean_squared_error(y_test, predictions)
    RMSE = math.sqrt(MSE)
    r2 = r2_score(y_test, predictions)

    # display
    # The report holds precision, recall and F1 per class, so it is no longer
    # labelled "Recall:". MAE, RMSE and r-squared are plain values (not scaled
    # by 100), so they are printed without a percent sign.
    print("%s classification report:\n%s" % (personality_type[l], classification_report(y_test, predictions)))
    print("%s Accuracy: %.2f%%" % (personality_type[l], accuracy * 100.0))
    print("%s Mean absolute error : %.2f" % (personality_type[l], error))
    print("%s Root Mean Square Error: %.2f" % (personality_type[l], RMSE))
    print("%s r-squared : %.2f" % (personality_type[l], r2 ))


#prediction with unclean lyrics

In [None]:
#web scraping part -lyrics 
import os

# Log into Genius API with the Authorization Code.
# SECURITY: the access token used to be hardcoded in this cell. It is now read
# from the environment; the previously committed token should be revoked.
client_access_token = os.environ.get("GENIUS_ACCESS_TOKEN")
if not client_access_token:
    raise RuntimeError("Set the GENIUS_ACCESS_TOKEN environment variable to a Genius API access token")
LyricsGenius = lyricsgenius.Genius(client_access_token)

# The package got some timeout issue so these two lines are needed. If you don't then there will be error when you scrape
# Source: https://github.com/johnwmillr/LyricsGenius/issues/121
LyricsGenius.timeout = 15  #timeout
# NOTE(review): recent lyricsgenius releases appear to name this attribute `sleep_time` — confirm.
LyricsGenius.sleep = 5

# Create an array to store each song's lyric
lyrics_input = []

inputt = ['shy martin - are you happy', 'summer walker - body'] # NEED TO EDIT THIS PART FOR STREAMLIT
# Traverse the requested songs, get each song's lyrics from its title, and do some preprocessing
for i in inputt:
    # search for song in genius.com
    searched_song = LyricsGenius.search_song(i)
    
    # if we can't find a song's lyrics then skip and append empty string
    # (the list is `lyrics_input`; the earlier `lyrics_arr` name was undefined
    # and raised a NameError for every song that was not found)
    if searched_song is None:
        lyrics_input.append("")
        continue
        
    # get the lyric
    lyric = searched_song.lyrics
    
    # replace the lyrics newline with ". "
    lyric = lyric.replace("\n", ". ")
    
    # append the processed lyric to the array
    lyrics_input.append(lyric)
    
lyrics_input

In [None]:
# Merge all fetched lyrics into a single text. The assignment was left empty
# (`li = `), which is a syntax error; the expression is restored from the
# commented-out line that preceded it.
li = ' '.join([str(x) for x in lyrics_input])

# pre_process_text expects a frame with 'MBTI' and 'lyrics' columns. The MBTI
# type is unknown here, so an empty string is used and its output is discarded.
md = pd.DataFrame(data={'MBTI': [''], 'lyrics': [li]})
li, dummy  = pre_process_text(md, remove_stop_words=True)
# Reuse the already fitted vectorizer and tf-idf transformer (transform only, no refit).
my_X_cnt = cntizer.transform(li)
my_X_tfidf =  tfizer.transform(my_X_cnt).toarray()

In [None]:
# setup parameters for xgboost
param = {}
param['n_estimators'] = 200
param['max_depth'] = 2
param['nthread'] = 8
param['learning_rate'] = 0.2

# Split settings, now actually passed to train_test_split (they used to be
# defined here but duplicated as literals in the call).
seed = 7
test_size = 0.33

#XGBoost model for MBTI dataset
result = []
# Individually train one classifier per MBTI axis and predict that axis for the user's lyrics
for l in range(len(personality_type)):
    Y = list_personality[:,l]

    # split data into train and test sets (only the training part is used here)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

    # fit model on training data
    model = XGBClassifier(**param)
    model.fit(X_train, y_train)
    # Reported after fitting, so the message is true when it is printed.
    print("%s classifier trained" % (personality_type[l]))
    
    # make predictions for my data; my_X_tfidf has a single row, so keep its one prediction
    y_pred = model.predict(my_X_tfidf)
    result.append(y_pred[0])

In [None]:
# Decode the four predicted bits back into a 4-letter MBTI string and print it.
print("The result is: ", translate_back(result)) 