## Preprocessing Part 1

Functions should take in a dataframe with two columns: ['target'] and ['text'] (in that order), and return the same. Everything in between is up to you.

### Importing and arranging data

In [1]:
pip install -U nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn import preprocessing
import string
import re
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mohammad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mohammad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mohammad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mohammad/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

#### Processed dataset

In [4]:
df1 = pd.read_csv("MBTI 500.csv")
df1

Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ
...,...,...
106062,stay frustrate world life want take long nap w...,INFP
106063,fizzle around time mention sure mistake thing ...,INFP
106064,schedule modify hey w intp strong wing underst...,INFP
106065,enfj since january busy schedule able spend li...,INFP


#### Unprocessed datasets

In [6]:
df2 = pd.read_csv("twitter_MBTI.csv")

In [7]:
df2.drop(df2.columns[0], axis=1, inplace=True)

In [8]:
df2 = df2.rename(columns={'text': 'text', 'label': 'type'})
df2

Unnamed: 0,text,type
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj
1,@Hispanthicckk Being you makes you look cute||...,intj
2,@Alshymi Les balles sont réelles et sont tirée...,intj
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj
...,...,...
7806,"@sobsjjun God,,pls take care 😕|||@sobsjjun Hir...",intp
7807,@Ignis_02 wow last time i got intp https://t.c...,intp
7808,@akupilled A 100%|||@akupilled That SOMEONE wi...,entp
7809,If you’re #INTJ this one is for you | What is ...,infj


In [9]:
df3 = pd.read_csv("mbti_1.csv")

In [10]:
df3 = df3[['posts', 'type']]
df3 = df3.rename(columns={'posts': 'text', 'type': 'type'})
df3

Unnamed: 0,text,type
0,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,INFJ
1,'I'm finding the lack of me in these posts ver...,ENTP
2,'Good one _____ https://www.youtube.com/wat...,INTP
3,"'Dear INTP, I enjoyed our conversation the o...",INTJ
4,'You're fired.|||That's another silly misconce...,ENTJ
...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP
8671,'So...if this thread already exists someplace ...,ENFP
8672,'So many questions when i do these things. I ...,INTP
8673,'I am very conflicted right now when it comes ...,INFP


#### Combing unprocessed datasets

In [21]:
combined_df = pd.concat([df2, df3], axis=0)
combined_df

Unnamed: 0,text,type
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj
1,@Hispanthicckk Being you makes you look cute||...,intj
2,@Alshymi Les balles sont réelles et sont tirée...,intj
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj
...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP
8671,'So...if this thread already exists someplace ...,ENFP
8672,'So many questions when i do these things. I ...,INTP
8673,'I am very conflicted right now when it comes ...,INFP


### Text Cleaning

#### Remove URLs

In [22]:
def remove_urls(data):
    data = re.sub(r'http\S+|www.\S+', '', data)
    
    return data

In [23]:
combined_df['clean_text'] = combined_df['text'].apply(remove_urls)
combined_df

Unnamed: 0,text,type,clean_text
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...
1,@Hispanthicckk Being you makes you look cute||...,intj,@Hispanthicckk Being you makes you look cute||...
2,@Alshymi Les balles sont réelles et sont tirée...,intj,@Alshymi Les balles sont réelles et sont tirée...
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj,"I'm like entp but idiotic|||Hey boy, do you wa..."
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj,@kaeshurr1 Give it to @ZargarShanif ... He has...
...,...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP,' just because I always think of cats as Fi do...
8671,'So...if this thread already exists someplace ...,ENFP,'So...if this thread already exists someplace ...
8672,'So many questions when i do these things. I ...,INTP,'So many questions when i do these things. I ...
8673,'I am very conflicted right now when it comes ...,INFP,'I am very conflicted right now when it comes ...


#### remove social media handles

In [24]:
def remove_handles(data):
    data = re.sub(r'@\w+', '', data)
    return data

In [25]:
combined_df['clean_text'] = combined_df['clean_text'].apply(remove_handles)
combined_df

Unnamed: 0,text,type,clean_text
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj,"The Pope is infallible, this is a catholic ..."
1,@Hispanthicckk Being you makes you look cute||...,intj,"Being you makes you look cute||| On, because ..."
2,@Alshymi Les balles sont réelles et sont tirée...,intj,Les balles sont réelles et sont tirées très r...
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj,"I'm like entp but idiotic|||Hey boy, do you wa..."
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj,Give it to ... He has Pica since childhood||...
...,...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP,' just because I always think of cats as Fi do...
8671,'So...if this thread already exists someplace ...,ENFP,'So...if this thread already exists someplace ...
8672,'So many questions when i do these things. I ...,INTP,'So many questions when i do these things. I ...
8673,'I am very conflicted right now when it comes ...,INFP,'I am very conflicted right now when it comes ...


#### remove punctuation

In [26]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [27]:
def punctuation(data):
    for punctuation in string.punctuation:
            data = data.replace(punctuation, '')
    return data

In [28]:
combined_df['clean_text'] = combined_df['clean_text'].apply(punctuation)
combined_df

Unnamed: 0,text,type,clean_text
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj,The Pope is infallible this is a catholic d...
1,@Hispanthicckk Being you makes you look cute||...,intj,Being you makes you look cute On because then...
2,@Alshymi Les balles sont réelles et sont tirée...,intj,Les balles sont réelles et sont tirées très r...
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj,Im like entp but idioticHey boy do you want to...
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj,Give it to He has Pica since childhood Say ...
...,...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP,just because I always think of cats as Fi dom...
8671,'So...if this thread already exists someplace ...,ENFP,Soif this thread already exists someplace else...
8672,'So many questions when i do these things. I ...,INTP,So many questions when i do these things I wo...
8673,'I am very conflicted right now when it comes ...,INFP,I am very conflicted right now when it comes t...


#### lowercase

In [29]:
def lower_case(data):
    return data.lower()

In [30]:
combined_df['clean_text'] = combined_df['clean_text'].apply(lower_case)
combined_df

Unnamed: 0,text,type,clean_text
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj,the pope is infallible this is a catholic d...
1,@Hispanthicckk Being you makes you look cute||...,intj,being you makes you look cute on because then...
2,@Alshymi Les balles sont réelles et sont tirée...,intj,les balles sont réelles et sont tirées très r...
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj,im like entp but idiotichey boy do you want to...
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj,give it to he has pica since childhood say ...
...,...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP,just because i always think of cats as fi dom...
8671,'So...if this thread already exists someplace ...,ENFP,soif this thread already exists someplace else...
8672,'So many questions when i do these things. I ...,INTP,so many questions when i do these things i wo...
8673,'I am very conflicted right now when it comes ...,INFP,i am very conflicted right now when it comes t...


#### remove special characters

In [31]:
def remove_special_characters(data):
    data = re.sub(r'[^A-Za-z0-9\s]+', '', data)
    return data

In [32]:
combined_df['clean_text'] = combined_df['clean_text'].apply(remove_special_characters)
combined_df

Unnamed: 0,text,type,clean_text
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj,the pope is infallible this is a catholic d...
1,@Hispanthicckk Being you makes you look cute||...,intj,being you makes you look cute on because then...
2,@Alshymi Les balles sont réelles et sont tirée...,intj,les balles sont relles et sont tires trs rapi...
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj,im like entp but idiotichey boy do you want to...
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj,give it to he has pica since childhood say ...
...,...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP,just because i always think of cats as fi dom...
8671,'So...if this thread already exists someplace ...,ENFP,soif this thread already exists someplace else...
8672,'So many questions when i do these things. I ...,INTP,so many questions when i do these things i wo...
8673,'I am very conflicted right now when it comes ...,INFP,i am very conflicted right now when it comes t...


#### remove white space

In [33]:
def white_space(data):
    return data.strip()

In [34]:
combined_df['clean_text'] = combined_df['clean_text'].apply(white_space)
combined_df

Unnamed: 0,text,type,clean_text
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj,the pope is infallible this is a catholic dogm...
1,@Hispanthicckk Being you makes you look cute||...,intj,being you makes you look cute on because then ...
2,@Alshymi Les balles sont réelles et sont tirée...,intj,les balles sont relles et sont tires trs rapid...
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj,im like entp but idiotichey boy do you want to...
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj,give it to he has pica since childhood say q...
...,...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP,just because i always think of cats as fi doms...
8671,'So...if this thread already exists someplace ...,ENFP,soif this thread already exists someplace else...
8672,'So many questions when i do these things. I ...,INTP,so many questions when i do these things i wo...
8673,'I am very conflicted right now when it comes ...,INFP,i am very conflicted right now when it comes t...


### Tokenizing

In [35]:
def tokenize(data):
    data = word_tokenize(data)
    return data

In [36]:
combined_df['clean_text'] = combined_df['clean_text'].apply(tokenize)
combined_df

Unnamed: 0,text,type,clean_text
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj,"[the, pope, is, infallible, this, is, a, catho..."
1,@Hispanthicckk Being you makes you look cute||...,intj,"[being, you, makes, you, look, cute, on, becau..."
2,@Alshymi Les balles sont réelles et sont tirée...,intj,"[les, balles, sont, relles, et, sont, tires, t..."
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj,"[im, like, entp, but, idiotichey, boy, do, you..."
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj,"[give, it, to, he, has, pica, since, childhood..."
...,...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP,"[just, because, i, always, think, of, cats, as..."
8671,'So...if this thread already exists someplace ...,ENFP,"[soif, this, thread, already, exists, someplac..."
8672,'So many questions when i do these things. I ...,INTP,"[so, many, questions, when, i, do, these, thin..."
8673,'I am very conflicted right now when it comes ...,INFP,"[i, am, very, conflicted, right, now, when, it..."


### Stopword Removal

In [37]:
stop_words = set(stopwords.words('english'))

In [38]:
def stopwords(data):
    data = [w for w in data if w not in stop_words] 
    return data

In [39]:
combined_df['clean_text'] = combined_df['clean_text'].apply(stopwords)
combined_df

Unnamed: 0,text,type,clean_text
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj,"[pope, infallible, catholic, dogma, doesnt, me..."
1,@Hispanthicckk Being you makes you look cute||...,intj,"[makes, look, cute, fun, peeling, bored, less,..."
2,@Alshymi Les balles sont réelles et sont tirée...,intj,"[les, balles, sont, relles, et, sont, tires, t..."
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj,"[im, like, entp, idiotichey, boy, want, watch,..."
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj,"[give, pica, since, childhood, say, qubool, ha..."
...,...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP,"[always, think, cats, fi, doms, reason, websit..."
8671,'So...if this thread already exists someplace ...,ENFP,"[soif, thread, already, exists, someplace, els..."
8672,'So many questions when i do these things. I ...,INTP,"[many, questions, things, would, take, purple,..."
8673,'I am very conflicted right now when it comes ...,INFP,"[conflicted, right, comes, wanting, children, ..."


### Text Lemmatization

In [40]:
def lemmatize(data):

    # Lemmatizing the verbs
    data = [WordNetLemmatizer().lemmatize(word, pos = "v") for word in data]

    # Lemmatizing the nouns
    data = [WordNetLemmatizer().lemmatize(word, pos = "n") for word in data]
    
    return ' '.join(data)

In [41]:
combined_df['clean_text'] = combined_df['clean_text'].apply(lemmatize)
combined_df

Unnamed: 0,text,type,clean_text
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj,pope infallible catholic dogma doesnt mean per...
1,@Hispanthicckk Being you makes you look cute||...,intj,make look cute fun peel bore le sweetie id suc...
2,@Alshymi Les balles sont réelles et sont tirée...,intj,le ball sont relles et sont tire trs rapidemen...
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj,im like entp idiotichey boy want watch twitch ...
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj,give pica since childhood say qubool hai dm ge...
...,...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,ISFP,always think cat fi doms reason website become...
8671,'So...if this thread already exists someplace ...,ENFP,soif thread already exist someplace else heck ...
8672,'So many questions when i do these things. I ...,INTP,many question thing would take purple pill pic...
8673,'I am very conflicted right now when it comes ...,INFP,conflict right come want child honestly matern...


### Testing different embedding models

#### tfidf-multinominalNB

In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import set_config; set_config("diagram")

In [43]:
X = combined_df['clean_text']
y = combined_df['type']

In [44]:
# Create Pipeline
pipeline_naive_bayes = make_pipeline(
    TfidfVectorizer(), 
    MultinomialNB()
)

# Set parameters to search
parameters = {
    "tfidfvectorizer__ngram_range": [(1,1), (1,2), (2,2)], 
    "tfidfvectorizer__max_df": [0.5, 0.75, 1.0],
    "multinomialnb__alpha": [0.01, 0.1, 1.0, 10.0]
}
    
# Perform grid search on pipeline
grid_search = GridSearchCV(pipeline_naive_bayes, parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [49]:
best_estimator = grid_search.best_estimator_

In [50]:
best_estimator

In [51]:
grid_search.best_score_

0.16699999999999998

#### tfidf-LDA

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [61]:
vectorizer = TfidfVectorizer()
vectorized_documents = vectorizer.fit_transform(X)
vectorized_documents = pd.DataFrame(
    vectorized_documents.toarray(), 
    columns = vectorizer.get_feature_names_out()
)

vectorized_documents

Unnamed: 0,00,000,0000,000000,000003,00001th,0004,000i,001,002,...,zyes,zzsdhsaaahdhdh,zzz,zzzamnnn,zzzare,zzzi,zzzz,zzzzno,zzzzzzvzzvvsbsits,zzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

In [74]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(vectorized_documents, y, test_size=0.2, random_state=42)


In [73]:
# Instantiate and fit the classifier on the training set
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = classifier.predict(X_test)

# Calculate the accuracy score
accuracy = balanced_accuracy_score(y_test, y_pred)
print("Accuracy score:", accuracy)

Accuracy score: 0.06793650793650793


In [68]:
from sklearn.metrics import f1_score

In [69]:
# Calculate the F1 score
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].