## Preprocessing Part 1

Functions should take in a dataframe with two columns: ['target'] and ['text'] (in that order), and return the same. Everything in between is up to you.

### Importing and arranging data

In [1]:
pip install -U nltk

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
from sklearn import preprocessing
import string
import re
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
import nltk

# NLTK data used by the cells below: the stop-word list, the tokenizer models
# and the WordNet data for lemmatization.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the 'punkt_tab' resource in word_tokenize
# instead of the pickled 'punkt' models; fetch both so tokenizing works on
# whichever version `pip install -U nltk` installed.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

#### Processed dataset

In [None]:
# Load the already-processed dataset; the path is relative to the working directory.
df1 = pd.read_csv("MBTI 500.csv")
df1

#### Unprocessed datasets

In [None]:
# Load the first unprocessed dataset; the path is relative to the working directory.
df2 = pd.read_csv("twitter_MBTI.csv")

In [None]:
# Drop whatever column is currently first, mutating df2 in place.
# NOTE(review): not idempotent — every re-run of this cell removes one more
# column, so reload df2 before executing it again.
df2.drop(df2.columns[0], axis=1, inplace=True)

In [None]:
# Rename 'text' -> 'posts' and 'label' -> 'type' so df2 uses the same column
# names that are selected from df3 further down.
df2 = df2.rename(columns={'text': 'posts', 'label': 'type'})
df2

In [None]:
# Load the second unprocessed dataset; the path is relative to the working directory.
df3 = pd.read_csv("mbti_1.csv")

In [None]:
# Keep only the 'posts' and 'type' columns, in that order.
df3 = df3[['posts', 'type']]
df3

#### Combining unprocessed datasets

In [None]:
# Stack the two unprocessed datasets row-wise. Both frames come from read_csv
# with a default 0..n-1 index, so ignore_index=True is needed to give the
# result unique row labels instead of two overlapping ranges.
combined_df = pd.concat([df2, df3], axis=0, ignore_index=True)
combined_df

### Text Cleaning

#### Remove URLs

In [None]:
def remove_urls(data):
    """Strip URLs from a piece of text.

    Deletes every run of non-whitespace characters that starts with
    ``http`` or with a literal ``www.``.

    Args:
        data: The text to clean.

    Returns:
        ``data`` with the matched substrings removed. Whitespace around a
        removed URL is left in place.
    """
    # The dot after "www" is escaped so only a literal "www." starts a match;
    # an unescaped "." matches any character and would also delete ordinary
    # text such as "awwwww".
    return re.sub(r'http\S+|www\.\S+', '', data)

In [None]:
# Create the 'clean_posts' working column from the raw 'posts' text with URLs
# removed; the cleaning cells that follow overwrite this column step by step.
combined_df['clean_posts'] = combined_df['posts'].apply(remove_urls)
combined_df

#### remove social media handles

In [None]:
def remove_handles(data):
    """Delete every ``@`` that is followed by word characters, together with those characters.

    Args:
        data: The text to clean.

    Returns:
        ``data`` with each ``@handle`` occurrence removed.
    """
    handle_pattern = r'@\w+'
    cleaned = re.sub(handle_pattern, '', data)
    return cleaned

In [None]:
# Overwrite 'clean_posts' with the handle-free version of its current contents.
combined_df['clean_posts'] = combined_df['clean_posts'].apply(remove_handles)
combined_df

#### remove punctuation

In [None]:
# Display the set of characters that the punctuation() function below removes.
string.punctuation

In [None]:
def punctuation(data):
    """Remove every ASCII punctuation character from a string.

    Args:
        data: The text to clean.

    Returns:
        ``data`` with all characters listed in ``string.punctuation``
        deleted. Every other character, including non-ASCII punctuation,
        is kept.
    """
    # A single translate() pass deletes the whole punctuation set at once,
    # instead of rescanning the text once per punctuation character.
    return data.translate(str.maketrans('', '', string.punctuation))

In [None]:
# Overwrite 'clean_posts' with its current contents stripped of ASCII punctuation.
combined_df['clean_posts'] = combined_df['clean_posts'].apply(punctuation)
combined_df

#### lowercase

In [None]:
def lower_case(data):
    """Return a lower-cased copy of ``data``."""
    lowered = data.lower()
    return lowered

In [None]:
# Overwrite 'clean_posts' with the lower-cased version of its current contents.
combined_df['clean_posts'] = combined_df['clean_posts'].apply(lower_case)
combined_df

#### remove special characters

In [None]:
def remove_special_characters(data):
    """Keep only ASCII letters, digits and whitespace.

    Args:
        data: The text to clean.

    Returns:
        ``data`` with every other character deleted.
    """
    disallowed = re.compile(r'[^A-Za-z0-9\s]+')
    return disallowed.sub('', data)

In [None]:
# Overwrite 'clean_posts', keeping only ASCII letters, digits and whitespace.
combined_df['clean_posts'] = combined_df['clean_posts'].apply(remove_special_characters)
combined_df

#### remove white space

In [None]:
def white_space(data):
    """Trim leading and trailing whitespace from ``data``; interior whitespace is untouched."""
    trimmed = data.strip()
    return trimmed

In [None]:
# Overwrite 'clean_posts' with leading/trailing whitespace trimmed from each entry.
combined_df['clean_posts'] = combined_df['clean_posts'].apply(white_space)
combined_df

### Tokenizing

In [None]:
def tokenize(data):
    """Tokenize ``data`` with NLTK's ``word_tokenize`` and return its result."""
    return word_tokenize(data)

In [None]:
# Overwrite 'clean_posts' with the tokenized form of each entry; from here on
# the column holds the output of tokenize() rather than plain strings.
combined_df['clean_posts'] = combined_df['clean_posts'].apply(tokenize)
combined_df

### Stopword Removal

In [None]:
# Reach the corpus through the nltk package instead of the bare name
# `stopwords`: the next cell defines a function called `stopwords`, which
# rebinds that name, so `stopwords.words(...)` would raise AttributeError if
# this cell were executed again afterwards.
stop_words = set(nltk.corpus.stopwords.words('english'))

In [None]:
def stopwords(data):
    """Filter out stop words from an iterable of tokens.

    Note: defining this function rebinds the notebook-level name
    ``stopwords``, which was previously the object imported from
    ``nltk.corpus``.

    Args:
        data: Iterable of tokens.

    Returns:
        A list of the tokens that are not members of the notebook-level
        ``stop_words`` set, in their original order.
    """
    kept = []
    for token in data:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [None]:
# Overwrite 'clean_posts' with each token list filtered through stopwords().
combined_df['clean_posts'] = combined_df['clean_posts'].apply(stopwords)
combined_df

### Text Lemmatization

In [None]:
def lemmatize(data):
    """Lemmatize a sequence of tokens and join them into one string.

    Each token is lemmatized as a verb, and the result of that step is then
    lemmatized as a noun.

    Args:
        data: Iterable of word tokens.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # One lemmatizer is shared by all tokens rather than constructing a new
    # WordNetLemmatizer for every word in every pass.
    lemmatizer = WordNetLemmatizer()

    # Verb lemmatization runs first; its output is fed to the noun pass.
    lemmas = [
        lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos="v"), pos="n")
        for word in data
    ]

    return ' '.join(lemmas)

In [None]:
# Overwrite 'clean_posts' with the output of lemmatize(), which joins the
# lemmatized tokens back into a single space-separated string per row.
combined_df['clean_posts'] = combined_df['clean_posts'].apply(lemmatize)
combined_df

### Testing different embedding models

#### TF-IDF + MultinomialNB

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import set_config

# `display` has to be passed by keyword: the first positional parameter of
# set_config is `assume_finite`, so set_config("diagram") would set that
# option to a truthy string instead of selecting the diagram display.
set_config(display="diagram")

In [None]:
# Features: the cleaned text column; target: the 'type' column.
X = combined_df['clean_posts']
y = combined_df['type']