In [30]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import pickle
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [9]:
# Load the dataset; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved index — consider index_col=0 if that column is not needed.
df = pd.read_csv('../data/linkfree_combined.csv')

In [10]:
df.head(2)

Unnamed: 0,type,linkfree_combined
0,INFJ,enfp intj moment sportscenter top ten play pra...
1,ENTP,im finding lack post alarming|||sex boring pos...


In [11]:
# Drop rows with any missing value. The text column is later passed through
# tokenize_for_countvec, which calls .replace on every value and would fail on NaN.
df = df.dropna()

In [12]:
def tokenize_for_countvec(post):
    """Return `post` with every '|||' post separator turned into a single space."""
    segments = post.split('|||')
    return ' '.join(segments)

In [13]:
def two_labels(type_):
    """Map a four-letter MBTI type to a binary label.

    Types starting with 'I' map to 0 and types starting with 'E' map to 1.
    Only the 16 upper-case type codes are recognised; anything else raises
    KeyError.
    """
    introverted = ('INTJ', 'INTP', 'INFJ', 'INFP', 'ISTJ', 'ISFJ', 'ISTP', 'ISFP')
    extroverted = ('ENTJ', 'ENTP', 'ENFJ', 'ENFP', 'ESTJ', 'ESFJ', 'ESTP', 'ESFP')

    label_by_type = dict.fromkeys(introverted, 0)
    label_by_type.update(dict.fromkeys(extroverted, 1))
    return label_by_type[type_]

In [14]:
# Features: each row's combined posts with the '|||' separators flattened to spaces.
X = df.linkfree_combined.apply(tokenize_for_countvec)
# Target: binary label from two_labels (0 for I*** types, 1 for E*** types).
y = df.type.apply(two_labels)

# Hold out 30% of the rows for testing; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced (see the counts below).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=822)

In [15]:
y.value_counts()

0    6675
1    1999
Name: type, dtype: int64

In [16]:
y_train.value_counts()

0    4677
1    1394
Name: type, dtype: int64

In [17]:
# Bag-of-words counts. min_df=2 ignores terms that occur in fewer than two training documents.
# The vocabulary is learned from the training split only; the test split is only transformed,
# so no test-set vocabulary leaks into the features.
vectorizer = CountVectorizer(min_df=2)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [18]:
X_train.shape, X_train_vect.shape

((6071,), (6071, 40948))

In [19]:
# Convert the sparse count matrices to dense NumPy arrays.
# NOTE(review): the training matrix is 6071 x 40948 int64 (see the shape and dtype outputs),
# i.e. roughly 2 GB once dense. Later cells pass sparse vectorizer output straight to
# log_model.predict, so the model appears to accept sparse input — consider skipping this step.
X_train_vect_arr = X_train_vect.toarray()
X_test_vect_arr = X_test_vect.toarray()
X_train_vect_arr

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
# class_weight='balanced' reweights the classes to offset the imbalance (6675 zeros vs 1999 ones in y).
# max_iter=1500 raises the solver's iteration cap — presumably to avoid convergence
# warnings with ~41k features; confirm it is still needed.
log_model = LogisticRegression(max_iter=1500, class_weight='balanced')
log_model.fit(X_train_vect_arr, y_train)

LogisticRegression(class_weight='balanced', max_iter=1500)

In [21]:
# Persist the fitted classifier. The context manager guarantees the file handle is
# flushed and closed even if pickling raises (a bare open(...) left it dangling).
# NOTE(review): only the model is saved here; the fitted `vectorizer` is needed to
# build inputs for it and is not persisted in this cell.
with open('models/IE_basic_log_model', 'wb') as model_file:
    pickle.dump(log_model, model_file)

In [22]:
# Mean accuracy on the held-out 30%.
# For context: from the value_counts outputs above, the test split has 2603 rows,
# 1998 of them class 0, so always predicting 0 would already score about 0.768.
results = log_model.score(X_test_vect_arr, y_test)
print(results)

0.8232808298117557


In [23]:
random_test = ['hello blah blah blah', 'i love noodles', 'noodles are me', 'i think the world is a beautiful place', 'sometimes i like to go shopping', 'the things i do for love', 'man i love that movie', 'sometimes the world is a beautiful place but somtimes i hate it so much', 'wow that waas such a good movie, deadpool was amazing. highly recommended']

In [24]:
Austin_test = ['jeez she def knows how to sound as bitchy as possible','when can you finally never talk to her ever again?','sounds very natural to me','what a cunt','looking forward to that day for you','finally managed to get rid of my slicey driver','can you come over real quick',"need help grabbing mel's bag",'she gonna meet us there','cool cool','hey! yeah i let you know a week or so ago that i was gonna miss today','just a small family trip. be back in class monday',"i'll watch the lecture vid and work on the challanges myself", 'Feel like i would rather work at least 4 days a week just for more monies', 'I see i see. Man going to school while also working gotta be rough', 'Oh you wanna ktv at my place? That works for me', 'Hows you get out so quick?', 'Ight that makes sense. Lets do it at my place thens', 'Bowling probably soon cause bowling is over at 5', 'Haha ight i would love to see your car bro', 'I’ll text you after bowling then. Cina coming to eat pho or naw? Bouta go ask my momma', 'To see if we gots enough', 'After bowling itll be a little bit of costco then home', 'Yeah she said come thru haha', 'On the highway from rivertown costco. On M6 exit 5 just passed', 'Shes down for wtv. She said whatever you recommend', 'Ohhhh shiet yeah im so down', 'Hell yeah dude! Its a date', 'Ohhh yeah lets do that', 'That sounds good too', 'We making a res? Curtis joining too', 'I think 6 should be fine. We just ate lunch haha', 'Oh ight haha im down for red robins beo', 'Hahaha oh yeah baby thats the america i know! Im hype for red robins haha', 'Just finished packing up mostly. Now gonna head to woodland to walk around a bit', 'Thats the plan', 'Ight haha so you out from work already? Lol shiet thats chill', 'Yessir', 'Thanks bro 😭😭😭 dont wanna leave haha', 'Next time we moving to gr ;) buy me a house', 'Ill be your butler']

In [25]:
Shirley_test = ['🥨🥨🥨', 'Prezel heart', 'I can’t spell prezel lol', 'Didnt spend my childhood time with Disney channel', 'I researched her', 'And still no idea who she is', 'I hope you guys stay safe. I read the email about the two new cases on campus😭😭😭', 'wowwwww', 'Its like the Charlie’s chocolate factory invitation', 'So cool', 'ohhhhhh', 'Hahahahahahahahahahahahahahahahaha', 'Wait, I thought Davis is the UWC founder Davis', 'Hes related to New Balance?', 'wait', 'I always think the Davis library is donated by UWC davis', 'New Balance Davis is our alum??', 'Oh wow', 'But I’m not going to boycott Davis Library', 'Lol', 'hahahahahahahaha', 'Can you explain the story for me please', 'We are designing a product that helps analyze people’s personality based on their chatting history', 'And advise how to chat to better the relationship', 'hahahahahahaha', 'And after we have the product, you guys should try', 'lol', 'Just for fun', 'And it’s English', 'To see what mbti of the person you are chatting with', 'You upload the chat history to our prodcut', 'also', 'If you provide his or her phone number, we could scrape their social media account to tell the MBTI as well', 'We only got two weeks', 'I think we are going to have the basic model and then fine tuning it', 'If we got extra time', 'We are gonna fix this problem', 'We got four weeks left', 'But we havnet finished the lecturing', 'Im not enrolled in the fall, so I’m not encluded in the Jterm', 'And we met a networking event', 'We didnt talk too much', 'But im asking if he wanna out for a coffee chat?', 'Like im using this coffee chat as an excuse', 'So we can meet offline', 'I don’t wanna chat online', 'This is the whole story: I met him on a networking event. 
We’ve talked a little bit but not a lot.', 'And I want to meet him in person', 'Like im intereted in what he does and also him', 'He said that he’s usually in office, and can’t really come out, but we could find a time to call', 'But I don’t want to call him', 'I want to meet him', 'update: I said yes to his call', 'Because he’s actually think I want to network with him', 'My parents want me to get prepared', 'They want me to go to grad school', 'Ohhh so you are not planning to go to grad school right after college', 'My parents want me to go right after', 'But I don’t', 'Want to', 'no', 'I truly don’t think I got the talent in physics', 'And I don’t want to be a researcher in the lab', 'I hate labs', 'That’s the problem', 'I don’t know', 'But my parents think the name is important', 'And also if I want to come back to China to work', 'I have to go to a famous college for grad school', 'Firms in china prefer those have a master degree', 'Im SOOOOOOOO BADDDDD at standardized testing', 'What did you get at the sat?', 'I took me four times to get 1500', 'Happy thanksgiving!!!!', 'Hope to see you guys this coming fall', 'I actually highly recommend the data science class', 'It looks really practical', 'A really nice touch on the resume I will say', 'However, I don’t know how much you guys can learn in one month though', 'But just saying, machine learning isn’t hard and does not require much coding', 'Same thing for deep learning', 'The hardest part is cleaning the data, which requires a lot of coding', 'Like A LOT', 'But im pretty sure you guys will learn machine learning', 'Without machine learning, it is not data science anymore', 'It is', 'But good for your future', 'I kind of want to audit the machine learning in the spring', '😭', 'But just audit?', 'Oh my', 'I follow his youruve', 'And he got like millions of followers']

In [26]:
def preprocess(post_split, get_youtube=False, add_description=False):
    """Clean a list of raw posts and merge them into a single string.

    Steps, in order:
      1. when `get_youtube` is true, pass the posts through `replace_youtube`;
      2. drop link tokens with `remove_links`;
      3. per post: strip punctuation (string.punctuation plus '►•'), lowercase,
         strip soft hyphens, then drop digit characters;
      4. tokenize with `word_tokenize` and discard English stop words;
      5. lemmatize the surviving tokens, skipping posts left with no tokens;
      6. join everything with `combine_text`.

    Args:
        post_split: list of post strings.
        get_youtube: whether to run the `replace_youtube` step first.
        add_description: forwarded to `replace_youtube`; unused otherwise.

    Returns:
        The value returned by `combine_text` for the lemmatized posts.

    NOTE(review): `replace_youtube` is not defined in the visible part of this
    notebook, so `get_youtube=True` depends on it existing elsewhere.
    """
    if get_youtube:
        post_split = replace_youtube(post_split, add_description)

    linkless_posts = remove_links(post_split)

    strip_chars = string.punctuation + '►•'

    def _scrub(text):
        # Same per-post sequence as a char-by-char sweep over the whole list:
        # delete one character, lowercase, repeat; then soft hyphens; then digits.
        for char in strip_chars:
            text = text.replace(char, '').lower()
        text = text.replace('\xad', '').lower()
        return ''.join(char for char in text if not char.isdigit())

    scrubbed_posts = [_scrub(text) for text in linkless_posts]

    # Tokenize every post first, then filter stop words in a second pass.
    stop_words = set(stopwords.words('english'))
    tokenized_posts = [word_tokenize(text) for text in scrubbed_posts]
    filtered_posts = []
    for tokens in tokenized_posts:
        filtered_posts.append([token for token in tokens if token not in stop_words])

    # Posts that lost all of their tokens are dropped rather than kept as empty entries.
    lemmatizer = WordNetLemmatizer()
    lemmatized_posts = []
    for tokens in filtered_posts:
        if tokens:
            lemmatized_posts.append([lemmatizer.lemmatize(token) for token in tokens])

    return combine_text(lemmatized_posts)

In [27]:
def remove_links(post_split):
    """Strip link tokens from each post and drop posts that end up with no tokens.

    Each post is split on single spaces; any token containing 'http://' or
    'https://' is discarded and the rest are re-joined with single spaces.
    A post whose tokens were all links is omitted from the result. An empty
    string is kept, because splitting it still yields one (empty) token.

    Args:
        post_split: iterable of post strings.

    Returns:
        List of cleaned post strings, in the original order.
    """
    cleaned_posts = []
    for words in [post.split(' ') for post in post_split]:
        kept_words = [
            word for word in words
            if not ('http://' in word or 'https://' in word)
        ]
        if kept_words:
            cleaned_posts.append(' '.join(kept_words))
    return cleaned_posts

In [28]:
def combine_text(post_split_split):
    """Flatten tokenized sentences into one string.

    Args:
        post_split_split: list of sentences, each a list of word strings.

    Returns:
        A single string with words separated by a space and sentences
        separated by '|||'.
    """
    sentences = map(' '.join, post_split_split)
    return '|||'.join(sentences)

In [33]:
import nltk

# preprocess() relies on three NLTK resources: the 'punkt' tokenizer models
# (word_tokenize), the 'stopwords' corpus (stopwords.words) and the 'wordnet'
# corpus (WordNetLemmatizer). Fetching only 'punkt' leaves a fresh environment
# open to a LookupError on the other two.
# nltk.download skips packages that are already up to date, so re-running is cheap.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed version.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [31]:
Austin_test_cleaned = preprocess(Austin_test)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\admin/nltk_data'
    - 'c:\\users\\admin\\.venvs\\lewagon\\nltk_data'
    - 'c:\\users\\admin\\.venvs\\lewagon\\share\\nltk_data'
    - 'c:\\users\\admin\\.venvs\\lewagon\\lib\\nltk_data'
    - 'C:\\Users\\admin\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
Shirley_test_cleaned = preprocess(Shirley_test)

In [None]:
random_test_cleaned = preprocess(random_test)

In [None]:
Austin_test_cleaned

In [None]:
# Flatten the '|||' separators and wrap the string in a one-element list, so the
# whole chat history is passed to vectorizer.transform as a single document.
# NOTE: this rebinds the same name from str to list, so the cell is not re-runnable —
# a second run would call .replace on a list and raise AttributeError.
Austin_test_cleaned = [tokenize_for_countvec(Austin_test_cleaned)]

In [None]:
Shirley_test_cleaned = [tokenize_for_countvec(Shirley_test_cleaned)]

In [None]:
random_test_cleaned = [tokenize_for_countvec(random_test_cleaned)]

In [None]:
Austin_test_cleaned

In [None]:
Austin_test_vect = vectorizer.transform(Austin_test_cleaned)

In [None]:
Shirley_test_vect = vectorizer.transform(Shirley_test_cleaned)

In [None]:
random_test_vect = vectorizer.transform(random_test_cleaned)

In [None]:
log_model.predict(Austin_test_vect)

In [None]:
log_model.predict(random_test_vect)

In [None]:
log_model.predict(Shirley_test_vect)