# Lexicon Research

---
## Preprocessing
---

In [1]:
import string
import re
import os
import joblib

import pandas as pd 
import numpy as np 

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Powerhouse\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# Load the scraped article table.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
art_df = pd.read_csv('C:/Users/Powerhouse/Documents/GitHub/Project-High/project_high/Model/article-database.csv')

# Discard the first, third and last columns (selected by position), leaving
# the columns that are renamed below.
unused_columns = [art_df.columns[0], art_df.columns[2], art_df.columns[-1]]
art_df = art_df.drop(columns=unused_columns)

# Drop every row that contains a missing value. The defaults already mean
# axis=0 / how='any'; passing `how` together with `thresh` raises a TypeError
# on recent pandas releases, so neither is spelled out here.
art_df = art_df.dropna().reset_index(drop=True)
art_df.columns = ['title', 'text', 'tags']
art_df.head()

Unnamed: 0,title,text,tags
0,The Ultimate Travel Workout and Diet Plans by ...,"INTRODUCING THE BUCKET LIST FAMILY Hi, we are...","['Fitness', 'Travel', 'Diet', 'Exercise', 'Vac..."
1,Building a Responsive Image Gallery,In this post you would learn to build a basic...,"['Design', 'Data', 'Growth', 'CSS', 'Web Design']"
2,When they’re already “forever” for you,Because love that’s “light” can last that lon...,"['Love', 'Relationships', 'Dating', 'Men', 'Wo..."
3,Forking The iPhone,by Jean-Louis Gassée [A delayed Monday Note d...,"['Publishing', 'Apple', 'Apple']"
4,Method swizzling in iOS swift,"Today, I was integrating FCM in my iOS app. I...","['Swift', 'iOS', 'Apple']"


In [14]:
def func(raw_tags):
    """Turn the string form of a tag list into a real list of tag strings.

    Parameters
    ----------
    raw_tags : str
        Text such as "['Fitness', 'Travel']", i.e. a Python list literal
        that was stored as a string.

    Returns
    -------
    list of str
        The individual tags. An empty literal ("[]") gives an empty list
        rather than a list holding one empty tag.
    """
    import ast  # local import keeps this cell self-contained

    try:
        parsed = ast.literal_eval(raw_tags)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        # literal_eval handles quoting, spacing and commas inside a tag.
        return [str(tag) for tag in parsed]

    # Fallback for text that is not a valid Python literal (e.g. unquoted
    # tags): split on commas and strip whitespace and quote characters.
    inner = raw_tags.strip()
    if inner.startswith('[') and inner.endswith(']'):
        inner = inner[1:-1]
    pieces = (piece.strip().strip('\'"') for piece in inner.split(','))
    return [piece for piece in pieces if piece]

# Replace each stringified tag list with the Python list produced by func,
# then confirm the element type and preview the frame.
art_df['tags'] = art_df['tags'].apply(func)
print(type(art_df['tags'].loc[0]))
art_df.head()

<class 'list'>


Unnamed: 0,title,text,tags
0,The Ultimate Travel Workout and Diet Plans by ...,"INTRODUCING THE BUCKET LIST FAMILY Hi, we are...","[Fitness, Travel, Diet, Exercise, Vacation]"
1,Building a Responsive Image Gallery,In this post you would learn to build a basic...,"[Design, Data, Growth, CSS, Web Design]"
2,When they’re already “forever” for you,Because love that’s “light” can last that lon...,"[Love, Relationships, Dating, Men, Women]"
3,Forking The iPhone,by Jean-Louis Gassée [A delayed Monday Note d...,"[Publishing, Apple, Apple]"
4,Method swizzling in iOS swift,"Today, I was integrating FCM in my iOS app. I...","[Swift, iOS, Apple]"


In [15]:
# One-hot encode the tag lists: y gets one column per entry of
# multi_label_transform.classes_, in that order.
multi_label_transform = MultiLabelBinarizer()
y = multi_label_transform.fit_transform(art_df['tags'])

tag_names = list(multi_label_transform.classes_)
cols = ['text'] + tag_names

# Build the frame in one shot ('text' first, then the indicator columns)
# instead of inserting one column per tag in a loop, which is slow and
# leaves a heavily fragmented DataFrame.
prepd_db = pd.concat(
    [art_df[['text']], pd.DataFrame(y, index=art_df.index, columns=tag_names)],
    axis=1,
)
prepd_db.head()

Unnamed: 0,text,Unnamed: 2,2016,2016 Election,2017,3D Printing,AI,API,AWS,Abortion,...,Yoga,YouTube,Youth,Zen,eBooks,iOS,iOS App Development,iPad,iPhone,日本語
0,"INTRODUCING THE BUCKET LIST FAMILY Hi, we are...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,In this post you would learn to build a basic...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Because love that’s “light” can last that lon...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,by Jean-Louis Gassée [A delayed Monday Note d...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Today, I was integrating FCM in my iOS app. I...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [16]:
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
df_topic_list = pd.read_csv('C:/Users/Powerhouse/Documents/GitHub/Project-High/project_high/Data/medium-topics/medium_topics.csv')
df_topic_list.columns = ['tag_name', 'link']

# Split the binarizer's tags into those listed in the topic file
# (core_topics) and all remaining ones (contem_topics). The lookup set is
# built once instead of re-materialising the tag_name column for every tag.
known_topics = set(df_topic_list['tag_name'])
core_topics = []
contem_topics = []
for tag in multi_label_transform.classes_:
    if tag in known_topics:
        core_topics.append(tag)
    else:
        contem_topics.append(tag)

# Remove the indicator columns of every tag that is not in the topic file.
prepd_db = prepd_db.drop(columns=contem_topics)
print(prepd_db.shape)
prepd_db.head()

(17651, 87)


Unnamed: 0,text,Accessibility,Addiction,Art,Artificial Intelligence,Basic Income,Beauty,Blockchain,Books,Business,...,TV,Technology,Transportation,Travel,UX,Venture Capital,Women,Work,World,Writing
0,"INTRODUCING THE BUCKET LIST FAMILY Hi, we are...",0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,In this post you would learn to build a basic...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Because love that’s “light” can last that lon...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,by Jean-Louis Gassée [A delayed Monday Note d...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Today, I was integrating FCM in my iOS app. I...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Indicator columns are everything after the leading 'text' column.
tag_matrix = prepd_db.iloc[:, 1:].astype(int)

# Number of tags set on each article, as a plain list of ints (row-wise sum
# instead of a Python loop over every cell).
sum_tags = tag_matrix.sum(axis=1).tolist()

# Keep 'text' plus every tag whose column holds more than 150 ones. Counting
# with a comparison, rather than value_counts()[1], cannot raise KeyError
# for a column that contains no 1s.
min_articles_per_tag = 150
tag_counts = tag_matrix.eq(1).sum(axis=0)
chosen_ft = ['text'] + tag_counts[tag_counts > min_articles_per_tag].index.tolist()

In [18]:
# Keep the rows whose tag count (from sum_tags) is between 1 and 5, and only
# the columns in chosen_ft. A single .loc selection plus an explicit copy
# makes the result an independent frame, so later column assignments do not
# trigger SettingWithCopyWarning, and the source frame is not mutated.
tag_totals = pd.Series(sum_tags, index=prepd_db.index)
has_usable_tag_count = (tag_totals > 0) & (tag_totals < 6)
new_db = prepd_db.loc[has_usable_tag_count, chosen_ft].copy()
prepd_db = new_db
prepd_db.shape

(13102, 46)

In [19]:
 
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once.

    The word list is read from the corpus on the first call only and cached
    on the function object. If the corpus is not installed, it is downloaded
    and the read is retried.
    """
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        try:
            words = stopwords.words('english')
        except LookupError:
            # Corpus missing on this machine: fetch it, then retry.
            nltk.download('stopwords')
            words = stopwords.words('english')
        cached = _english_stopwords._cache = frozenset(words)
    return cached


def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is split on whitespace, tokens found in the stop-word set are
    dropped, and the rest are re-joined with single spaces.
    """
    stop_words = _english_stopwords()
    return ' '.join(w for w in text.split() if w not in stop_words)

# Lemmatizer used by clean_text. It is created here because no visible cell
# defines it, so a fresh kernel would otherwise raise NameError on first use.
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalise raw article text into a space-separated token string.

    Steps, in order: drop apostrophes, replace every non-letter character
    with a space, collapse whitespace, lower-case, lemmatize each token with
    pos='a', then remove English stop words via remove_stopwords.
    """
    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]", " ", text)
    text = ' '.join(text.split()).lower()
    # Join the lemmas once instead of growing a string inside the loop.
    text = ' '.join(lemmatizer.lemmatize(token, pos='a') for token in text.split())
    return remove_stopwords(text)

# Clean every article body. assign() returns a new frame, so this neither
# writes into a filtered slice (SettingWithCopyWarning) nor changes the
# previous frame through an alias.
prepd_db = prepd_db.assign(text=prepd_db['text'].apply(clean_text))
new_db = prepd_db
prepd_db.head()

Unnamed: 0,text,Art,Artificial Intelligence,Blockchain,Books,Business,Creativity,Cryptocurrency,Culture,Data Science,...,Social Media,Software Engineering,Sports,Technology,Travel,UX,Venture Capital,Women,Work,Writing
0,introducing bucket list family hi bucket list ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,post would learn build basic image gallery fle...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,love light last long past partners frankly pro...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,article quick history gaming war gaming strate...,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6,therapist hear common complaints teens parents...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Save the cleaned text and tag indicator columns to disk.
model_data_path = 'model-data.csv'
prepd_db.to_csv(model_data_path)

---
## Text -> Features -> Models
---

In [21]:
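#### FOR THE REPORT -- single-tag experiment (balanced sample -> TF-IDF -> logistic regression accuracy); currently commented out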
# test_ft = prepd_db.columns[1]
# this_db = pd.DataFrame()

# to_1 = prepd_db[prepd_db[test_ft] == 1][['text', test_ft]]
# to_0 = prepd_db[prepd_db[test_ft] != 1][['text', test_ft]]
# _frac = (to_0.shape[0]/to_1.shape[0])

# this_db[['text', test_ft]] = to_1
# this_db = this_db.append(to_0.sample(frac=1/_frac)).sample(frac=1)
# this_db.reset_index().drop(['index'], axis=1)

# tfidf_vectorizer = TfidfVectorizer(max_df=0.6, max_features=600)
# train_tfidf = tfidf_vectorizer.fit_transform(this_db['text'])

# xTrain, xTest, yTrain, yTest = train_test_split(train_tfidf, this_db[test_ft], test_size = 0.2, random_state = 0)

# lr = LogisticRegression(tol=0.0)
# lr.fit(xTrain, yTrain)
# pred_test = lr.predict(xTest)
# accuracy_score(yTest, pred_test)

In [22]:
# Fit one TF-IDF vocabulary on the cleaned text; every per-tag model below
# is trained on this same feature matrix.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=1000)
train_tfidf = tfidf_vectorizer.fit_transform(prepd_db['text'])

# joblib.dump does not create missing directories, so make sure the output
# folder exists before the first model is written.
os.makedirs('model_pickle_files', exist_ok=True)

# Train one binary classifier per tag column (column 0 is 'text') and save
# each as model_pickle_files/<tag>.pkl.
# NOTE(review): tol=0.75 is a very loose stopping tolerance -- confirm it is intended.
# NOTE(review): the fitted tfidf_vectorizer is not saved in this cell; confirm
# it is persisted elsewhere if the models are loaded in another process.
for x in range(1, len(prepd_db.columns)):
    tag_name = str(prepd_db.columns[x])

    lr = LogisticRegression(tol=0.75)
    lr.fit(train_tfidf, prepd_db.iloc[:, x])

    joblib.dump(lr, 'model_pickle_files/' + tag_name + '.pkl')
