### Load words with concreteness score from xls file, train a classifier to classify words as concrete or abstract, save the classifier

In [6]:
import numpy as np
import spacy
from nltk.corpus import wordnet as wn

import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# spacy.cli.download("en_core_web_md")
nlp = spacy.load("en_core_web_md")

In [7]:
# TODO tune concreteness threshold
concreteness_df = pd.read_excel("data/brysbaert_concreteness_ratings.xlsx", na_filter=False)
# extract list of words with concreteness >= 3 and < 3
concrete_words = concreteness_df.query('conc_score >= 3')['word'].tolist()  # words with concreteness >= 3
abstract_words = concreteness_df.query('conc_score < 3')['word'].tolist()  # words with concreteness < 3

In [8]:
print(len(concrete_words))
print(concrete_words[:10])
print()
print(len(abstract_words))
print(abstract_words[:10])

18776
['accumulate', 'add', 'aerially', 'ahead', 'aiming', 'airless', 'alternation', 'anaphylactic', 'anatomically', 'annotate']

21178
['eh', 'essentialness', 'although', 'spirituality', 'would', 'spiritually', 'whatsoever', 'conceptualistic', 'conventionalism', 'belief']


In [9]:
classes = ['concrete', 'abstract']
train_set = []
train_set.append(concrete_words)
train_set.append(abstract_words)

In [10]:
from time import sleep

# get word vectors list
X = []
index = 0
for part in train_set:
    for word in part:
        X.append(nlp(word)[0].vector)

In [11]:
# get labels
y = [label for label, part in enumerate(train_set) for _ in part]
classifier = LogisticRegression(C=0.1, class_weight='balanced').fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# save classifier
joblib.dump(classifier, "trained_models/concrete_abstract_classifier.joblib")

['trained_models/concrete_abstract_classifier.joblib']

In [13]:
# load classifier
classifier = joblib.load("trained_models/concrete_abstract_classifier.joblib")

In [14]:
synsets = ['war.n.01', 'fiefdom.n.01', 'bed.n.03', 'return_on_invested_capital.n.01', 'texture.n.02', 'news.n.01',
           'look.n.02']

for synset_str in synsets:
    synset = wn.synset(synset_str)
    synset_name = synset.lemma_names()[0]
    synset_vector = list(nlp(synset_name))[0].vector
    synset_class = classifier.predict([synset_vector])[0]
    # print classification
    print(f'{synset_name} -> {synset_class} - {classes[synset_class]}')

war -> 0 - concrete
fiefdom -> 1 - abstract
bed -> 0 - concrete
return_on_invested_capital -> 1 - abstract
texture -> 0 - concrete
news -> 0 - concrete
look -> 0 - concrete


### Regression version. Predicting concreteness score of a word.

In [18]:
new_y = concreteness_df['conc_score']

<class 'list'>


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, new_y, test_size=0.3, random_state=42)

model = LinearRegression().fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.9317894584297259
