### Machine Learning for Economics Journal Abstracts

In [86]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [87]:
# Load the four cleaned journal-abstract CSV files and stack them into one frame.
# The codec name is given plainly here: the previous literal carried stray quote
# characters inside the string ("'iso-8859-1'"), which only resolved because
# codec names are normalised before lookup.
CSV_ENCODING = "iso-8859-1"

topJourns_df = pd.read_csv("raw_data_econ/topRanks_cleaned.csv", encoding=CSV_ENCODING)
hiJourns_df = pd.read_csv("raw_data_econ/hiRanks_cleaned.csv", encoding=CSV_ENCODING)
notHiJourns_df = pd.read_csv("raw_data_econ/notHiRanks_cleaned.csv", encoding=CSV_ENCODING)
notHiJourns2_df = pd.read_csv("raw_data_econ/notHiRanks2_cleaned.csv", encoding=CSV_ENCODING)

# Row-wise concatenation; each source frame keeps its own index labels.
frames = [topJourns_df, hiJourns_df, notHiJourns_df, notHiJourns2_df]
combined_df = pd.concat(frames)

# Displayed as the cell output: number of abstracts across all four files.
combined_df["abstract"].shape

(8229,)

In [88]:
# Features are the abstract texts; the target is the "top_journal" column.
X = combined_df["abstract"]
y = combined_df["top_journal"]

# Seeded split, stratified on the target. No test_size is passed, so the
# library default applies.
split_parts = train_test_split(X, y, random_state=1, stratify=y)

# Convert every partition to a plain Python list for the later processing steps.
X_train, X_test, y_train, y_test = [part.tolist() for part in split_parts]

In [89]:
# Logistic regression with the library's default settings.
classifier = LogisticRegression()

# Word to vector.
# Alternative vectorizer kept for reference (disabled):
# tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
hash_vectorizer = HashingVectorizer(analyzer='word', ngram_range=(1, 2))

# NOTE: X_train / X_test are rebound from lists of text to feature matrices here,
# so this cell must be run exactly once after the split cell.
X_train = hash_vectorizer.fit_transform(X_train)
# Held-out texts are only transformed, never used to fit the vectorizer. The
# result is the same for this vectorizer, but a fitted one (e.g. the TF-IDF
# alternative above) would otherwise be refit on the test set.
X_test = hash_vectorizer.transform(X_test)

# Last expression: fits the model and displays the fitted estimator.
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [90]:
# Mean accuracy of the fitted classifier on each partition.
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)

print(f"Training Data Score: {train_accuracy}")
print(f"Testing Data Score: {test_accuracy}")

Training Data Score: 0.8799222168206126
Testing Data Score: 0.7089407191448007


In [91]:
# Predict a label for every held-out abstract.
predictions = classifier.predict(X_test)

# Side-by-side view of predicted vs. true labels; show the first 20 rows.
comparison_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
comparison_df.head(20)

Unnamed: 0,Actual,Prediction
0,1,1
1,1,1
2,0,0
3,0,1
4,0,0
5,0,1
6,1,1
7,0,1
8,1,1
9,0,0


In [92]:
from sklearn.metrics import classification_report

# Display names for the two classes.
# NOTE(review): assumes label 0 = not top journal and 1 = top journal - confirm.
target_names = ["Not top Journal", "Top Journal"]

# Per-class precision / recall / F1 on the held-out set.
report = classification_report(
    y_true=y_test,
    y_pred=predictions,
    target_names=target_names,
)
print(report)

                 precision    recall  f1-score   support

Not top Journal       0.72      0.70      0.71      1034
    Top Journal       0.70      0.72      0.71      1024

    avg / total       0.71      0.71      0.71      2058



In [93]:
# Saving the model for use elsewhere
import pickle

# Only the classifier is serialised; whoever loads it must build the same
# vectorizer (same HashingVectorizer settings) to produce compatible features.
# The context manager guarantees the file is flushed and closed, which the
# previous inline open() call did not.
with open('logReg_model', 'wb') as model_file:
    pickle.dump(classifier, model_file)


In [94]:
# Test: loading in the pickled model
# SECURITY: pickle.load can execute arbitrary code - only load model files that
# were produced by this notebook or another trusted source.
# The context manager closes the file handle, which the previous inline open()
# call left dangling.
with open('logReg_model', 'rb') as model_file:
    clf2 = pickle.load(model_file)

In [95]:
# Test: predict if top journal based on input string
string = "This paper provides simple , yet robust framework evaluate time profile benefits paid unemployment spell . We derive sufficient-statistics formulae capturing marginal insurance value incentive costs unemployment benefits paid different times spell . Our approach allows us revisit separate arguments inclining declining profiles put forward theoretical literature identify welfare-improving changes benefit profile account relevant arguments jointly . For empirical implementation , use administrative data unemployment , linked data consumption , income , wealth Sweden . First , exploit duration-dependent kinks replacement rate find , anything , moral hazard cost benefits larger paid earlier spell . Second , find drop consumption affecting insurance value benefits large start spell , increases throughout spell . In trading insurance incentives , analysis suggests flat benefit profile Sweden generous overall . However , insurance incentives side , find evidence support introduction declining tilt profile ."
# Blank placeholder kept from the original cell; it is no longer what gets scored.
test = " "


def model_predict(s):
    """Classify a single abstract with the reloaded model ``clf2``.

    Parameters
    ----------
    s : str
        Text of one abstract.

    Returns
    -------
    The first (and only) element of ``clf2.predict`` for that text.
    NOTE(review): presumably 1 = top journal, 0 = not - confirm against the
    ``top_journal`` column.
    """
    # Wrap the text in a one-element list because the vectorizer works on a
    # collection of documents. Inference only transforms; nothing is fitted here.
    features = hash_vectorizer.transform([s])
    return clf2.predict(features)[0]


# Score the sample abstract. The original call passed `test` (a single space),
# so the abstract defined above was never actually classified.
model_predict(string)

0

In [None]:
# Disabled experiment: the Keras model below is wrapped in a string literal, so
# nothing inside it is imported or executed when this cell runs.
# NOTE(review): the inner remark about "2 inputs, 6 hidden nodes, and 2 outputs"
# does not match the code it describes, which declares input_dim=1048576 and two
# 512-unit hidden layers before the 2-unit softmax output.
# NOTE(review): input_dim presumably matches the hashed feature width - confirm.
'''# Test area for neural network
# first, create a normal neural network with 2 inputs, 6 hidden nodes, and 2 outputs
from keras.models import Sequential
from keras.layers import Dense, Activation

model = Sequential()
model.add(Dense(units=512, activation='relu', input_dim=1048576))
model.add(Dense(units=512, activation='relu'))
model.add(Dense(units=2, activation='softmax'))'''

In [None]:
# Disabled experiment: the compile/fit calls below sit inside a string literal,
# so nothing here executes when this cell runs.
# NOTE(review): the text refers to `model`, which in the visible notebook is
# only created inside another disabled string literal - both would have to be
# re-enabled together.
'''from keras.utils import to_categorical
y_binary = to_categorical(y_train)

model.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])
model.fit(X_train,
    y_binary,
    epochs=5,
    shuffle=True,
    verbose=2, validation_split=0.1
)'''

Train on 5553 samples, validate on 618 samples
Epoch 1/5
